해결된 질문
작성
·
653
0
안녕하세요 캐글 T2-1을 푸는 도중 도저히 에러를 해결할 수 없어 질문드립니다!ㅠㅠ코드는 가장 아래에 있습니다
데이터가 3개 주어지고 결측치가 있는 경우 제가 짠 코드와 같이 결측치를 채우면 될까요?
에러는 아래와 같이 나타납니다. test 데이터에 ‘Ali', 'Mr Ahmed'가 없다는 뜻같은데 무엇이 문제이고 어떻게 해결해야 좋을까요?ㅠㅠ
에러메시지, 코드:
KeyError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py in _encode_python(values, uniques, encode)
65 try:
---> 66 encoded = np.array([table[v] for v in values])
67 except KeyError as e:
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py in <listcomp>(.0)
65 try:
---> 66 encoded = np.array([table[v] for v in values])
67 except KeyError as e:
KeyError: 'Ali, Mr. Ahmed'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/tmp/ipykernel_20/403961971.py in <module>
53 le = LabelEncoder()
54 X_train[col] = le.fit_transform(X_train[col])
---> 55 X_test[col] = le.transform(X_test[col])
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py in transform(self, y)
275 return np.array([])
276
--> 277 _, y = _encode(y, uniques=self.classes_, encode=True)
278 return y
279
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py in _encode(values, uniques, encode, check_unknown)
111 if values.dtype == object:
112 try:
--> 113 res = _encode_python(values, uniques, encode)
114 except TypeError:
115 types = sorted(t.__qualname__
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py in _encode_python(values, uniques, encode)
67 except KeyError as e:
68 raise ValueError("y contains previously unseen labels: %s"
---> 69 % str(e))
70 return uniques, encoded
71 else:
ValueError: y contains previously unseen labels: 'Ali, Mr. Ahmed'
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def exam_data_load(df, target, id_name="", null_name=""):
if id_name == "":
df = df.reset_index().rename(columns={"index": "id"})
id_name = 'id'
else:
id_name = id_name
if null_name != "":
df[df == null_name] = np.nan
X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
y_train = X_train[[id_name, target]]
X_train = X_train.drop(columns=[target])
y_test = X_test[[id_name, target]]
X_test = X_test.drop(columns=[target])
return X_train, X_test, y_train, y_test
df = pd.read_csv("../input/titanic/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Survived', id_name='PassengerId')
X_train.shape, X_test.shape, y_train.shape, y_test.shape
import pandas as pd
df = pd.concat([X_train,y_train['Survived']], axis = 1)
df['Age'] = df['Age'].fillna(df['Age'].median())
X_test['Age'] = X_test['Age'].fillna(df['Age'].median())
df['Cabin'] = df['Cabin'].fillna('N')
X_test['Cabin'] = X_test['Cabin'].fillna('N')
df['Embarked'] = df['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')
#print(df.isnull().sum())
#print(X_test.isnull().sum())
print(df.info())
print(X_test)
cols = ['Name','Sex', 'Ticket', 'Cabin', 'Embarked']
from sklearn.preprocessing import LabelEncoder
for col in cols:
le = LabelEncoder()
X_train[col] = le.fit_transform(X_train[col])
X_test[col] = le.transform(X_test[col])
답변 2
1
Name이 train과 test에 있는 종류가 같지 않습니다.
Name을 제외하거나
Name을 사용한다면 train과 test를 합쳐서 인코딩 하는 방법이 있습니다.
이름을 인코딩할 거라면 제외하는 것이 좋을 것 같아요!
0
강사님 해당문제 아래와 같이 코드를 짰고 결과값을 도출했습니다. 이정도 평가지표와 예측결과면 40점 받는데에 문제없을까요? 맨아래 사진이 코드이고 그 위의값들이 결과값입니다
0.839453284373725
[2]:
0.9876977152899824
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def exam_data_load(df, target, id_name="", null_name=""):
if id_name == "":
df = df.reset_index().rename(columns={"index": "id"})
id_name = 'id'
else:
id_name = id_name
if null_name != "":
df[df == null_name] = np.nan
X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
y_train = X_train[[id_name, target]]
X_train = X_train.drop(columns=[target])
y_test = X_test[[id_name, target]]
X_test = X_test.drop(columns=[target])
return X_train, X_test, y_train, y_test
df = pd.read_csv("../input/titanic/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Survived', id_name='PassengerId')
X_train.shape, X_test.shape, y_train.shape, y_test.shape
import pandas as pd
cols = ['Name', 'Cabin', 'Ticket']
for col in cols:
X_train = X_train.drop(col, axis =1)
X_test = X_test.drop(col, axis = 1)
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].median())
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')
#print(df.isnull().sum())
#print(X_test.isnull().sum())
#print(df.head())
#print(X_test)
cols = ['Sex','Embarked']
from sklearn.preprocessing import LabelEncoder
for col in cols:
le = LabelEncoder()
X_train[col] = le.fit_transform(X_train[col])
X_test[col] = le.transform(X_test[col])
df = pd.concat([X_train,y_train['Survived']], axis = 1)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
X_train = X_train.drop('PassengerId', axis = 1)
X_test_id = X_test.pop('PassengerId')
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train['Survived'], test_size = 0.2, random_state = 2023)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
print(roc_auc_score(y_val, pred))
pred = model.predict(X_test)
submit = pd.DataFrame({'PassengerId': X_test_id, 'Survived': pred })
submit.to_csv('0000.csv', index = False)
pd.read_csv('0000.csv')
model.score(X_tr, y_tr)
아 drop을 먼저하니까 해결됐습니다! 감사합니다