해결된 질문
작성
·
111
0
안녕하세요! 캐글에 올려주신 자료로 공부 중 에러가 떴는데 어떻게 해결해야할지 몰라 질문 남깁니다!
너무 길어서 보기 어려운 점 미리 사과드립니다..ㅠㅠ
import pandas as pd
train = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/train.csv")
test = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/test.csv")
# EDA
# print(train.shape, test.shape) #(1168, 81) (292, 80)
# print(train.info()) #float64(3), int64(35), object(43)
# print(test.info()) #float64(3), int64(34), object(43)
# print(train.isnull().sum().sort_values(ascending=False)[:10])
#범주형 데이터가 너무 많아서 수치형만 선택
train=train.select_dtypes(exclude=['object'])
test=train.select_dtypes(exclude=['object'])
# print(train.head(2))
# print(train.isnull().sum()) #LotFrontage 218, GarageYrBlt 69
# print(test.isnull().sum())
# print(train['LotFrontage'].describe())
# print(train['GarageYrBlt'].describe())
#전처리(결측치, 타겟값 분리)
target=train.pop('SalePrice')
train=train.drop('Id',axis=1)
test_id=test.pop('Id')
train['LotFrontage']=train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['GarageYrBlt']=train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())
train['MasVnrArea']=train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
test['LotFrontage']=test['LotFrontage'].fillna(test['LotFrontage'].mean())
test['GarageYrBlt']=test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean())
test['MasVnrArea']=test['MasVnrArea'].fillna(test['MasVnrArea'].mean())
# print(train.isnull().sum().sum())
# print(test.isnull().sum().sum())
#데이터 분리
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
# print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)
#랜포
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(random_state=0)
rf.fit(X_tr, y_tr)
pred=rf.predict(X_val)
#평가
from sklearn.metrics import mean_squared_error
def rmse(y, y_pred):
return mean_squared_error(y, y_pred)**0.5
# print(rmse(y_val, pred))
#중앙값 : 34668.70085343153
#평균 : 33430.8118326734
# 최댓값 : 34100.46200633792
#최솟값 : 34023.36640178194
#예측
pred=rf.predict(test)
submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred})
submit.to_csv('0000.csv', index=False)
pd.read_csv('0000.csv')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[55], line 68
60 return mean_squared_error(y, y_pred)**0.5
61 # print(rmse(y_val, pred))
62 #중앙값 : 34668.70085343153
63 #평균 : 33430.8118326734
(...)
66
67 #예측
---> 68 pred=rf.predict(test)
69 submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred})
70 submit.to_csv('0000.csv', index=False)
File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:981, in ForestRegressor.predict(self, X)
979 check_is_fitted(self)
980 # Check data
--> 981 X = self._validate_X_predict(X)
983 # Assign chunk of trees to jobs
984 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:602, in BaseForest._validate_X_predict(self, X)
599 """
600 Validate X whenever one tries to predict, apply, predict_proba."""
601 check_is_fitted(self)
--> 602 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
603 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
604 raise ValueError("No support for np.int64 index based sparse matrices")
File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:548, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
483 def _validate_data(
484 self,
485 X="no_validation",
(...)
489 **check_params,
490 ):
491 """Validate input data and set or check the `n_features_in_` attribute.
492
493 Parameters
(...)
546 validated.
547 """
--> 548 self._check_feature_names(X, reset=reset)
550 if y is None and self._get_tags()["requires_y"]:
551 raise ValueError(
552 f"This {self.__class__.__name__} estimator "
553 "requires y to be passed, but the target y is None."
554 )
File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:481, in BaseEstimator._check_feature_names(self, X, reset)
476 if not missing_names and not unexpected_names:
477 message += (
478 "Feature names must be in the same order as they were in fit.\n"
479 )
--> 481 raise ValueError(message)
ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- SalePric
답변 1
1
학습에서는 수치형만 선택해서 학습했는데
test 데이터를 사용할 때는 전체 컬럼을 사용했네요
코드 넣을 때 "코드 블럭"을 먼저 선택하고 코드를 복사-붙여넣기 해주세요 🙂
반대로 하면 보기가 힘들어지네요 하하! 화이팅입니다.