에러 이유

안녕하세요! 캐글에 올려주신 자료로 공부 중 에러가 떴는데 어떻게 해결해야할지 몰라 질문 남깁니다!

너무 길어서 보기 어려운 점 미리 사과드립니다..ㅠㅠ

import pandas as pd

train = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/train.csv")

test = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/test.csv")

# EDA

# print(train.shape, test.shape) #(1168, 81) (292, 80)

# print(train.info()) #float64(3), int64(35), object(43)

# print(test.info()) #float64(3), int64(34), object(43)

# print(train.isnull().sum().sort_values(ascending=False)[:10])

#범주형 데이터가 너무 많아서 수치형만 선택

train=train.select_dtypes(exclude=['object'])

test=train.select_dtypes(exclude=['object'])

# print(train.head(2))

# print(train.isnull().sum()) #LotFrontage      218, GarageYrBlt       69

# print(test.isnull().sum())

# print(train['LotFrontage'].describe())

# print(train['GarageYrBlt'].describe())

#전처리(결측치, 타겟값 분리)

target=train.pop('SalePrice')

train=train.drop('Id',axis=1)

test_id=test.pop('Id')

train['LotFrontage']=train['LotFrontage'].fillna(train['LotFrontage'].mean())

train['GarageYrBlt']=train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())

train['MasVnrArea']=train['MasVnrArea'].fillna(train['MasVnrArea'].mean())

test['LotFrontage']=test['LotFrontage'].fillna(test['LotFrontage'].mean())

test['GarageYrBlt']=test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean())

test['MasVnrArea']=test['MasVnrArea'].fillna(test['MasVnrArea'].mean())

# print(train.isnull().sum().sum())

# print(test.isnull().sum().sum())

#데이터 분리

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

#랜포

from sklearn.ensemble import RandomForestRegressor

rf=RandomForestRegressor(random_state=0)

rf.fit(X_tr, y_tr)

pred=rf.predict(X_val)

#평가

from sklearn.metrics import mean_squared_error

def rmse(y, y_pred):

    return mean_squared_error(y, y_pred)**0.5

# print(rmse(y_val, pred))

#중앙값 : 34668.70085343153

#평균 : 33430.8118326734

# 최댓값 : 34100.46200633792

#최솟값 : 34023.36640178194

#예측

pred=rf.predict(test)

submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred})

submit.to_csv('0000.csv', index=False)

pd.read_csv('0000.csv')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[55], line 68
     60     return mean_squared_error(y, y_pred)**0.5
     61 # print(rmse(y_val, pred))
     62 #중앙값 : 34668.70085343153
     63 #평균 : 33430.8118326734
   (...)
     66 
     67 #예측
---> 68 pred=rf.predict(test)
     69 submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred})
     70 submit.to_csv('0000.csv', index=False)

File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:981, in ForestRegressor.predict(self, X)
    979 check_is_fitted(self)
    980 # Check data
--> 981 X = self._validate_X_predict(X)
    983 # Assign chunk of trees to jobs
    984 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:602, in BaseForest._validate_X_predict(self, X)
    599 """
    600 Validate X whenever one tries to predict, apply, predict_proba."""
    601 check_is_fitted(self)
--> 602 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
    603 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
    604     raise ValueError("No support for np.int64 index based sparse matrices")

File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:548, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    483 def _validate_data(
    484     self,
    485     X="no_validation",
   (...)
    489     **check_params,
    490 ):
    491     """Validate input data and set or check the `n_features_in_` attribute.
    492 
    493     Parameters
   (...)
    546         validated.
    547     """
--> 548     self._check_feature_names(X, reset=reset)
    550     if y is None and self._get_tags()["requires_y"]:
    551         raise ValueError(
    552             f"This {self.__class__.__name__} estimator "
    553             "requires y to be passed, but the target y is None."
    554         )

File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:481, in BaseEstimator._check_feature_names(self, X, reset)
    476 if not missing_names and not unexpected_names:
    477     message += (
    478         "Feature names must be in the same order as they were in fit.\n"
    479     )
--> 481 raise ValueError(message)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- SalePric

인프런 커뮤니티 질문&답변