房租赛-模型融合

it2025-04-29  9

import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from mlxtend.classifier import StackingClassifier  # kept from the original post; no longer used below
from mlxtend.regressor import StackingRegressor


def parseData(df):
    """Preprocess a raw rental table (train or test) and return the result.

    Steps, in order:
      * replace the '--' rent-type placeholder with '未知方式';
      * split ``houseType`` into the integer columns '室', '厅', '卫';
      * extract the trade month from ``tradeTime`` into '交易月份';
      * build the combined key ``houseType_1sumcsu`` (room count + community);
      * apply ``log1p`` to the large-magnitude numeric columns;
      * cast the text columns to pandas ``category`` dtype;
      * fill missing ``pv`` / ``uv`` with the column mean and cast to int;
      * replace '暂无信息' in ``buildYear`` with the most frequent year;
      * infer a rent type for rows still marked '未知方式';
      * add the per-community mean area as 'areamean小区名';
      * drop ``city``, ``houseToward``, ``houseDecoration`` and ``ID``.

    The input frame is modified in place for most steps; callers must use
    the returned frame.
    """
    # .loc writes through to the frame; the chained form df[col][mask] = ...
    # may silently operate on a temporary copy.
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # houseType is three counts separated by non-digit markers
    # (e.g. '0室0厅1卫'). A regex also copes with multi-digit counts,
    # which single-character indexing could not.
    layout = df['houseType'].str.extract(r'(\d+)\D+(\d+)\D+(\d+)').astype('int64')
    df.insert(3, '室', layout[0])
    df.insert(4, '厅', layout[1])
    df.insert(5, '卫', layout[2])

    df['交易月份'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))
    df['houseType_1sumcsu'] = df['室'].map(str) + df['communityName'].map(str)

    # Smooth heavy-tailed magnitudes.
    big_num_cols = [
        'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice',
        'totalNewTradeMoney', 'totalNewTradeArea', 'tradeNewMeanPrice',
        'remainNewNum', 'supplyNewNum', 'supplyLandArea', 'tradeLandArea',
        'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers',
        'residentPopulation', 'pv', 'uv',
    ]
    for col in big_num_cols:
        df[col] = np.log1p(df[col])

    # Convert the surviving object columns to category dtype.
    # (houseToward / houseDecoration are dropped below, so they are skipped.)
    category_cols = [
        'rentType', 'houseFloor', 'communityName', 'region', 'plate',
        'houseType_1sumcsu',
    ]
    for col in category_cols:
        df[col] = df[col].astype('category')

    # Fill missing pv / uv with the column mean.
    # NOTE(review): the int cast truncates the log1p-transformed values;
    # both columns are dropped again in feature(), so this is kept as-is.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # buildYear: replace the '暂无信息' placeholder with the modal year,
    # then cast the whole column to int.
    known_year = df['buildYear'] != '暂无信息'
    modal_year = df.loc[known_year, 'buildYear'].astype('int').mode().iloc[0]
    df['buildYear'] = df['buildYear'].where(known_year, modal_year).astype('int')

    def fill_unknown(condition, value):
        """Assign *value* to rows that are still '未知方式' and match *condition*."""
        df.loc[(df['rentType'] == '未知方式') & condition, 'rentType'] = value

    # The rules are applied in order; each only touches rows that are still
    # unknown after the previous ones.
    fill_unknown(df['室'] <= 1, '整租')
    fill_unknown((df['室'] > 1) & (df['area'] < 50), '合租')
    fill_unknown(df['area'] / df['室'] < 20, '合租')
    fill_unknown((df['area'] <= 50) & (df['室'] == 2), '合租')
    fill_unknown((df['area'] > 60) & (df['室'] == 2), '整租')
    fill_unknown((df['area'] <= 60) & (df['室'] == 3), '合租')
    fill_unknown((df['area'] > 60) & (df['室'] == 3), '整租')
    fill_unknown((df['area'] >= 100) & (df['室'] > 3), '整租')

    # Group-by features: per-community mean of each listed column.
    # transform() broadcasts the group mean back onto every row, replacing
    # the dict-renaming agg() + merge that newer pandas versions reject.
    for item in ['area']:
        df[item + 'mean小区名'] = (
            df.groupby('communityName', observed=True)[item].transform('mean')
        )

    # Drop features that are not used by the models.
    df = df.drop(columns=['city', 'houseToward', 'houseDecoration', 'ID'])
    return df


def washData(df_train, df_test):
    """Remove outlier rows from the training set; return ``(train, test)``.

    The test frame is returned unchanged. The returned training frame has a
    fresh 0..n-1 index so positional and label-based indexing agree.
    """
    unit_price = df_train['tradeMoney'] / df_train['area']
    # Conditions are written as "not <drop rule>" so rows with missing values
    # are retained, exactly as the original index-based drops did.
    keep = (
        # Training areas are restricted to (6, 200]; the original author notes
        # that the test set only contains areas below 200.
        (df_train['area'] <= 200) & (df_train['area'] > 6)
        & (df_train['tradeMoney'] <= 100000)
        & ~(unit_price > 300)
        & ~(unit_price < 25)
        & ~(df_train['houseType'] == '0室0厅1卫')
        & ~(df_train['totalFloor'] == 0)
        & ~((df_train['tradeMoney'] > 25000) & (df_train['area'] < 100))
        # The original also dropped rows with area > 800; that rule can never
        # match after the area <= 200 filter above, so it is omitted.
    )
    df_train = df_train[keep].reset_index(drop=True)
    return df_train, df_test


def feature(df):
    """Build aggregate features and drop low-value columns.

    Returns ``(df, categorical_feats)`` where ``categorical_feats`` is the
    list of column names to be treated as categorical by LightGBM.
    The input frame is not modified.
    """
    # The raw text columns were already expanded in parseData().
    df = df.drop(columns=['houseType', 'tradeTime'])

    # Merge redundant count columns into combined features.
    df['traffic'] = df['subwayStationNum'] + df['busStationNum']
    df['edu'] = df['interSchoolNum'] + df['schoolNum'] + df['privateSchoolNum']
    df['livecondition'] = (
        df['drugStoreNum'] + df['bankNum'] + df['shopNum'] + df['parkNum']
        + df['mallNum'] + df['superMarketNum'] + df['gymNum']
    )
    df['pepleroute'] = df['newWorkers'] / df['residentPopulation']
    df['tradeMoneynew'] = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['meanarea'] = df['totalTradeArea'] / df['tradeSecNum']
    df['meanNewarea'] = df['totalNewTradeArea'] / df['tradeNewNum']
    df['lostnum'] = df['supplyNewNum'] - df['tradeNewNum']

    # Raw counts that are now represented by the combined features.
    df = df.drop(columns=[
        'subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum',
        'privateSchoolNum', 'drugStoreNum', 'bankNum',
        'shopNum', 'parkNum', 'mallNum', 'superMarketNum', 'hospitalNum',
        'gymNum',
    ])

    # Features removed based on feature importance.
    df = df.drop(columns=[
        'tradeLandNum', 'landMeanPrice', 'supplyLandNum', 'landTotalPrice',
        'tradeLandArea', 'lostnum', 'supplyNewNum', 'supplyLandArea',
        'region', 'tradeNewNum', 'pepleroute', 'lookNum', 'uv',
        'saleSecHouseNum', 'livecondition', 'pv',
    ])

    categorical_feats = [
        'rentType', 'houseFloor', 'plate', 'communityName',
        'houseType_1sumcsu',
    ]
    return df, categorical_feats


def getData(feature,
            train_path=r'C:\Users\lxc\Desktop\featurecup\train_data.csv',
            test_path=r'C:\Users\lxc\Desktop\featurecup\test_b.csv'):
    """Load, preprocess and featurize the train and test CSV files.

    Args:
        feature: callable taking a frame and returning
            ``(frame, categorical_feature_names)``.
        train_path: path of the training CSV.
        test_path: path of the test CSV.

    Returns:
        ``(train, test, target, features, categorical_feats)`` where
        ``target`` is the ``tradeMoney`` column popped from ``train`` and
        ``features`` is ``train.columns`` after the pop.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Both sets must go through the same preprocessing; otherwise the test
    # frame lacks the engineered columns the models were trained on.
    train = parseData(train)
    test = parseData(test)
    train, test = washData(train, test)

    train, categorical_feats = feature(train)
    test, _ = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns
    # Guarantee the test frame has exactly the training columns, in order.
    test = test[features]
    return train, test, target, features, categorical_feats


train, test, target, features, categorical_feats = getData(feature)

# ---------------------------------------------------------------------------
# LightGBM, 5-fold cross-validation
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments of
# lgb.train and the ntree_limit / best_ntree_limit API of xgboost used below
# belong to older library releases - confirm the pinned versions.
# ---------------------------------------------------------------------------
params = {
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples': 20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

num_round = 10000
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
fold_importances = []
lgb_best_iters = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx],
                           categorical_feature=categorical_feats)

    clf = lgb.train(params, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500, early_stopping_rounds=200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx],
                                   num_iteration=clf.best_iteration)
    lgb_best_iters.append(clf.best_iteration or num_round)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions_lgb += clf.predict(
        test, num_iteration=clf.best_iteration) / folds.n_splits

# Concatenate once instead of growing a frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)
print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))

# ---------------------------------------------------------------------------
# XGBoost, 5-fold cross-validation
# ---------------------------------------------------------------------------
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'alpha': 0.2,
    'objective': 'reg:gamma',
    'eval_metric': 'rmse',
    'silent': True,
    'nthread': -1,
}

# XGBoost needs numeric inputs. Each encoder is fitted on the union of train
# and test values so a given category maps to the same integer in both sets.
# Only columns that actually exist are encoded.
for col in categorical_feats:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[col], test[col]]).astype(str))
    train[col] = encoder.transform(train[col].astype(str))
    test[col] = encoder.transform(test[col].astype(str))

folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))
xgb_best_iters = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold {}".format(fold_))
    trn_data = xgb.DMatrix(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]

    clf = xgb.train(dtrain=trn_data, num_boost_round=num_round,
                    evals=watchlist, early_stopping_rounds=200,
                    verbose_eval=1000, params=xgb_params)
    # KFold yields positional indices, so .iloc (not .loc) must be used.
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(train.iloc[val_idx]),
                                   ntree_limit=clf.best_ntree_limit)
    xgb_best_iters.append(clf.best_ntree_limit)

    predictions_xgb += clf.predict(
        xgb.DMatrix(test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV Score: {:<8.5f}".format(r2_score(target, oof_xgb)))

# ---------------------------------------------------------------------------
# Stacking: LightGBM as base learner, XGBoost as meta learner.
# The target is continuous, so a stacking *regressor* built from estimator
# objects is required (not the lgb module / a classifier).
# The number of boosting rounds is taken from the CV early-stopping results.
# ---------------------------------------------------------------------------
lgb_rounds = int(np.mean(lgb_best_iters))
xgb_rounds = int(np.mean(xgb_best_iters))

sclf = StackingRegressor(
    regressors=[lgb.LGBMRegressor(n_estimators=lgb_rounds, **params)],
    meta_regressor=xgb.XGBRegressor(
        n_estimators=xgb_rounds,
        learning_rate=xgb_params['eta'],
        max_depth=xgb_params['max_depth'],
        subsample=xgb_params['subsample'],
        colsample_bytree=xgb_params['colsample_bytree'],
        reg_alpha=xgb_params['alpha'],
        objective=xgb_params['objective'],
    ),
)
sclf_score = sclf.fit(train, target)
test_predict = sclf.predict(test)


def online_score(pred,
                 reference_path=r'C:\Users\lxc\Desktop\featurecup\sub_b_919.csv'):
    """Print prediction range and the R^2 against a reference submission.

    Args:
        pred: array of predictions.
        reference_path: header-less CSV holding the reference submission.

    Returns:
        The R^2 value that is printed.
    """
    print("预测结果最大值:{},预测结果最小值:{}".format(pred.max(), pred.min()))
    # Compare with the earlier submission stored in the reference file.
    conmbine1 = pd.read_csv(reference_path, engine="python", header=None)
    # NOTE(review): r2_score expects (y_true, y_pred); the original argument
    # order (pred first) is preserved so the reported figure is comparable.
    score1 = r2_score(pred, conmbine1)
    print("对比919分数:{}".format(score1))
    return score1


score = online_score(test_predict)

# Output recorded in the original post:
# 预测结果最大值:19051.067151217972,预测结果最小值:1199.97082591554
# 对比919分数:0.981891385946527
最新回复(0)