import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
# Placeholder label the raw data uses for an unknown rent type (runtime value).
_UNKNOWN_RENT_TYPE = '未知方式'
# Placeholder the raw data uses for a missing construction year (runtime value).
_UNKNOWN_BUILD_YEAR = '暂无信息'


def _split_house_type(house_type):
    """Parse strings such as ``'2室1厅1卫'`` into three integer count columns.

    The first three digit runs are used, so multi-digit counts such as
    ``'10室2厅3卫'`` are handled (the original read one character per count).

    Raises:
        ValueError: if any value does not contain three numbers.
    """
    counts = house_type.str.extract(r'(\d+)\D+(\d+)\D+(\d+)')
    if counts.isna().any().any():
        raise ValueError('houseType values must contain three counts, e.g. "2室1厅1卫"')
    return counts.astype(int)


def _impute_rent_type(df):
    """Replace the unknown rent type in ``df`` (in place) with a heuristic guess.

    The rules are applied in order and each one only touches rows that are
    still unknown, so the first matching rule wins.
    """
    # rentType is categorical here; assigning a label that is not a category
    # raises, so make sure both target labels exist first.
    missing = [label for label in ('整租', '合租')
               if label not in df['rentType'].cat.categories]
    if missing:
        df['rentType'] = df['rentType'].cat.add_categories(missing)

    rooms, area = df['室'], df['area']
    rules = [
        (rooms <= 1, '整租'),
        ((rooms > 1) & (area < 50), '合租'),
        (area / rooms < 20, '合租'),
        ((area <= 50) & (rooms == 2), '合租'),
        ((area > 60) & (rooms == 2), '整租'),
        ((area <= 60) & (rooms == 3), '合租'),
        ((area > 60) & (rooms == 3), '整租'),
        ((area >= 100) & (rooms > 3), '整租'),
    ]
    for matches, label in rules:
        still_unknown = df['rentType'] == _UNKNOWN_RENT_TYPE
        df.loc[still_unknown & matches, 'rentType'] = label


def parseData(df):
    """Preprocess the raw listing data.

    Steps, in order:
      * replace the ``'--'`` rent type with the unknown-type label;
      * split ``houseType`` into the integer columns ``室``/``厅``/``卫``
        (inserted at column positions 3, 4 and 5);
      * extract the month from ``tradeTime`` (``YYYY/MM/DD``) into ``交易月份``;
      * build ``houseType_1sumcsu`` = room count + community name;
      * apply ``log1p`` to the large-magnitude numeric columns;
      * convert the categorical columns to pandas ``category`` dtype;
      * fill missing ``pv``/``uv`` with the column mean and cast to int;
      * replace missing/placeholder ``buildYear`` with the most frequent year;
      * guess unknown rent types from room count and area;
      * add the per-community mean of ``area`` as ``areamean小区名``;
      * drop ``city``, ``houseToward``, ``houseDecoration`` and ``ID``.

    Args:
        df: raw DataFrame as read from the CSV file. It is not modified.

    Returns:
        A new, preprocessed DataFrame with the same row index as ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # .loc instead of chained indexing: the chained form may write to a
    # temporary copy and silently do nothing.
    df.loc[df['rentType'] == '--', 'rentType'] = _UNKNOWN_RENT_TYPE

    counts = _split_house_type(df['houseType'])
    for offset, label in enumerate(('室', '厅', '卫')):
        df.insert(3 + offset, label, counts[offset])

    df['交易月份'] = df['tradeTime'].str.split('/').str[1].astype(int)
    df['houseType_1sumcsu'] = df['室'].astype(str) + df['communityName'].astype(str)

    big_num_cols = ['totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'totalNewTradeMoney', 'totalNewTradeArea',
                    'tradeNewMeanPrice', 'remainNewNum', 'supplyNewNum', 'supplyLandArea',
                    'tradeLandArea', 'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers',
                    'residentPopulation', 'pv', 'uv']
    for col in big_num_cols:
        df[col] = np.log1p(df[col])

    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate',
               'houseType_1sumcsu']
    df = df.astype({col: 'category' for col in columns})

    # NOTE(review): pv/uv were already log-transformed above, so the integer
    # cast truncates the log values. Kept as in the original — confirm intent.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Placeholder (and missing) years become NaN, then take the modal year.
    years = pd.to_numeric(df['buildYear'].where(df['buildYear'] != _UNKNOWN_BUILD_YEAR))
    df['buildYear'] = years.fillna(years.mode().iloc[0]).astype('int')

    _impute_rent_type(df)

    # transform() broadcasts the group mean back onto the rows; the original
    # used the dict-renaming agg() form, which current pandas rejects.
    for item in ['area']:
        df[item + 'mean小区名'] = df.groupby('communityName', observed=True)[item].transform('mean')

    df = df.drop(columns=['city', 'houseToward', 'houseDecoration', 'ID'])
    return df
def washData(df_train, df_test):
    """Remove outlier rows from the training data.

    A training row is kept only when all of the following hold:
      * ``6 < area <= 200``;
      * ``tradeMoney <= 100000``;
      * price per unit area (``tradeMoney / area``) is neither above 300
        nor below 25;
      * ``houseType`` is not ``'0室0厅1卫'``;
      * ``totalFloor`` is not 0;
      * it is not a listing with ``tradeMoney > 25000`` and ``area < 100``.

    Rows are selected with a boolean mask rather than by dropping index
    labels, so a non-unique index cannot cause unrelated rows that share a
    label to be removed.

    Args:
        df_train: training frame with ``area``, ``tradeMoney``, ``houseType``
            and ``totalFloor`` columns. It is not modified.
        df_test: test frame; returned unchanged.

    Returns:
        Tuple ``(filtered_train, df_test)``.
    """
    area = df_train['area']
    money = df_train['tradeMoney']
    unit_price = money / area

    # The first two conditions are positive filters (a NaN fails them); the
    # rest are "drop if true" rules, so a NaN comparison keeps the row —
    # the same semantics as the original sequential filters.
    keep = (
        (area > 6) & (area <= 200)
        & (money <= 100000)
        & ~(unit_price > 300)
        & ~(unit_price < 25)
        & ~(df_train['houseType'] == '0室0厅1卫')
        & ~(df_train['totalFloor'] == 0)
        & ~((money > 25000) & (area < 100))
    )
    # NOTE(review): the original also dropped rows with tradeMoney < 75000 and
    # area > 800. That can never match after the area <= 200 filter, so it is
    # omitted here; the 800 threshold may have been a typo — confirm.

    return df_train.loc[keep].copy(), df_test
def feature(df):
    """Build the model features from a preprocessed frame.

    Adds the aggregate columns
      * ``traffic``       = subway + bus station counts,
      * ``edu``           = international + regular + private school counts,
      * ``tradeMoneynew`` = ``tradeMeanPrice / tradeNewMeanPrice``,
      * ``meanarea``      = ``totalTradeArea / tradeSecNum``,
      * ``meanNewarea``   = ``totalNewTradeArea / tradeNewNum``,
    and drops ``houseType``, ``tradeTime`` and the raw count, land, supply
    and traffic-statistics columns.

    The original also computed ``livecondition``, ``pepleroute`` and
    ``lostnum`` and then dropped them again; those dead computations are
    omitted, which leaves the returned frame unchanged.

    Args:
        df: preprocessed DataFrame. It is not modified (the original dropped
            ``houseType`` and ``tradeTime`` from the caller's frame in place).

    Returns:
        Tuple ``(features_frame, categorical_feats)`` where
        ``categorical_feats`` is the list of categorical column names.
    """
    df = df.drop(columns=['houseType', 'tradeTime'])

    df['traffic'] = df['subwayStationNum'] + df['busStationNum']
    df['edu'] = df['interSchoolNum'] + df['schoolNum'] + df['privateSchoolNum']
    # NOTE(review): a zero denominator yields inf/NaN in these ratios; they
    # are passed through unchanged, as in the original.
    df['tradeMoneynew'] = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['meanarea'] = df['totalTradeArea'] / df['tradeSecNum']
    df['meanNewarea'] = df['totalNewTradeArea'] / df['tradeNewNum']

    df = df.drop(columns=[
        # neighbourhood facility counts (partly folded into traffic / edu)
        'subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum',
        'privateSchoolNum', 'drugStoreNum', 'bankNum', 'shopNum', 'parkNum',
        'mallNum', 'superMarketNum', 'hospitalNum', 'gymNum',
        # land / supply / traffic statistics not used by the model
        'tradeLandNum', 'landMeanPrice', 'supplyLandNum', 'landTotalPrice',
        'tradeLandArea', 'supplyNewNum', 'supplyLandArea', 'region',
        'tradeNewNum', 'lookNum', 'uv', 'saleSecHouseNum', 'pv',
    ])

    categorical_feats = ['rentType', 'houseFloor', 'plate', 'communityName', 'houseType_1sumcsu']
    return df, categorical_feats
def getData(feature,
            train_path=r'C:\Users\lxc\Desktop\featurecup\train_data.csv',
            test_path=r'C:\Users\lxc\Desktop\featurecup\test_b.csv'):
    """Load the CSV files and turn them into model-ready matrices.

    Both frames go through ``parseData`` and ``feature``. The original only
    preprocessed the training frame, so the test frame kept its raw columns
    and lacked the derived ones (for example ``houseType_1sumcsu``), which
    made it unusable for prediction. Outlier removal (``washData``) still
    only affects the training frame.

    Args:
        feature: callable taking a preprocessed frame and returning
            ``(frame, categorical_feature_names)``.
        train_path: path of the training CSV (defaults to the original
            hard-coded location).
        test_path: path of the test CSV (defaults to the original hard-coded
            location).

    Returns:
        Tuple ``(train, test, target, features, categorical_feats)``:
        the training features, the test features restricted to the same
        columns in the same order, the ``tradeMoney`` target popped from the
        training frame, the feature column index, and the categorical column
        names.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    train = parseData(train)
    test = parseData(test)
    train, test = washData(train, test)

    train, categorical_feats = feature(train)
    test, _ = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns
    # Align the test matrix with the training columns; a column missing from
    # the test data raises a KeyError here instead of failing at predict time.
    test = test[list(features)]

    return train, test, target, features, categorical_feats
# ---------------------------------------------------------------------------
# LightGBM: 5-fold cross-validation with out-of-fold and test predictions.
# ---------------------------------------------------------------------------
train, test, target, features, categorical_feats = getData(feature)

params = {
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples': 20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))            # out-of-fold predictions for the training rows
predictions_lgb = np.zeros(len(test))     # test predictions averaged over the folds
feature_importance_df = pd.DataFrame()    # per-fold feature importances, stacked

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold {}".format(fold_))

    # The fold indices are positional, hence .iloc.
    trn_x, trn_y = train.iloc[trn_idx], target.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], target.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x, label=trn_y, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(val_x, label=val_y, categorical_feature=categorical_feats)

    num_round = 10000
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer ones expect callbacks instead —
    # confirm against the installed version.
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=500,
        early_stopping_rounds=200,
    )
    oof_lgb[val_idx] = clf.predict(val_x, num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    })
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    fold_test_pred = clf.predict(test, num_iteration=clf.best_iteration)
    predictions_lgb += fold_test_pred / folds.n_splits

print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))
from sklearn.preprocessing import LabelEncoder

xgb_params = {'eta': 0.05, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 0.5, 'alpha': 0.2,
              'objective': 'reg:gamma', 'eval_metric': 'rmse', 'silent': True, 'nthread': -1
              }

# Integer-encode the categorical columns before they are handed to XGBoost.
#
# Changes from the original:
#   * The encoder is fitted once per column on the union of the train and
#     test values, so a given label gets the same integer in both frames.
#     The original re-fitted on the test frame, giving inconsistent codes.
#   * Columns absent from the training frame are skipped. The original listed
#     houseType_2sumcsu / houseType_3sumcsu, which nothing in this file
#     creates, and therefore raised a KeyError.
label_encoded_cols = ['rentType', 'houseFloor', 'communityName', 'plate',
                      'houseType_1sumcsu', 'houseType_2sumcsu', 'houseType_3sumcsu']
le = LabelEncoder()
for col in label_encoded_cols:
    if col not in train.columns:
        continue
    encode_test = col in test.columns
    # Cast to str so categorical dtype and missing values are handled uniformly.
    train_labels = train[col].astype(str)
    all_labels = [train_labels]
    if encode_test:
        test_labels = test[col].astype(str)
        all_labels.append(test_labels)
    le.fit(pd.concat(all_labels))
    train[col] = le.transform(train_labels)
    if encode_test:
        test[col] = le.transform(test_labels)
import xgboost as xgb

# ---------------------------------------------------------------------------
# XGBoost: 5-fold cross-validation with out-of-fold and test predictions.
# ---------------------------------------------------------------------------
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_xgb = np.zeros(len(train))            # out-of-fold predictions for the training rows
predictions_xgb = np.zeros(len(test))     # test predictions averaged over the folds
feature_importance_df = pd.DataFrame()

# The test matrix does not change between folds, so build it once.
test_matrix = xgb.DMatrix(test)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold {}".format(fold_))
    trn_data = xgb.DMatrix(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx], label=target.iloc[val_idx])

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    num_round = 10000
    clf = xgb.train(dtrain=trn_data, num_boost_round=num_round, evals=watchlist,
                    early_stopping_rounds=200, verbose_eval=1000, params=xgb_params)
    # The booster is an XGBoost model; `xgb_model` is the name the stacking
    # step refers to. `lgb_model` is kept as an alias because the original
    # script bound the booster to that name.
    xgb_model = clf
    lgb_model = clf

    # Predict on the validation matrix built with positional indexing. The
    # original used train.loc[val_idx], which treats the positional fold
    # indices as index labels and breaks once rows have been filtered out.
    # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
    # releases — confirm against the installed version.
    oof_xgb[val_idx] = clf.predict(val_data, ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(test_matrix, ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV Score: {:<8.5f}".format(r2_score(target, oof_xgb)))
from mlxtend.classifier import StackingClassifier  # noqa: F401  (kept: imported by the original script)
from mlxtend.regressor import StackingRegressor
from sklearn.metrics import r2_score

# Stack a LightGBM base model under an XGBoost meta model.
#
# The original call could not run: it passed the `lgb` *module* as a base
# estimator, referenced an undefined `xgb_model`, and used a classifier for
# the continuous tradeMoney target. A regression stack over the scikit-learn
# wrappers of both libraries is used instead, configured from the same
# parameter dictionaries as the cross-validation runs above.
# NOTE(review): n_estimators is left at each library's default; the CV runs
# used up to 10000 rounds with early stopping — tune before relying on this.
sclf = StackingRegressor(
    regressors=[lgb.LGBMRegressor(**params)],
    meta_regressor=xgb.XGBRegressor(**xgb_params),
)
sclf_score = sclf.fit(train, target)   # fit() returns the fitted estimator
test_predict = sclf.predict(test)
def online_score(pred, reference_path=r'C:\Users\lxc\Desktop\featurecup\sub_b_919.csv'):
    """Compare predictions with a stored reference submission.

    Prints the maximum and minimum of ``pred`` and the R² score between
    ``pred`` and the values read from ``reference_path``.

    Args:
        pred: array-like of predictions exposing ``max()`` and ``min()``.
        reference_path: header-less CSV holding the reference submission
            (defaults to the original hard-coded file).

    Returns:
        The R² score. The original returned ``None`` although the caller
        stores the result.
    """
    print("预测结果最大值:{},预测结果最小值:{}".format(pred.max(), pred.min()))
    conmbine1 = pd.read_csv(reference_path, engine="python", header=None)
    # NOTE(review): r2_score's signature is (y_true, y_pred); here the
    # predictions are passed first and the reference second. Kept as in the
    # original so reported numbers stay comparable — confirm the intended order.
    score1 = r2_score(pred, conmbine1)
    print("对比919分数:{}".format(score1))
    return score1
# Print the prediction range and the comparison against the reference
# submission; `score` holds whatever online_score returns.
score = online_score(test_predict)
# Sample output of the run above:
#   预测结果最大值:19051.067151217972,预测结果最小值:1199.97082591554
#   对比919分数:0.981891385946527
# 转载请注明原文地址 (original source; please credit when reposting): https://win8.8miu.com/read-1547008.html