Python#单变量分析包

it2022-05-05 189

写在前面

1、目的

建模经常要做单变量分析，就把单变量分析要用到的函数全部封装为一个包fctn，其中的函数包含：

1）count_iv(df, refuse, x)：每个值为一组 2）qcut_iv(df, refuse, x, n)：n等分进行分组 3）customcut_iv(df, refuse, x, group)：自定义区间进行分组 4）autocut_iv(df, refuse, x, percent=1)：自动合并占比低于 percent（单位：%）的组后进行分组

2、用法

将Ipynb_importer.py与fctn.ipynb放入同一文件夹内

在建模文件中：

import Ipynb_importer
import fctn

可以直接从fctn中调用函数，例如：

fctn.count_iv(alldata, 'respond', 'ini_lvl')

3、输出结果

fctn.qcut_iv(result, 'target', 'province_br',5)

代码

fctn.ipynb

import math

import numpy as np
import pandas as pd

try:
    import seaborn as sns
except ImportError:
    # Plotting is a convenience only; the tables are still produced when
    # seaborn is not installed.
    sns = None


def _iv_table(target, groups, meany):
    """Return the per-group count / bad-rate / WoE / IV table.

    Args:
        target: Series holding the binary outcome (1 or True = bad).
        groups: Series aligned with ``target`` whose values are the group
            labels; its ``name`` becomes the first column of the result.
        meany: overall mean of ``target``, used for the two flag columns.

    Returns:
        DataFrame with one row per group and the columns
        ``<groups.name>, count, mean, percent, 是否初步显著, 是否初步拒绝,
        gr, br, WoE, cIV``.

    Raises:
        ValueError: if ``target`` does not contain both outcomes, because
            the good/bad distributions cannot be normalised then.
    """
    bad_total = target.sum()
    good_total = target.count() - bad_total
    if not (good_total > 0 and bad_total > 0):
        raise ValueError(
            'target column must contain both good (0) and bad (1) rows'
        )

    # observed=False keeps empty categorical bins (e.g. an unused 'Nan'
    # bucket) in the table regardless of the pandas default.
    t = (
        target.groupby(groups, observed=False)
        .agg(['count', 'mean'])
        .reset_index()
    )
    t['percent'] = t['count'] / groups.count() * 100

    relative_gap = (t['mean'] - meany).abs() / meany
    t['是否初步显著'] = ((relative_gap >= 0.2) & (t['percent'] >= 2)).astype(int)
    t['是否初步拒绝'] = (t['mean'] > 2 * meany).astype(int)

    # Share of all goods / all bads that fall into each group.
    t['gr'] = (1 - t['mean']) * t['count'] / good_total
    t['br'] = t['mean'] * t['count'] / bad_total

    # WoE and IV are only defined when both shares are positive; other rows
    # are reported as 0. Masking the ratio to 1 makes log() yield exactly 0.
    usable = (t['gr'] > 0) & (t['br'] > 0)
    log_ratio = np.log((t['gr'] / t['br']).where(usable, 1.0))
    t['WoE'] = log_ratio * 100
    t['cIV'] = ((t['gr'] - t['br']) * log_ratio).where(usable, 0.0)
    return t


def _plot_rates(t, col):
    """Draw the per-group bad rate as a bar chart when seaborn is available."""
    if sns is not None:
        sns.barplot(x=col, y='mean', data=t)


def count_iv(df, refuse, x):
    """Compute the IV table treating every distinct value of ``x`` as a group.

    Missing values of ``x`` form their own group labelled ``'Nan'``. The
    input frame is not modified.

    Args:
        df: source DataFrame.
        refuse: name of the binary target column (1 = bad).
        x: name of the column to analyse.

    Returns:
        The table described in ``_iv_table``; its first column is ``x``.

    Raises:
        ValueError: if the target does not contain both outcomes.
    """
    meany = df[refuse].mean()
    # Work on a filled copy so the caller's column keeps its NaNs and dtype.
    groups = df[x].fillna('Nan')
    t = _iv_table(df[refuse], groups, meany)
    print('meany:', meany, 'IV值:', t['cIV'].sum())
    _plot_rates(t, x)
    return t


def qcut_iv(df, refuse, x, n):
    """Compute the IV table after cutting ``x`` into ``n`` quantile bins.

    Missing values of ``x`` go to an extra ``'Nan'`` bin. As a side effect
    the binned column is stored on ``df`` as ``<x>_qcut``.

    Args:
        df: source DataFrame; gains the ``<x>_qcut`` column.
        refuse: name of the binary target column (1 = bad).
        x: name of the numeric column to bin.
        n: number of quantile bins passed to ``pandas.qcut``.

    Returns:
        The table described in ``_iv_table``; its first column is
        ``<x>_qcut``.

    Raises:
        ValueError: if the target does not contain both outcomes.
    """
    meany = df[refuse].mean()
    x1 = x + '_qcut'
    bins = pd.qcut(df[x], n)
    df[x1] = bins.cat.add_categories(['Nan']).fillna('Nan')
    t = _iv_table(df[refuse], df[x1], meany)
    print('meany:', meany, 'IV值:', t['cIV'].sum())
    _plot_rates(t, x1)
    return t


def autocut_iv(df, refuse, x, percent=1):
    """Compute the IV table after merging all small groups of ``x``.

    Every distinct value of ``x`` (missing values count as ``'Nan'``) whose
    share of rows is below ``percent`` is merged into a single group named
    ``'<min>-<max>'`` of the merged values. The input frame is not modified.

    Args:
        df: source DataFrame.
        refuse: name of the binary target column (1 = bad).
        x: name of the column to analyse.
        percent: share threshold in percent; groups below it are merged.

    Returns:
        The table described in ``_iv_table``; its first column is
        ``<x>_group``.

    Raises:
        ValueError: if the target does not contain both outcomes.
    """
    meany = df[refuse].mean()
    target = df[refuse]
    filled = df[x].fillna('Nan')

    share = target.groupby(filled).count() / filled.count() * 100
    small_keys = pd.Series(share.index[(share < percent).to_numpy()])
    try:
        lowest, highest = small_keys.min(), small_keys.max()
    except TypeError:
        # Mixed labels (numbers plus the 'Nan' placeholder) cannot be
        # ordered directly; fall back to ordering their text form.
        as_text = small_keys.astype(str)
        lowest, highest = as_text.min(), as_text.max()
    merge_group_name = str(lowest) + '-' + str(highest)

    x1 = x + '_' + 'group'
    is_small = filled.isin(small_keys)
    if is_small.any():
        # Build the relabelled column in one step; chained assignment such
        # as ``t[col][mask] = value`` is a silent no-op under Copy-on-Write.
        merged = filled.astype(object).where(~is_small, merge_group_name)
    else:
        merged = filled
    merged = merged.rename(x1)

    t = _iv_table(target, merged, meany)
    print('meany:', meany)
    print('IV:', t['cIV'].sum())
    _plot_rates(t, x1)
    return t


def customcut_iv(df, refuse, x, group):
    """Compute the IV table after cutting ``x`` at user-supplied edges.

    Bins are right-closed. Values that fall outside the edges, and missing
    values, go to an extra ``'Nan'`` bin. As a side effect the binned column
    is stored on ``df`` as ``<x>_group``.

    Args:
        df: source DataFrame; gains the ``<x>_group`` column.
        refuse: name of the binary target column (1 = bad).
        x: name of the numeric column to bin.
        group: bin specification passed to ``pandas.cut`` (for example a
            list of edges; ``float('-inf')`` / ``float('inf')`` may be used
            for open ends).

    Returns:
        The table described in ``_iv_table``; its first column is
        ``<x>_group``.

    Raises:
        ValueError: if the target does not contain both outcomes.
    """
    meany = df[refuse].mean()
    x1 = x + '_group'
    bins = pd.cut(df[x], group, right=True)
    df[x1] = bins.cat.add_categories(['Nan']).fillna('Nan')
    t = _iv_table(df[refuse], df[x1], meany)
    print('meany:', meany)
    print('IV:', t['cIV'].sum())
    _plot_rates(t, x1)
    return t

Ipynb_importer.py

"""Import hook that lets ``import name`` load ``name.ipynb`` as a module."""
import importlib.util
import io
import os
import sys
import types

from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell
from nbformat import read


def find_notebook(fullname, path=None):
    """Find a notebook, given its fully qualified name and an optional path.

    This turns "foo.bar" into "foo/bar.ipynb" and tries turning "Foo_Bar"
    into "Foo Bar" if Foo_Bar does not exist.

    Args:
        fullname: dotted module name being imported.
        path: iterable of directories to search; the current directory is
            searched when it is empty or None.

    Returns:
        The path of the first matching ``.ipynb`` file, or None.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    for d in path:
        nb_path = os.path.join(d, name + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
        # Let ``import Notebook_Name`` find "Notebook Name.ipynb". Only the
        # file name is rewritten so underscores in directories are kept.
        nb_path = os.path.join(d, name.replace("_", " ") + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
    return None


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""

    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path

    def create_module(self, spec):
        """Use the default module object created by the import system."""
        return None

    def exec_module(self, module):
        """Run the notebook's code cells inside ``module`` (PEP 451 entry)."""
        spec = getattr(module, '__spec__', None)
        path = getattr(spec, 'origin', None)
        if not path:
            path = find_notebook(module.__name__, self.path)
        self._run_notebook(module, path)

    def load_module(self, fullname):
        """import a notebook as a module

        Legacy entry point kept for callers that invoke the loader directly;
        the import system itself uses create_module/exec_module.
        """
        path = find_notebook(fullname, self.path)
        mod = types.ModuleType(fullname)
        mod.__loader__ = self
        sys.modules[fullname] = mod
        try:
            self._run_notebook(mod, path)
        except BaseException:
            # Do not leave a half-initialised module importable.
            sys.modules.pop(fullname, None)
            raise
        return mod

    def _run_notebook(self, mod, path):
        """Execute every code cell of the notebook at ``path`` in ``mod``."""
        if not path:
            raise ImportError("no notebook found for module %r" % mod.__name__)
        print("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        mod.__file__ = path
        mod.__dict__['get_ipython'] = get_ipython

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__
        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(
                        cell.source
                    )
                    # NOTE: like any import, this executes the notebook's
                    # code; only import notebooks you trust.
                    exec(code, mod.__dict__)
        finally:
            self.shell.user_ns = save_user_ns


class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks"""

    def __init__(self):
        self.loaders = {}

    def _loader_for(self, path):
        """Return the cached loader for ``path``, creating it on first use."""
        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)
        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_spec(self, fullname, path=None, target=None):
        """Return a module spec for a notebook, or None when none matches.

        Python 3.12+ only consults ``find_spec`` on meta path finders, so
        this method is what makes the hook work there.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None
        return importlib.util.spec_from_loader(
            fullname, self._loader_for(path), origin=nb_path
        )

    def find_module(self, fullname, path=None):
        """Legacy finder entry point; returns a loader or None."""
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None
        return self._loader_for(path)


sys.meta_path.append(NotebookFinder())

专利

最新回复(0)