写在前面
1、目的
建模经常要做单变量分析,就把单变量分析要用到的函数全部封装为一个包fctn,其中的函数包含:
1)count_iv(df, refuse, x):每个取值单独作为一组 2)qcut_iv(df, refuse, x, n):按分位数n等分进行分组 3)customcut_iv(df, refuse, x, group):按自定义区间进行分组 4)autocut_iv(df, refuse, x, percent = 1):自动将占比低于percent(单位:%)的组合并后进行分组
2、用法
将Ipynb_importer.py与fctn.ipynb放入同一文件夹内
在建模文件中:
import Ipynb_importer
import fctn
可以直接从fctn中调用函数,例如:
fctn.count_iv(alldata, 'respond', 'ini_lvl')
3、输出结果
fctn.qcut_iv(result, 'target', 'province_br',5)
代码
fctn.ipynb
import pandas
as pd
import numpy
as np
import seaborn
as sns
import math
def count_iv(df, refuse, x):
    """Build a per-value bad-rate / WoE / IV table for column ``x`` and plot it.

    Every distinct value of ``df[x]`` forms its own group; missing values are
    collected under the literal label ``'Nan'``.

    Args:
        df: pandas DataFrame holding the feature and the target columns.
        refuse: name of the target column. The gr/br formulas below treat it
            as a 0/1 flag whose mean is the bad rate.
        x: name of the feature column to analyse.

    Returns:
        DataFrame with one row per group and the columns ``x``, ``count``,
        ``mean``, ``percent``, ``是否初步显著``, ``是否初步拒绝``, ``gr``,
        ``br``, ``WoE`` and ``cIV``.

    Side effects:
        Prints the overall target mean and the summed IV, and draws a seaborn
        bar plot of the group means. ``df`` itself is not modified.
    """
    meany = df[refuse].mean()

    # Label missing values on a separate Series so the caller's column keeps
    # its original NaNs and dtype.
    values = df[x].fillna('Nan')
    t = df[refuse].groupby(values).agg(['count', 'mean']).reset_index()
    t['percent'] = t['count'] / values.count() * 100

    # Flag groups whose mean deviates >= 20% from the overall mean while
    # holding at least 2% of the rows.
    rel_dev = (t['mean'] - meany).abs() / meany
    t['是否初步显著'] = ((rel_dev >= 0.2) & (t['percent'] >= 2)).astype(int)
    # Flag groups whose mean is more than twice the overall mean.
    t['是否初步拒绝'] = (t['mean'] > 2 * meany).astype(int)

    # Totals derived from sum/count instead of value_counts()[0]/[1], which
    # raises KeyError as soon as one of the two classes is absent.
    bad_total = df[refuse].sum()
    good_total = df[refuse].count() - bad_total
    t['gr'] = (1 - t['mean']) * t['count'] / good_total
    t['br'] = t['mean'] * t['count'] / bad_total

    # WoE / IV contribution are defined only when both shares are positive;
    # every other row gets 0 (log(1 / 1) == 0 for the masked entries).
    valid = (t['gr'] > 0) & (t['br'] > 0)
    log_ratio = np.log(t['gr'].where(valid, 1.0) / t['br'].where(valid, 1.0))
    t['WoE'] = log_ratio * 100
    t['cIV'] = (t['gr'] - t['br']).where(valid, 0.0) * log_ratio

    print('meany:', meany, 'IV值:', t['cIV'].sum())
    sns.barplot(x=x, y='mean', data=t)
    return t
def qcut_iv(df, refuse, x, n):
    """Bin column ``x`` with ``pd.qcut`` and build the bad-rate / WoE / IV table.

    Args:
        df: pandas DataFrame holding the feature and the target columns.
        refuse: name of the target column. The gr/br formulas below treat it
            as a 0/1 flag whose mean is the bad rate.
        x: name of the feature column to bin.
        n: second argument handed to ``pd.qcut`` (number of quantile bins).

    Returns:
        DataFrame with one row per bin (plus the ``'Nan'`` category) and the
        columns ``<x>_qcut``, ``count``, ``mean``, ``percent``,
        ``是否初步显著``, ``是否初步拒绝``, ``gr``, ``br``, ``WoE`` and ``cIV``.

    Side effects:
        Adds (or overwrites) the column ``<x>_qcut`` on ``df``, prints the
        overall target mean and the summed IV, and draws a seaborn bar plot.
    """
    meany = df[refuse].mean()

    x1 = x + '_qcut'
    # Rows that qcut could not place in a bin go to an explicit 'Nan' category.
    df[x1] = pd.qcut(df[x], n).cat.add_categories(['Nan']).fillna('Nan')

    # observed=False is spelled out so that empty categories keep their row
    # regardless of the pandas version's default for categorical groupers.
    t = df.groupby(x1, observed=False)[refuse].agg(['count', 'mean']).reset_index()
    t['percent'] = t['count'] / df[x1].count() * 100

    # Flag bins whose mean deviates >= 20% from the overall mean while
    # holding at least 2% of the rows.
    rel_dev = (t['mean'] - meany).abs() / meany
    t['是否初步显著'] = ((rel_dev >= 0.2) & (t['percent'] >= 2)).astype(int)
    # Flag bins whose mean is more than twice the overall mean.
    t['是否初步拒绝'] = (t['mean'] > 2 * meany).astype(int)

    # Totals derived from sum/count instead of value_counts()[0]/[1], which
    # raises KeyError as soon as one of the two classes is absent.
    bad_total = df[refuse].sum()
    good_total = df[refuse].count() - bad_total
    t['gr'] = (1 - t['mean']) * t['count'] / good_total
    t['br'] = t['mean'] * t['count'] / bad_total

    # WoE / IV contribution are defined only when both shares are positive;
    # every other row gets 0 (log(1 / 1) == 0 for the masked entries).
    valid = (t['gr'] > 0) & (t['br'] > 0)
    log_ratio = np.log(t['gr'].where(valid, 1.0) / t['br'].where(valid, 1.0))
    t['WoE'] = log_ratio * 100
    t['cIV'] = (t['gr'] - t['br']).where(valid, 0.0) * log_ratio

    print('meany:', meany, 'IV值:', t['cIV'].sum())
    sns.barplot(x=x1, y='mean', data=t)
    return t
def autocut_iv(df, refuse, x, percent=1):
    """Merge rare values of ``x`` into one group, then build the IV table.

    Each distinct value of ``df[x]`` (missing values become ``'Nan'``) whose
    share of rows is below ``percent`` is relabelled with a single merged
    label ``"<min>-<max>"`` built from the smallest and largest rare value.
    The bad-rate / WoE / IV table is then computed on the merged grouping.

    Args:
        df: pandas DataFrame holding the feature and the target columns.
        refuse: name of the target column. The gr/br formulas below treat it
            as a 0/1 flag whose mean is the bad rate.
        x: name of the feature column to analyse.
        percent: threshold in percent of rows; groups strictly below it are
            merged.

    Returns:
        DataFrame with one row per (merged) group and the columns
        ``<x>_group``, ``count``, ``mean``, ``percent``, ``是否初步显著``,
        ``是否初步拒绝``, ``gr``, ``br``, ``WoE`` and ``cIV``.

    Side effects:
        Prints the overall target mean and the summed IV, and draws a seaborn
        bar plot of the group means. ``df`` itself is not modified.
    """
    meany = df[refuse].mean()

    # Label missing values on a separate Series so the caller's column keeps
    # its original NaNs and dtype.
    values = df[x].fillna('Nan')

    # First pass: share of rows held by every raw value.
    raw = df[refuse].groupby(values).agg(['mean', 'count']).reset_index()
    raw['percent'] = raw['count'] / values.count() * 100
    small_values = raw.loc[raw['percent'] < percent, x]

    try:
        p_min, p_max = small_values.min(), small_values.max()
    except TypeError:
        # Numbers mixed with the 'Nan' placeholder cannot be ordered directly;
        # order them by their string form just to name the merged group.
        p_min, p_max = min(small_values, key=str), max(small_values, key=str)
    merge_group_name = str(p_min) + '-' + str(p_max)

    # Relabel rare values directly. The previous chained assignment
    # (t[col][mask] = ...) is a silent no-op under pandas Copy-on-Write, and
    # the merge it fed broke when df already carried an '<x>_group' column.
    x1 = x + '_' + 'group'
    grouped = values.where(~values.isin(small_values), merge_group_name).rename(x1)

    # Second pass: statistics on the merged grouping.
    t = df[refuse].groupby(grouped).agg(['count', 'mean']).reset_index()
    t['percent'] = t['count'] / grouped.count() * 100

    # Flag groups whose mean deviates >= 20% from the overall mean while
    # holding at least 2% of the rows.
    rel_dev = (t['mean'] - meany).abs() / meany
    t['是否初步显著'] = ((rel_dev >= 0.2) & (t['percent'] >= 2)).astype(int)
    # Flag groups whose mean is more than twice the overall mean.
    t['是否初步拒绝'] = (t['mean'] > 2 * meany).astype(int)

    # Totals derived from sum/count instead of value_counts()[0]/[1], which
    # raises KeyError as soon as one of the two classes is absent.
    bad_total = df[refuse].sum()
    good_total = df[refuse].count() - bad_total
    t['gr'] = (1 - t['mean']) * t['count'] / good_total
    t['br'] = t['mean'] * t['count'] / bad_total

    # WoE / IV contribution are defined only when both shares are positive;
    # every other row gets 0 (log(1 / 1) == 0 for the masked entries).
    valid = (t['gr'] > 0) & (t['br'] > 0)
    log_ratio = np.log(t['gr'].where(valid, 1.0) / t['br'].where(valid, 1.0))
    t['WoE'] = log_ratio * 100
    t['cIV'] = (t['gr'] - t['br']).where(valid, 0.0) * log_ratio

    print('meany:', meany)
    print('IV:', t['cIV'].sum())
    sns.barplot(x=x1, y='mean', data=t)
    return t
def customcut_iv(df, refuse, x, group):
    """Bin column ``x`` with ``pd.cut`` and build the bad-rate / WoE / IV table.

    Args:
        df: pandas DataFrame holding the feature and the target columns.
        refuse: name of the target column. The gr/br formulas below treat it
            as a 0/1 flag whose mean is the bad rate.
        x: name of the feature column to bin.
        group: bin specification handed to ``pd.cut`` as its second argument
            (intervals are closed on the right, ``right=True``).

    Returns:
        DataFrame with one row per bin (plus the ``'Nan'`` category) and the
        columns ``<x>_group``, ``count``, ``mean``, ``percent``,
        ``是否初步显著``, ``是否初步拒绝``, ``gr``, ``br``, ``WoE`` and ``cIV``.

    Side effects:
        Adds (or overwrites) the column ``<x>_group`` on ``df``, prints the
        overall target mean and the summed IV, and draws a seaborn bar plot.
    """
    meany = df[refuse].mean()

    x1 = x + '_group'
    # Rows that fall outside every interval go to an explicit 'Nan' category.
    df[x1] = pd.cut(df[x], group, right=True).cat.add_categories(['Nan']).fillna('Nan')

    # observed=False is spelled out so that empty categories keep their row
    # regardless of the pandas version's default for categorical groupers.
    t = df.groupby(x1, observed=False)[refuse].agg(['count', 'mean']).reset_index()
    t['percent'] = t['count'] / df[x1].count() * 100

    # Flag bins whose mean deviates >= 20% from the overall mean while
    # holding at least 2% of the rows.
    rel_dev = (t['mean'] - meany).abs() / meany
    t['是否初步显著'] = ((rel_dev >= 0.2) & (t['percent'] >= 2)).astype(int)
    # Flag bins whose mean is more than twice the overall mean.
    t['是否初步拒绝'] = (t['mean'] > 2 * meany).astype(int)

    # Totals derived from sum/count instead of value_counts()[0]/[1], which
    # raises KeyError as soon as one of the two classes is absent.
    bad_total = df[refuse].sum()
    good_total = df[refuse].count() - bad_total
    t['gr'] = (1 - t['mean']) * t['count'] / good_total
    t['br'] = t['mean'] * t['count'] / bad_total

    # WoE / IV contribution are defined only when both shares are positive;
    # every other row gets 0 (log(1 / 1) == 0 for the masked entries).
    valid = (t['gr'] > 0) & (t['br'] > 0)
    log_ratio = np.log(t['gr'].where(valid, 1.0) / t['br'].where(valid, 1.0))
    t['WoE'] = log_ratio * 100
    t['cIV'] = (t['gr'] - t['br']).where(valid, 0.0) * log_ratio

    print('meany:', meany)
    print('IV:', t['cIV'].sum())
    sns.barplot(x=x1, y='mean', data=t)
    return t
Ipynb_importer.py
import io
, os
,sys
,types
from IPython
import get_ipython
from nbformat
import read
from IPython
.core
.interactiveshell
import InteractiveShell
class NotebookFinder(object):
    """Meta-path finder that locates Jupyter notebooks as importable modules.

    One ``NotebookLoader`` is created per search path and cached in
    ``self.loaders`` so repeated imports from the same location share it.
    """

    def __init__(self):
        # Maps a search-path key (joined path entries, or None) to its loader.
        self.loaders = {}

    def _loader_for(self, fullname, path):
        """Return the cached loader for ``path`` if a notebook exists, else None."""
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None

        # Path lists are not hashable, so they are collapsed into one string;
        # any empty/missing path shares the None key.
        key = os.path.sep.join(path) if path else None
        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_module(self, fullname, path=None):
        """Legacy finder hook: return a loader for ``fullname`` or None."""
        return self._loader_for(fullname, path)

    def find_spec(self, fullname, path=None, target=None):
        """Spec-based finder hook, the only one Python 3.12+ still calls.

        Returns a ``ModuleSpec`` wrapping the notebook loader, or None when no
        matching notebook is found. ``target`` is accepted for protocol
        compatibility and ignored.
        """
        loader = self._loader_for(fullname, path)
        if loader is None:
            return None
        # Imported here so the block does not depend on a new file-level import.
        from importlib.util import spec_from_loader
        return spec_from_loader(fullname, loader)
def find_notebook(fullname, path=None):
    """Find a notebook, given its fully qualified name and an optional path.

    Only the last dotted component of ``fullname`` is used as the file name,
    so "foo.bar" is looked up as "bar.ipynb" inside each entry of ``path``.
    If that file does not exist, the same name with underscores replaced by
    spaces is tried ("Foo_Bar" -> "Foo Bar.ipynb").

    Args:
        fullname: dotted module name being imported.
        path: iterable of directories to search; when empty or None, a
            single empty entry is searched (a path relative to the current
            working directory).

    Returns:
        The path of the first matching notebook, or None if there is none.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']

    # The underscore fallback applies to the notebook name only; rewriting the
    # joined path would also mangle directories that contain underscores.
    spaced_name = name.replace("_", " ")
    for d in path:
        for candidate in (name, spaced_name):
            nb_path = os.path.join(d, candidate + ".ipynb")
            if os.path.isfile(nb_path):
                return nb_path
    return None
class NotebookLoader(object):
    """Module loader that runs a Jupyter notebook's code cells as a module."""

    def __init__(self, path=None):
        # Shared IPython shell, used to translate IPython syntax in cells.
        self.shell = InteractiveShell.instance()
        # Search path handed to find_notebook() when loading.
        self.path = path

    def load_module(self, fullname):
        """Import a notebook as a module.

        Args:
            fullname: dotted name of the module to create.

        Returns:
            The populated module, which is also registered in ``sys.modules``.

        Raises:
            ImportError: if no notebook matching ``fullname`` can be found.
            Exception: whatever a notebook cell raises; in that case the
                previous ``sys.modules`` entry (if any) is restored.
        """
        path = find_notebook(fullname, self.path)
        if not path:
            # Fail with an import error instead of a TypeError from io.open(None).
            raise ImportError(
                "no Jupyter notebook found for %r" % fullname, name=fullname
            )

        print("importing Jupyter notebook from %s" % path)

        # Read the notebook as nbformat version 4.
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython

        # Register before executing so code in the notebook can import itself.
        previous = sys.modules.get(fullname)
        sys.modules[fullname] = mod

        # Point the shell's user namespace at the module while cells run, so
        # shell-driven code sees the module's globals.
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__
        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # Convert IPython-specific syntax into plain Python source.
                    code = self.shell.input_transformer_manager.transform_cell(
                        cell.source
                    )
                    # NOTE(security): notebook code runs with full privileges;
                    # only import notebooks from trusted locations.
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module behind on failure.
            if previous is None:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod
# Register the notebook finder at the end of the import machinery's finder
# list, so it is consulted only after the finders already present.
sys.meta_path.append(NotebookFinder())