# doFeats_1.py
import gc
import random

import numpy as np
import pandas as pd
import scipy.special as special

rawpath = 'C:\\final\\'
temppath = 'C:\\final\\temp\\'
iapath = 'C:\\final\\temp\\installedactions\\'
def logloss(act, preds):
    '''binary log loss; act and preds are array-likes of labels / predicted probabilities'''
    epsilon = 1e-15
    preds = np.maximum(epsilon, preds)
    preds = np.minimum(1 - epsilon, preds)
    ll = np.sum(act * np.log(preds) + (1 - act) * np.log(1 - preds))
    ll = ll * -1.0 / len(act)
    return ll
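# Worked example: logloss(np.array([1, 0]), np.array([0.9, 0.2]))
#   = -(log(0.9) + log(0.8)) / 2 ≈ 0.1643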
class HyperParam(object):  # Beta smoothing, the faster variant; usage: hyper = HyperParam(1, 1); hyper.update_from_data_by_moment(show, click)
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def sample_from_beta(self, alpha, beta, num, imp_upperbound):
        sample = np.random.beta(alpha, beta, num)
        I = []
        C = []
        for click_ratio in sample:
            imp = random.random() * imp_upperbound
            # imp = imp_upperbound
            click = imp * click_ratio
            I.append(imp)
            C.append(click)
        return I, C

    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        '''estimate alpha, beta using fixed point iteration'''
        for i in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, tries, success, alpha, beta):
        '''one step of the fixed point iteration'''
        numerator_alpha = 0.0
        numerator_beta = 0.0
        denominator = 0.0
        for i in range(len(tries)):
            numerator_alpha += (special.digamma(success[i] + alpha) - special.digamma(alpha))
            numerator_beta += (special.digamma(tries[i] - success[i] + beta) - special.digamma(beta))
            denominator += (special.digamma(tries[i] + alpha + beta) - special.digamma(alpha + beta))
        return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)
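    # The step above is the standard fixed-point update for maximizing the
    # Beta-Binomial likelihood (per row: success_i ~ BetaBinomial(tries_i, alpha, beta)):
    #   alpha <- alpha * sum_i[psi(success_i + alpha) - psi(alpha)] / sum_i[psi(tries_i + alpha + beta) - psi(alpha + beta)]
    #   beta  <- beta  * sum_i[psi(tries_i - success_i + beta) - psi(beta)] / sum_i[psi(tries_i + alpha + beta) - psi(alpha + beta)]
    # where psi is the digamma function (special.digamma).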
    def update_from_data_by_moment(self, tries, success):
        '''estimate alpha, beta using moment estimation'''
        mean, var = self.__compute_moment(tries, success)
        # alpha = mean * (mean * (1 - mean) / var - 1), beta likewise; the small
        # constants guard against zero variance and degenerate means
        self.alpha = (mean + 0.000001) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)

    def __compute_moment(self, tries, success):
        '''sample mean and variance of the per-row success ratios'''
        ctr_list = []
        var = 0.0
        for i in range(len(tries)):
            ctr_list.append(float(success[i]) / tries[i])
        mean = sum(ctr_list) / len(ctr_list)
        for ctr in ctr_list:
            var += pow(ctr - mean, 2)
        return mean, var / (len(ctr_list) - 1)
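# Minimal usage sketch, as hinted in the class comment above (`show`/`click`
# stand for impression/conversion count arrays and are not defined here):
# hyper = HyperParam(1, 1)
# hyper.update_from_data_by_moment(show, click)
# smoothed = (click + hyper.alpha) / (show + hyper.alpha + hyper.beta)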
class BayesianSmoothing(object):  # Bayesian smoothing, the slower variant
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def sample(self, alpha, beta, num, imp_upperbound):
        sample = np.random.beta(alpha, beta, num)
        I = []
        C = []
        for clk_rt in sample:
            imp = random.random() * imp_upperbound
            # imp = imp_upperbound
            clk = imp * clk_rt
            I.append(imp)
            C.append(clk)
        return I, C

    def update(self, imps, clks, iter_num, epsilon):
        for i in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta
        print(self.alpha, self.beta)

    def __fixed_point_iteration(self, imps, clks, alpha, beta):
        numerator_alpha = 0.0
        numerator_beta = 0.0
        denominator = 0.0
        for i in range(len(imps)):
            numerator_alpha += (special.digamma(clks[i] + alpha) - special.digamma(alpha))
            numerator_beta += (special.digamma(imps[i] - clks[i] + beta) - special.digamma(beta))
            denominator += (special.digamma(imps[i] + alpha + beta) - special.digamma(alpha + beta))
        return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)
def readData(m_type='inner', drop=True):  # when using the Trick, a left merge (which keeps row order) is about 1e-4 worse than an inner merge?
    X_train = pd.read_csv(rawpath + 'train.csv')
    X_test = pd.read_csv(rawpath + 'test.csv')
    if drop:
        X_train.drop('conversionTime', axis=1, inplace=True)
    userfile = pd.read_csv(rawpath + 'user.csv')
    X_train = X_train.merge(userfile, how=m_type, on='userID')
    X_test = X_test.merge(userfile, how=m_type, on='userID')
    del userfile
    gc.collect()
    adfile = pd.read_csv(rawpath + 'ad.csv')
    X_train = X_train.merge(adfile, how=m_type, on='creativeID')
    X_test = X_test.merge(adfile, how=m_type, on='creativeID')
    del adfile
    gc.collect()
    appcatfile = pd.read_csv(rawpath + 'app_categories.csv')
    X_train = X_train.merge(appcatfile, how=m_type, on='appID')
    X_test = X_test.merge(appcatfile, how=m_type, on='appID')
    del appcatfile
    gc.collect()
    positionfile = pd.read_csv(rawpath + 'position.csv')
    X_train = X_train.merge(positionfile, how=m_type, on='positionID')
    X_test = X_test.merge(positionfile, how=m_type, on='positionID')
    del positionfile
    gc.collect()
    print('merge type:', m_type)
    return X_train, X_test
def doPre(data):
    data['day'] = data['clickTime'] // 1000000
    data['hour'] = data['clickTime'] % 1000000 // 10000
    return data
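# Judging from the arithmetic above, clickTime appears to be encoded as DDHHMMSS.
# Worked example: clickTime = 28091530 gives
#   day  = 28091530 // 1000000          = 28
#   hour = 28091530 % 1000000 // 10000  = 9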
########################################################## join the installed file with the user and app files and extract features
userfile = pd.read_csv(rawpath + 'user.csv')
appfile = pd.read_csv(rawpath + 'app_categories.csv')
installed = pd.read_csv(rawpath + 'user_installedapps.csv')
installed = installed.merge(userfile, how='left', on='userID')
installed = installed.merge(appfile, how='left', on='appID')
# number of users who installed each app / app category
temp = installed.groupby('appID')['userID'].count().reset_index()
temp.columns = ['appID', 'app_usercount']
temp.to_csv(iapath + 'appInstalledusercount.csv', index=False)
temp = installed.groupby('appCategory')['userID'].count().reset_index()
temp.columns = ['appCategory', 'appCat_usercount']
temp.to_csv(iapath + 'appCatInstalledusercount.csv', index=False)
# number of installed apps per user / education / age / gender
temp = installed.groupby('userID')['appID'].count().reset_index()
temp.columns = ['userID', 'user_appcount']
temp.to_csv(iapath + 'userInstalledappscount.csv', index=False)
temp = installed.groupby('education')['appID'].count().reset_index()
temp.columns = ['education', 'edu_appcount']
temp.to_csv(iapath + 'eduuserInstalledappscount.csv', index=False)
temp = installed.groupby('age')['appID'].count().reset_index()
temp.columns = ['age', 'age_appcount']
temp.to_csv(iapath + 'ageuserInstalledappscount.csv', index=False)
temp = installed.groupby('gender')['appID'].count().reset_index()
temp.columns = ['gender', 'gender_appcount']
temp.to_csv(iapath + 'genderuserInstalledappscount.csv', index=False)
print('installed over...')
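# Assumed downstream use (happens in a later script, not here): each count file
# merges back onto train/test by its key column, e.g.
# user_cnt = pd.read_csv(iapath + 'userInstalledappscount.csv')
# X_loc_train = X_loc_train.merge(user_cnt, how='left', on='userID')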
########################################################## extract features from the actions file with a 7-day sliding window: apps installed per user, users who installed each app
actions = pd.read_csv(rawpath + 'user_app_actions.csv')
actions['day'] = actions['installTime'] // 1000000
res = pd.DataFrame()
temp = actions[['userID', 'day', 'appID']]
for day in range(28, 32):
    count = temp.groupby(['userID']).apply(lambda x: x['appID'][(x['day'] < day).values & (x['day'] > day - 8).values].count()).reset_index(name='appcount')
    count['day'] = day
    res = pd.concat([res, count], ignore_index=True)
res.to_csv(iapath + 'all_user_seven_day_cnt.csv', index=False)
res = pd.DataFrame()
temp = actions[['userID', 'day', 'appID']]
for day in range(28, 32):
    count = temp.groupby(['appID']).apply(lambda x: x['userID'][(x['day'] < day).values & (x['day'] > day - 8).values].count()).reset_index(name='usercount')
    count['day'] = day
    res = pd.concat([res, count], ignore_index=True)
res.to_csv(iapath + 'all_app_seven_day_cnt.csv', index=False)
print('actions over...')
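# Assumed downstream use: the sliding-window counts join on the composite key
# ['userID', 'day'] (or ['appID', 'day'] for the second file), e.g.
# user_week = pd.read_csv(iapath + 'all_user_seven_day_cnt.csv')
# X_loc_train = X_loc_train.merge(user_week, how='left', on=['userID', 'day'])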
X_loc_train, X_loc_test = readData(m_type='inner', drop=True)
print('readData over')
X_loc_train = doPre(X_loc_train)
X_loc_test = doPre(X_loc_test)
print('doPre over...')
########################################################## count features: clicks and conversions per key; conversion rate = conversions / clicks
########################################################## the preliminary round computed these with a 7-day sliding window; for the final, following the weekly champion's sharing, they use all days before clickTime
for feat_1 in ['creativeID', 'positionID', 'userID', 'sitesetID']:
    gc.collect()
    res = pd.DataFrame()
    temp = X_loc_train[[feat_1, 'day', 'label']]
    for day in range(28, 32):
        count = temp.groupby([feat_1]).apply(lambda x: x['label'][(x['day'] < day).values].count()).reset_index(name=feat_1 + '_all')
        count1 = temp.groupby([feat_1]).apply(lambda x: x['label'][(x['day'] < day).values].sum()).reset_index(name=feat_1 + '_1')
        count[feat_1 + '_1'] = count1[feat_1 + '_1']
        count.fillna(value=0, inplace=True)
        count['day'] = day
        res = pd.concat([res, count], ignore_index=True)
    print(feat_1, ' over')
    res.to_csv(temppath + '%s.csv' % feat_1, index=False)
for feat_1, feat_2 in [('positionID', 'advertiserID'), ('userID', 'sitesetID'), ('positionID', 'connectionType'), ('userID', 'positionID'),
                       ('appPlatform', 'positionType'), ('advertiserID', 'connectionType'), ('positionID', 'appCategory'), ('appID', 'age'),
                       ('userID', 'appID'), ('userID', 'connectionType'), ('appCategory', 'connectionType'), ('appID', 'hour'), ('hour', 'age')]:
    gc.collect()
    res = pd.DataFrame()
    temp = X_loc_train[[feat_1, feat_2, 'day', 'label']]
    for day in range(28, 32):
        count = temp.groupby([feat_1, feat_2]).apply(lambda x: x['label'][(x['day'] < day).values].count()).reset_index(name=feat_1 + '_' + feat_2 + '_all')
        count1 = temp.groupby([feat_1, feat_2]).apply(lambda x: x['label'][(x['day'] < day).values].sum()).reset_index(name=feat_1 + '_' + feat_2 + '_1')
        count[feat_1 + '_' + feat_2 + '_1'] = count1[feat_1 + '_' + feat_2 + '_1']
        count.fillna(value=0, inplace=True)
        count['day'] = day
        res = pd.concat([res, count], ignore_index=True)
    print(feat_1, feat_2, ' over')
    res.to_csv(temppath + '%s.csv' % (feat_1 + '_' + feat_2), index=False)
for feat_1, feat_2, feat_3 in [('appID', 'connectionType', 'positionID'), ('appID', 'haveBaby', 'gender')]:
    gc.collect()
    res = pd.DataFrame()
    temp = X_loc_train[[feat_1, feat_2, feat_3, 'day', 'label']]
    for day in range(28, 32):
        count = temp.groupby([feat_1, feat_2, feat_3]).apply(lambda x: x['label'][(x['day'] < day).values].count()).reset_index(name=feat_1 + '_' + feat_2 + '_' + feat_3 + '_all')
        count1 = temp.groupby([feat_1, feat_2, feat_3]).apply(lambda x: x['label'][(x['day'] < day).values].sum()).reset_index(name=feat_1 + '_' + feat_2 + '_' + feat_3 + '_1')
        count[feat_1 + '_' + feat_2 + '_' + feat_3 + '_1'] = count1[feat_1 + '_' + feat_2 + '_' + feat_3 + '_1']
        count.fillna(value=0, inplace=True)
        count['day'] = day
        res = pd.concat([res, count], ignore_index=True)
    print(feat_1, feat_2, feat_3, ' over')
    res.to_csv(temppath + '%s.csv' % (feat_1 + '_' + feat_2 + '_' + feat_3), index=False)
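# A minimal sketch of how these count files would feed the smoothing classes
# above (assumed downstream use; the '_rate' column name is illustrative):
# stats = pd.read_csv(temppath + 'positionID.csv')
# stats = stats[stats['positionID_all'] > 0]  # the moment update divides by the try count
# hyper = HyperParam(1, 1)
# hyper.update_from_data_by_moment(stats['positionID_all'].values, stats['positionID_1'].values)
# stats['positionID_rate'] = (stats['positionID_1'] + hyper.alpha) / (stats['positionID_all'] + hyper.alpha + hyper.beta)
# X_loc_train = X_loc_train.merge(stats, how='left', on=['positionID', 'day'])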
########################################################## shared by a top player in the official competition group: use each app's mean click-to-conversion lag as a feature; apps with no conversions fall back to the app category's mean lag
X_loc_train, X_loc_test = readData(m_type='inner', drop=False)
del X_loc_test
X_loc_train = X_loc_train.loc[X_loc_train['label'] == 1, :]
X_loc_train['cov_diffTime'] = X_loc_train['conversionTime'] - X_loc_train['clickTime']
grouped = X_loc_train.groupby('appID')['cov_diffTime'].mean().reset_index()
grouped.columns = ['appID', 'cov_diffTime']
grouped.to_csv(temppath + 'app_cov_diffTime.csv', index=False)
grouped = X_loc_train.groupby('appCategory')['cov_diffTime'].mean().reset_index()
grouped.columns = ['appCategory', 'appCat_cov_diffTime']
grouped.to_csv(temppath + 'appCat_cov_diffTime.csv', index=False)
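# The category fallback described in the banner above would happen where the two
# files merge back; a minimal sketch (assumed downstream use, `df` illustrative):
# df = df.merge(pd.read_csv(temppath + 'app_cov_diffTime.csv'), how='left', on='appID')
# df = df.merge(pd.read_csv(temppath + 'appCat_cov_diffTime.csv'), how='left', on='appCategory')
# df['cov_diffTime'] = df['cov_diffTime'].fillna(df['appCat_cov_diffTime'])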