# python - 根据概率对数据框行进行分类

``````user_id    city_id
0           a
1           a
2           b
3           a
4           c
.. and so on
``````

`````` city_id     district_id    probability
a             a1           0.01
a             a2           0.02
a             a3           0.02
a             a4           0.56
a             a5           0.39
b             b1           0.63
b             b2           0.07
b             b3           0.30
and so on..
``````

Anajlim

81
Jim Eisenberg 2020-02-01 00:54

``````import numpy as np
def get_district(city, districts=None):
    """Draw one district for *city*, weighted by the listed probabilities.

    Args:
        city: Value matched against the ``city_id`` column.
        districts: DataFrame with ``city_id``, ``district_id`` and
            ``probability`` columns. When omitted, the module-level ``df2``
            is used, so the function can still be handed directly to
            ``Series.apply``.

    Returns:
        A single ``district_id`` value picked by ``np.random.choice``.

    Raises:
        ValueError: If no row matches *city*, or if NumPy rejects the
            probabilities (for example because they do not sum to 1).
    """
    if districts is None:
        districts = df2
    # Filter once and reuse the rows for both the labels and the weights.
    rows = districts.loc[districts['city_id'] == city]
    if rows.empty:
        raise ValueError("no districts listed for city {!r}".format(city))
    return np.random.choice(rows['district_id'].tolist(),
                            p=rows['probability'].tolist())
``````

``````df['district_id'] = df.city_id.apply(get_district)  # one get_district call (one random draw) per row
``````

``````def get_city_district(city,df1,df2):
l = len(df1[df1.city_id==city])
d = df2[df2['city_id']==city]
ds, p = list(d['district_id']),list(d['probability'])
df1.loc[df1.city_id==city,'district_id'] = np.random.choice(ds, size=l,p=p)
return df1

def f(df1, df2):
    """Assign a randomly drawn ``district_id`` to every row of *df1*.

    The ``district_id`` column is first reset to ``None``; then each distinct
    city in ``df1['city_id']`` is filled by ``get_city_district``, one call
    per city rather than one per row.

    Args:
        df1: DataFrame with a ``city_id`` column; modified in place.
        df2: DataFrame with ``city_id``, ``district_id`` and ``probability``
            columns.

    Returns:
        *df1* with the ``district_id`` column populated. Rows whose
        ``city_id`` is missing keep ``None``.
    """
    df1['district_id'] = None
    # dropna(): a NaN city never equals itself, so it matches no rows and
    # would only make the per-city helper fail.
    for city in df1['city_id'].dropna().unique():
        df1 = get_city_district(city, df1, df2)

    return df1
``````