2012美国大选献金项目(最详细解释)

1. 数据载入和总览

import numpy as np

import pandas as pd

from pandas import Series,DataFrame

1.1 数据载入

df1 = pd.read_csv('./usa_elect/contb_01.csv')

df2 = pd.read_csv('./usa_elect/contb_02.csv')

df3 = pd.read_csv('./usa_elect/contb_03.csv')
df1.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt
0 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 250.0 20-JUN-11
1 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 50.0 23-JUN-11
2 Bachmann, Michelle SMITH, LANIER AL INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11
3 Bachmann, Michelle BLEVINS, DARONDA AR NONE RETIRED 250.0 01-AUG-11
4 Bachmann, Michelle WARDENBURG, HAROLD AR NONE RETIRED 300.0 20-JUN-11
df3.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt
0 Perry, Rick NORRIS, BOBBY TX SELF REALTOR/ RANCHER 1000.0 14-DEC-11
1 Perry, Rick HUNT, CAROLINE TX HOMEMAKER HOMEMAKER 2500.0 14-DEC-11
2 Perry, Rick ROBINOWITZ, DALE TX SELF-EMPLOYED DENTIST 2500.0 14-DEC-11
3 Perry, Rick ZUNIGA, DAVID & SYLVIA TX DAVID M. ZUNIGA PE OWNER 250.0 14-DEC-11
4 Perry, Rick FOSTER, GLENNA TX SELF INVESTOR/ CONSULTANT 1500.0 14-DEC-11

1.2 数据合并

contb = pd.concat([df1,df2,df3],axis = 0,ignore_index=True)
contb.shape
(1001733, 7)

1.3 数据预览和基本统计分析

contb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001733 entries, 0 to 1001732
Data columns (total 7 columns):
cand_nm              1001733 non-null object
contbr_nm            1001733 non-null object
contbr_st            1001729 non-null object
contbr_employer      988004 non-null object
contbr_occupation    993303 non-null object
contb_receipt_amt    1001733 non-null float64
contb_receipt_dt     1001733 non-null object
dtypes: float64(1), object(6)
memory usage: 53.5+ MB
contb.fillna(value='NOT PROVIDE',inplace=True)
contb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001733 entries, 0 to 1001732
Data columns (total 7 columns):
cand_nm              1001733 non-null object
contbr_nm            1001733 non-null object
contbr_st            1001733 non-null object
contbr_employer      1001733 non-null object
contbr_occupation    1001733 non-null object
contb_receipt_amt    1001733 non-null float64
contb_receipt_dt     1001733 non-null object
dtypes: float64(1), object(6)
memory usage: 53.5+ MB
# 献金,给总统钱
contb.describe()
contb_receipt_amt
count 1.001733e+06
mean 2.982358e+02
std 3.749663e+03
min -3.080000e+04
25% 3.500000e+01
50% 1.000000e+02
75% 2.500000e+02
max 2.014491e+06

2. 数据清洗

2.1 缺失值处理

填充:NOT PROVIDE

# 职位,公司,州
# 填充了NOT PROVIDE 无可奉告
# 删除,也是一种选择

2.2 数据转换

利用字典映射进行转换:党派分析

#通过搜索引擎等途径,获取到每个总统候选人的所属党派,建立字典parties,候选人名字作为键,所属党派作为对应的值
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}
contb.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt
0 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 250.0 20-JUN-11
1 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 50.0 23-JUN-11
2 Bachmann, Michelle SMITH, LANIER AL INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11
3 Bachmann, Michelle BLEVINS, DARONDA AR NONE RETIRED 250.0 01-AUG-11
4 Bachmann, Michelle WARDENBURG, HAROLD AR NONE RETIRED 300.0 20-JUN-11

map方法添加party属性

# replace,map,transform,apply都可以实现
contb['party'] = contb['cand_nm'].map(parties)
contb.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party
0 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 250.0 20-JUN-11 Republican
1 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 50.0 23-JUN-11 Republican
2 Bachmann, Michelle SMITH, LANIER AL INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 Republican
3 Bachmann, Michelle BLEVINS, DARONDA AR NONE RETIRED 250.0 01-AUG-11 Republican
4 Bachmann, Michelle WARDENBURG, HAROLD AR NONE RETIRED 300.0 20-JUN-11 Republican

value_counts统计各党派总捐频次

contb['party'].value_counts()
Democrat      593747
Republican    407986
Name: party, dtype: int64
contb.groupby(by = 'party')['contb_receipt_amt'].sum()
party
Democrat      1.335028e+08
Republican    1.652498e+08
Name: contb_receipt_amt, dtype: float64

排序:按照职业汇总对赞助总金额进行排序

occupation_ = contb.groupby(by = 'contbr_occupation')['contb_receipt_amt'].sum()
occupation_
contbr_occupation
   MIXED-MEDIA ARTIST / STORYTELLER     100.0
 AREA VICE PRESIDENT                    250.0
 RESEARCH ASSOCIATE                     100.0
 TEACHER                                500.0
 THERAPIST                             3900.0
                                        ...  
ZOOKEEPER                                35.0
ZOOLOGIST                               400.0
ZOOLOGY EDUCATION                        25.0
\NONE\                                  250.0
~                                        75.0
Name: contb_receipt_amt, Length: 45074, dtype: float64
occupation_.sort_values(ascending=False)[:50]
contbr_occupation
RETIRED                                   48176647.00
ATTORNEY                                  18470473.30
HOMEMAKER                                 17484807.65
INFORMATION REQUESTED PER BEST EFFORTS    15859514.55
INFORMATION REQUESTED                      8742357.59
PHYSICIAN                                  7224044.40
PRESIDENT                                  6347843.59
EXECUTIVE                                  5273717.90
CONSULTANT                                 4932627.98
NOT PROVIDE                                4224760.39
CEO                                        3570942.20
LAWYER                                     3537982.19
OWNER                                      3278488.16
INVESTOR                                   3204481.92
ENGINEER                                   2730527.43
PROFESSOR                                  2458033.81
C.E.O.                                     2433218.11
SELF-EMPLOYED                              2259150.94
MANAGER                                    2167571.47
REAL ESTATE                                2110499.34
SALES                                      1814901.82
NOT EMPLOYED                               1752927.93
BUSINESS OWNER                             1736511.73
TEACHER                                    1709754.05
CHAIRMAN                                   1691595.37
STUDENT                                    1679435.28
FINANCE                                    1664021.31
BANKER                                     1462903.13
WRITER                                     1303267.29
PARTNER                                    1177656.55
VICE PRESIDENT                             1177600.41
INVESTMENTS                                1120947.94
MANAGING DIRECTOR                           969128.57
INVESTMENT BANKER                           961297.90
ARTIST                                      959387.57
DIRECTOR                                    855121.89
SOFTWARE ENGINEER                           836558.02
ACCOUNTANT                                  828374.46
FARMER                                      747042.89
DENTIST                                     741347.59
FINANCIAL ADVISOR                           660786.63
ARCHITECT                                   656140.41
PRESIDENT & C.E.O.                          642686.28
CPA                                         606266.64
INVESTMENT MANAGER                          601899.60
INVESTMENT MANAGEMENT                       583189.02
REAL ESTATE DEVELOPER                       578934.25
PRINCIPAL                                   569543.44
REQUESTED                                   567872.46
PRIVATE EQUITY                              555939.00
Name: contb_receipt_amt, dtype: float64

按照职位进行汇总,计算赞助总金额,展示前50项,发现不少职业是相同的,只不过是表达不一样而已,如C.E.O.与CEO,都是一个职业

# 职位重合,替换合并
occupation = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDE',
             'INFORMATION REQUESTED':'NOT PROVIDE',
              'C.E.O.':'CEO',
              'LAWYER':'ATTORNEY',
              'PRESIDENT & C.E.O.':'CEO'
             }
print(d.get('李四'))
None
def convert(x):
    if x == '张三':
        return 'zhang sir'

#   map,遍历所有, 遍历所有
df1['name'] = df1['name'].transform(convert)
df1
name
0 zhang sir
1 None
2 None
3 None
# map 原理
df1 = DataFrame({'name':['张三','李四','王五','Jack']})
d = {'张三':'zhang sir'}
# df1['name'] = df1['name'].apply(d)
df1
name
0 张三
1 李四
2 王五
3 Jack
# 原数据的职位这一列,替换,一一找到,相同的职位划归到一个
# map:将里面的所有的数据进行替换;
# replace:替换其中一部分
contb['contbr_occupation'] = contb['contbr_occupation'].replace(occupation)
contb
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party
0 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 250.0 20-JUN-11 Republican
1 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 50.0 23-JUN-11 Republican
2 Bachmann, Michelle SMITH, LANIER AL INFORMATION REQUESTED NOT PROVIDE 250.0 05-JUL-11 Republican
3 Bachmann, Michelle BLEVINS, DARONDA AR NONE RETIRED 250.0 01-AUG-11 Republican
4 Bachmann, Michelle WARDENBURG, HAROLD AR NONE RETIRED 300.0 20-JUN-11 Republican
... ... ... ... ... ... ... ... ...
1001728 Perry, Rick GORMAN, CHRIS D. MR. XX INFORMATION REQUESTED PER BEST EFFORTS NOT PROVIDE 5000.0 29-SEP-11 Republican
1001729 Perry, Rick DUFFY, DAVID A. MR. XX DUFFY EQUIPMENT COMPANY INC. BUSINESS OWNER 2500.0 30-SEP-11 Republican
1001730 Perry, Rick GRANE, BRYAN F. MR. XX INFORMATION REQUESTED PER BEST EFFORTS NOT PROVIDE 500.0 29-SEP-11 Republican
1001731 Perry, Rick TOLBERT, DARYL MR. XX T.A.C.C. LONGWALL MAINTENANCE FOREMAN 500.0 30-SEP-11 Republican
1001732 Perry, Rick ANDERSON, MARILEE MRS. XX INFORMATION REQUESTED PER BEST EFFORTS NOT PROVIDE 2500.0 31-AUG-11 Republican

1001733 rows × 8 columns

(occupation_.rename(occupation).index == 'C.E.O.').sum()
0

利用函数进行数据转换:职业与雇主信息分析

建立一个职业对应字典,把相同职业的不同表达映射为对应的职业,比如把C.E.O.映射为CEO



2.3 数据筛选

赞助金额筛选,赞助金额大于0

cond = contb['contb_receipt_amt'] <= 0

contb[cond]
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party
41 Bachmann, Michelle MINNIS, RITA CA NOT PROVIDE NOT PROVIDE -1500.0 20-JUN-11 Republican
264 Bachmann, Michelle BISHOP, GERARD NY NOT PROVIDE NOT PROVIDE -1700.0 28-JUN-11 Republican
752 Romney, Mitt KNIGHT, RENA AL NOT PROVIDE NOT PROVIDE -60.0 07-MAR-12 Republican
885 Romney, Mitt HENDERSON, RONALD E. DR. AL RETIRED RETIRED -150.0 06-FEB-12 Republican
1033 Romney, Mitt SELLERS, LEE AL NOT PROVIDE NOT PROVIDE -120.0 20-JAN-12 Republican
... ... ... ... ... ... ... ... ...
1001705 Perry, Rick NELSON, HOWARD L. MR. WV SWANSON INDUSTRIES C.F.O. -2500.0 01-OCT-11 Republican
1001706 Perry, Rick PHILLIPS, JIMMY D. MR. WV PHILLIPS MACHINE SERVICES EXECUTIVE -2500.0 28-NOV-11 Republican
1001709 Perry, Rick BEVILOCK, JOHN C. MR. WV DAVID STANLEY COMPANY SALES -2500.0 08-OCT-11 Republican
1001711 Perry, Rick HAYHURST, JAMES W. MR. WV DAVID STANLEY COMPANY SALES -2500.0 09-OCT-11 Republican
1001712 Perry, Rick PHILLIPS, JIMMY WV NOT PROVIDE NOT PROVIDE -2500.0 31-MAR-12 Republican

10256 rows × 8 columns

cond = contb['contb_receipt_amt'] > 0 

contb = contb[cond]
# 将异常值,进行了过滤,定义异常值:捐款为负数
contb.shape
(991477, 8)

查看各候选人获得的赞助总金额

# 共和党的罗姆尼,民主党的奥巴马双雄
cand_nm_sum = contb.groupby(by = 'cand_nm')['contb_receipt_amt'].sum().sort_values(ascending = False)

饼图绘制各候选人献金情况

cand_nm_sum.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a993e9ef28>
# 饼图,百分比形式,显示数据
cand_nm_sum[:5].plot(kind = 'pie')
<matplotlib.axes._subplots.AxesSubplot at 0x2a994216ac8>

在这里插入图片描述

筛选选取候选人为Obama、Romney的子集数据,多种方式

contb.shape
(991477, 8)
contb['cand_nm'].unique()
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)
# 奥巴马和罗姆尼的子集,只有他俩
convtb_vs = contb.query("cand_nm == 'Obama, Barack' or cand_nm == 'Romney, Mitt'")
convtb_vs.shape
(694283, 8)
cond1 = contb['cand_nm'] == 'Romney, Mitt'

cond2 = contb['cand_nm'] == 'Obama, Barack'

cond = cond1|cond2

contb_vs2 = contb[cond]
contb_vs2.shape
(694283, 8)

2.4 面元化数据

分箱操作

接下来我们对该数据做另一种非常实用的分析,利用cut函数根据出资额大小将数据离散化到多个面元中

扫描二维码关注公众号,回复: 11519850 查看本文章
contb_vs2.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party
411 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 Republican
412 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 Republican
413 Romney, Mitt CARLSEN, RICHARD AE DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 Republican
414 Romney, Mitt DELUCA, PIERRE AE CISCO ENGINEER 30.0 21-AUG-11 Republican
415 Romney, Mitt SARGENT, MICHAEL AE RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 Republican
convtb_vs.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 694283 entries, 411 to 701386
Data columns (total 8 columns):
cand_nm              694283 non-null object
contbr_nm            694283 non-null object
contbr_st            694283 non-null object
contbr_employer      694283 non-null object
contbr_occupation    694283 non-null object
contb_receipt_amt    694283 non-null float64
contb_receipt_dt     694283 non-null object
party                694283 non-null object
dtypes: float64(1), object(7)
memory usage: 47.7+ MB
labels = [0,10,100,1000,10000,100000,1000000,10000000]
convtb_vs['contb_receipt_amt'].sort_values()
325151          0.01
319758          0.01
323688          0.01
323661          0.01
319831          0.01
             ...    
319478     526246.17
344419    1511192.17
344539    1679114.65
326651    1944042.43
325136    2014490.51
Name: contb_receipt_amt, Length: 694283, dtype: float64
contb_vs2['contb_receipt_amt']
411        25.0
412       110.0
413       250.0
414        30.0
415       100.0
          ...  
701382     25.0
701383    250.0
701384      3.0
701385     25.0
701386    135.0
Name: contb_receipt_amt, Length: 694283, dtype: float64
contb_vs = contb_vs2.copy()
contb_vs['labels'] = pd.cut(contb_vs2['contb_receipt_amt'],bins=labels)
contb_vs2.loc[:,'label'] = pd.cut(contb_vs2['contb_receipt_amt'],bins=labels)
d:\python3.6.8\lib\site-packages\pandas\core\indexing.py:376: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
d:\python3.6.8\lib\site-packages\pandas\core\indexing.py:494: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
contb_vs2.drop('label',axis= 1,inplace=True)
d:\python3.6.8\lib\site-packages\pandas\core\frame.py:4117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
contb_vs2.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party label
411 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 Republican (10, 100]
412 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 Republican (100, 1000]
413 Romney, Mitt CARLSEN, RICHARD AE DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 Republican (100, 1000]
414 Romney, Mitt DELUCA, PIERRE AE CISCO ENGINEER 30.0 21-AUG-11 Republican (10, 100]
415 Romney, Mitt SARGENT, MICHAEL AE RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 Republican (10, 100]
contb_vs.drop('labels',axis = 1,inplace=True)

3. 数据聚合与分组运算

3.1 透视表(pivot_table)分析党派和职业

# excel 中透视表功能
# sql 中处理数据时,也会用到透视表
# pandas提供方法了,显示透视表

按照党派、职业对赞助金额进行汇总,类似excel中的透视表操作,聚合函数为sum

cont
# 透视表和分组聚合一模一样
pt = pd.pivot_table(contb_vs,values=['contb_receipt_amt'],
               index = ['contbr_occupation'],columns=['party'],aggfunc='sum',
                    fill_value=0)
pt.isnull().sum()
                   party     
contb_receipt_amt  Democrat       4435
                   Republican    26779
dtype: int64
pt
contb_receipt_amt
party Democrat Republican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.0 0.0
AREA VICE PRESIDENT 250.0 0.0
RESEARCH ASSOCIATE 100.0 0.0
TEACHER 500.0 0.0
THERAPIST 3900.0 0.0
... ... ...
ZEPPOS AND ASSOCIATES 1000.0 0.0
ZONE MANAGER 135.0 0.0
ZOOKEEPER 35.0 0.0
ZOOLOGIST 400.0 0.0
ZOOLOGY EDUCATION 25.0 0.0

33599 rows × 2 columns

contb_vs.groupby(['contbr_occupation','party'])['contb_receipt_amt'].sum().unstack()
party Democrat Republican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.0 NaN
AREA VICE PRESIDENT 250.0 NaN
RESEARCH ASSOCIATE 100.0 NaN
TEACHER 500.0 NaN
THERAPIST 3900.0 NaN
... ... ...
ZEPPOS AND ASSOCIATES 1000.0 NaN
ZONE MANAGER 135.0 NaN
ZOOKEEPER 35.0 NaN
ZOOLOGIST 400.0 NaN
ZOOLOGY EDUCATION 25.0 NaN

33599 rows × 2 columns

过滤掉赞助金额小于200W的数据

cond = pt.sum(axis = 1) > 2000000

pt_large = pt[cond]
pt_large
contb_receipt_amt
party Democrat Republican
contbr_occupation
ATTORNEY 14302461.84 5372424.02
CEO 2074974.79 2898458.31
CONSULTANT 2459912.71 1424894.01
EXECUTIVE 1355161.05 2300947.03
HOMEMAKER 4248875.80 8147446.22
INVESTOR 884133.00 1537595.12
NOT PROVIDE 13725187.32 11638509.84
PHYSICIAN 3735124.94 1368023.96
PRESIDENT 1878509.95 2491244.89
PROFESSOR 2165071.08 161362.12
RETIRED 25305316.38 11508473.59
index = pt_large.sum(axis = 1).sort_values().index

pt_large = pt_large.loc[index[::-1]]

绘制图形

pt_large.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a997df5ac8>

在这里插入图片描述

pt_large.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9972d58d0>

在这里插入图片描述

3.2 分组级运算和转换

根据职业与雇主信息分别分组运算捐献金额总数

由于职业和雇主的处理非常相似,我们定义函数get_top_amounts()对两个字段进行分析处理

occ_result = contb_vs.groupby(['contbr_occupation','cand_nm'])['contb_receipt_amt'].sum().unstack()

occ_result.fillna(0,inplace = True)
occ_result
cand_nm Obama, Barack Romney, Mitt
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.0 0.0
AREA VICE PRESIDENT 250.0 0.0
RESEARCH ASSOCIATE 100.0 0.0
TEACHER 500.0 0.0
THERAPIST 3900.0 0.0
... ... ...
ZEPPOS AND ASSOCIATES 1000.0 0.0
ZONE MANAGER 135.0 0.0
ZOOKEEPER 35.0 0.0
ZOOLOGIST 400.0 0.0
ZOOLOGY EDUCATION 25.0 0.0

33599 rows × 2 columns

occ_result['Obama, Barack'].sort_values(ascending = False)[:20]
contbr_occupation
RETIRED          25305316.38
ATTORNEY         14302461.84
NOT PROVIDE      13725187.32
HOMEMAKER         4248875.80
PHYSICIAN         3735124.94
CONSULTANT        2459912.71
PROFESSOR         2165071.08
CEO               2074974.79
PRESIDENT         1878509.95
NOT EMPLOYED      1709188.20
EXECUTIVE         1355161.05
TEACHER           1250969.15
WRITER            1084188.88
OWNER             1001567.36
ENGINEER           951525.55
INVESTOR           884133.00
ARTIST             763125.00
MANAGER            762883.22
SELF-EMPLOYED      672393.40
STUDENT            628099.75
Name: Obama, Barack, dtype: float64

从数据可以看出,Obama更受精英群体(律师、医生、咨询顾问)的欢迎,Romney则得到更多企业家或企业高管的支持

occ_result['Romney, Mitt'].sort_values(ascending = False)[:20]
contbr_occupation
NOT PROVIDE          11638509.84
RETIRED              11508473.59
HOMEMAKER             8147446.22
ATTORNEY              5372424.02
CEO                   2898458.31
PRESIDENT             2491244.89
EXECUTIVE             2300947.03
INVESTOR              1537595.12
CONSULTANT            1424894.01
PHYSICIAN             1368023.96
FINANCE               1143592.25
SELF-EMPLOYED         1092538.44
REAL ESTATE           1054801.00
BANKER                1009186.24
OWNER                  875186.24
CHAIRMAN               805131.75
INVESTMENT BANKER      661035.00
INVESTMENTS            634922.00
BUSINESS OWNER         617389.86
MANAGER                577924.94
Name: Romney, Mitt, dtype: float64

同样的,使用get_top_amounts()对雇主进行分析处理


contb_vs.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party label
411 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 Republican (10, 100]
412 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 Republican (100, 1000]
413 Romney, Mitt CARLSEN, RICHARD AE DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 Republican (100, 1000]
414 Romney, Mitt DELUCA, PIERRE AE CISCO ENGINEER 30.0 21-AUG-11 Republican (10, 100]
415 Romney, Mitt SARGENT, MICHAEL AE RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 Republican (10, 100]
compy_result = contb_vs.groupby(['contbr_employer','cand_nm'])['contb_receipt_amt'].sum().unstack()

compy_result.fillna(0,inplace = True)
compy_result['Obama, Barack'].sort_values(ascending = False)[:20].plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c64d0908>

在这里插入图片描述

compy_result['Romney, Mitt'].sort_values(ascending = False)[:20].plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c64115c0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QkgQlUxC-1574946989346)(output_97_1.png)]

Obama:微软、盛德国际律师事务所; Romney:瑞士瑞信银行、摩根斯坦利、高盛公司、巴克莱资本、H.I.G.资本


对赞助金额进行分组分析(matplotlib画图)

前面我们已经利用pd.cut()函数,根据出资额大小将数据离散化到多个面元中,接下来我们就要对每个离散化的面元进行分组分析

首先统计各出资区间的赞助笔数,这里用到unstack(),stack()函数是堆叠,unstack()函数就是不要堆叠,即把多层索引变为表格数据

接下来,我们再统计各区间的赞助金额

contb_vs.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party label
411 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 Republican (10, 100]
412 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 Republican (100, 1000]
413 Romney, Mitt CARLSEN, RICHARD AE DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 Republican (100, 1000]
414 Romney, Mitt DELUCA, PIERRE AE CISCO ENGINEER 30.0 21-AUG-11 Republican (10, 100]
415 Romney, Mitt SARGENT, MICHAEL AE RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 Republican (10, 100]
contb_count = contb_vs.groupby(['label','cand_nm'])['party'].count().unstack()

contb_count.fillna(0,inplace = True)
contb_count
cand_nm Obama, Barack Romney, Mitt
label
(0, 10] 40563.0 3758.0
(10, 100] 372280.0 31853.0
(100, 1000] 153992.0 43357.0
(1000, 10000] 22284.0 26186.0
(10000, 100000] 2.0 1.0
(100000, 1000000] 3.0 0.0
(1000000, 10000000] 4.0 0.0

在这里插入图片描述

contb_count.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c65c47b8>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BTwQPKtc-1574946989347)(output_107_1.png)]

contb_count.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c69846d8>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u5MloaGv-1574946989350)(output_108_1.png)]

contb_count
cand_nm Obama, Barack Romney, Mitt
label
(0, 10] 40563.0 3758.0
(10, 100] 372280.0 31853.0
(100, 1000] 153992.0 43357.0
(1000, 10000] 22284.0 26186.0
(10000, 100000] 2.0 1.0
(100000, 1000000] 3.0 0.0
(1000000, 10000000] 4.0 0.0
# sql 中,进行这样操作,难度很大
count = (contb_count.sum(axis = 1))
count
label
(0, 10]                 44321.0
(10, 100]              404133.0
(100, 1000]            197349.0
(1000, 10000]           48470.0
(10000, 100000]             3.0
(100000, 1000000]           3.0
(1000000, 10000000]         4.0
dtype: float64
for i,c in enumerate(count):
    contb_count.iloc[i] = contb_count.iloc[i]/c
contb_count.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6984cf8>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-A1tA8Oim-1574946989351)(output_112_1.png)]

可以看出,小额赞助方面,Obama获得的数量和金额比Romney多得多

4.时间处理

4.1 str转datetime

contb_vs.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt party label
411 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 Republican (10, 100]
412 Romney, Mitt ELDERBAUM, WILLIAM AA US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 Republican (100, 1000]
413 Romney, Mitt CARLSEN, RICHARD AE DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 Republican (100, 1000]
414 Romney, Mitt DELUCA, PIERRE AE CISCO ENGINEER 30.0 21-AUG-11 Republican (10, 100]
415 Romney, Mitt SARGENT, MICHAEL AE RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 Republican (10, 100]
contb_vs.dtypes
cand_nm                object
contbr_nm              object
contbr_st              object
contbr_employer        object
contbr_occupation      object
contb_receipt_amt     float64
contb_receipt_dt       object
party                  object
label                category
dtype: object
contb_vs['contb_receipt_dt'] = pd.to_datetime(contb_vs['contb_receipt_dt'])
contb_vs.dtypes
cand_nm                      object
contbr_nm                    object
contbr_st                    object
contbr_employer              object
contbr_occupation            object
contb_receipt_amt           float64
contb_receipt_dt     datetime64[ns]
party                        object
label                      category
dtype: object

4.2 以时间作为索引

contb_vs.set_index('contb_receipt_dt',inplace=True)
contb_vs.sort_index(inplace=True)

4.3重采样和频度转换

重采样(Resampling)指的是把时间序列的频度变为另一个频度的过程。把高频度的数据变为低频度叫做降采样(downsampling),resample会对数据进行分组,然后再调用聚合函数。这里我们把频率从每日转换为每月,属于高频转低频的降采样

contb_vs.resample('M')['contb_receipt_amt'].sum()
contb_receipt_amt
contb_receipt_dt
2011-04-30 10282228.92
2011-05-31 13426593.92
2011-06-30 20552077.13
2011-07-31 5932074.56
2011-08-31 7097173.03
2011-09-30 22826839.44
2011-10-31 9585231.97
2011-11-30 12200548.56
2011-12-31 24986600.28
2012-01-31 13536133.55
2012-02-29 26336337.17
2012-03-31 31926272.05
2012-04-30 25525424.19
ret  = contb_vs.groupby(['cand_nm']).resample("M")['contb_receipt_amt'].sum().unstack(level = 0)

ret.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c685e9b0>

在这里插入图片描述

ret = contb_vs.groupby(['cand_nm']).resample("M")['contb_receipt_amt'].sum().unstack(level = 0)
ret
cand_nm Obama, Barack Romney, Mitt
contb_receipt_dt
2011-04-30 8246635.79 2035593.13
2011-05-31 6570013.79 6856580.13
2011-06-30 12159624.88 8392452.25
2011-07-31 3557892.31 2374182.25
2011-08-31 4685825.79 2411347.24
2011-09-30 15304567.88 7522271.56
2011-10-31 5945068.04 3640163.93
2011-11-30 5194383.31 7006165.25
2011-12-31 13571341.82 11415258.46
2012-01-31 8167000.77 5369132.78
2012-02-29 15858199.16 10478138.01
2012-03-31 20806093.40 11120178.65
2012-04-30 15810980.30 9714443.89
ret.cumsum().plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6d44518>

在这里插入图片描述

ret.cumsum().plot(kind = 'line')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6ca8278>

在这里插入图片描述

5.各州支持率

分组聚合计算奥巴马、罗姆尼在各州支持率

# 分组聚合进行运算,求各州,对奥巴马和罗姆尼支持情况,计算捐献金额
st = contb_vs.groupby(['cand_nm','contbr_st'])['contb_receipt_amt'].sum().unstack(level = 0)
st.fillna(0,inplace = True)
# 68个州,18个州,可以删除
# 献金者,填表,填错了
st
cand_nm Obama, Barack Romney, Mitt
contbr_st
AA 56405.00 135.00
AB 2048.00 0.00
AE 42973.75 5680.00
AK 281840.15 86204.24
AL 543123.48 527303.51
... ... ...
WI 1130155.46 270316.32
WV 169154.47 126725.12
WY 194046.74 252595.84
XX 0.00 400250.00
ZZ 5963.00 0.00

68 rows × 2 columns

# 将美国有哪些州,将州缩写获取
abbr = pd.read_csv('./state-abbrevs.csv')
sx = abbr['abbreviation']
sx
0     AL
1     AK
2     AZ
3     AR
4     CA
5     CO
6     CT
7     DE
8     DC
9     FL
10    GA
11    HI
12    ID
13    IL
14    IN
15    IA
16    KS
17    KY
18    LA
19    ME
20    MT
21    NE
22    NV
23    NH
24    NJ
25    NM
26    NY
27    NC
28    ND
29    OH
30    OK
31    OR
32    MD
33    MA
34    MI
35    MN
36    MS
37    MO
38    PA
39    RI
40    SC
41    SD
42    TN
43    TX
44    UT
45    VT
46    VA
47    WA
48    WV
49    WI
50    WY
Name: abbreviation, dtype: object
index = st.index
s = Series(index)
# cond过滤条件
cond = s.isin(sx.to_numpy())
# 索引自然数,替换索引
cond.index = st.index
# 根据条件进行过滤
state_support = st[cond]
state_support
cand_nm Obama, Barack Romney, Mitt
contbr_st
AK 281840.15 86204.24
AL 543123.48 527303.51
AR 359247.28 105556.00
AZ 1506476.98 1888436.23
CA 23824984.24 11237636.60
CO 2132429.49 1506714.12
CT 2068291.26 3499475.45
DC 4373538.80 1025137.50
DE 336669.14 82712.00
FL 7318178.58 8338458.81
GA 2786399.49 1995725.59
HI 795212.64 111763.00
IA 584829.10 208537.49
ID 197538.06 787158.44
IL 16443895.84 3628571.53
IN 883691.81 542086.23
KS 448038.57 326633.87
KY 714954.32 666902.59
LA 548013.54 991236.60
MA 6649015.25 4710542.30
MD 4832663.93 1633690.40
ME 1167760.12 117151.84
MI 2570307.25 2448109.87
MN 1744387.14 293656.32
MO 1320780.71 1371332.90
MS 195197.17 330183.00
MT 300225.24 161629.00
NC 2357067.63 1273603.92
ND 58999.90 39392.00
NE 251311.97 178600.50
NH 616994.85 424839.11
NJ 3203257.93 3333320.20
NM 906162.36 168011.36
NV 710693.67 630049.49
NY 14652118.51 10184212.63
OH 1822728.83 1901560.66
OK 594342.52 839483.96
OR 1222942.39 526237.73
PA 3713426.98 1941519.43
RI 457352.22 187892.92
SC 630732.94 402742.86
SD 86530.76 166558.00
TN 1119315.02 1516918.04
TX 6570832.45 6221989.68
UT 519851.37 3717300.48
VA 4259977.19 3465765.85
VT 986510.59 55229.44
WA 4250933.16 1341521.56
WI 1130155.46 270316.32
WV 169154.47 126725.12
WY 194046.74 252595.84
state_support.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9cfc56518>

在这里插入图片描述

df = DataFrame(state_support.sum(axis = 1))
df_sum = df.join(df,lsuffix = '_l',rsuffix = '_r')

df_sum.columns = ['Obama, Barack','Romney, Mitt']
df_sum
Obama, Barack Romney, Mitt
contbr_st
AK 368044.39 368044.39
AL 1070426.99 1070426.99
AR 464803.28 464803.28
AZ 3394913.21 3394913.21
CA 35062620.84 35062620.84
CO 3639143.61 3639143.61
CT 5567766.71 5567766.71
DC 5398676.30 5398676.30
DE 419381.14 419381.14
FL 15656637.39 15656637.39
GA 4782125.08 4782125.08
HI 906975.64 906975.64
IA 793366.59 793366.59
ID 984696.50 984696.50
IL 20072467.37 20072467.37
IN 1425778.04 1425778.04
KS 774672.44 774672.44
KY 1381856.91 1381856.91
LA 1539250.14 1539250.14
MA 11359557.55 11359557.55
MD 6466354.33 6466354.33
ME 1284911.96 1284911.96
MI 5018417.12 5018417.12
MN 2038043.46 2038043.46
MO 2692113.61 2692113.61
MS 525380.17 525380.17
MT 461854.24 461854.24
NC 3630671.55 3630671.55
ND 98391.90 98391.90
NE 429912.47 429912.47
NH 1041833.96 1041833.96
NJ 6536578.13 6536578.13
NM 1074173.72 1074173.72
NV 1340743.16 1340743.16
NY 24836331.14 24836331.14
OH 3724289.49 3724289.49
OK 1433826.48 1433826.48
OR 1749180.12 1749180.12
PA 5654946.41 5654946.41
RI 645245.14 645245.14
SC 1033475.80 1033475.80
SD 253088.76 253088.76
TN 2636233.06 2636233.06
TX 12792822.13 12792822.13
UT 4237151.85 4237151.85
VA 7725743.04 7725743.04
VT 1041740.03 1041740.03
WA 5592454.72 5592454.72
WI 1400471.78 1400471.78
WV 295879.59 295879.59
WY 446642.58 446642.58
state_precent = state_support/df_sum
state_precent.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9cfa766a0>

在这里插入图片描述

条形图显示对比情况


# basemape 工具包,pip install basemap
# basemape绘制地图,pip install 无法安装成功
# 下载:https://www.lfd.uci.edu/~gohlke/pythonlibs/
# 州全称对应缩写缩写
abbr = {'Commonwealth of Kentucky':'KY','Commonwealth of Massachusetts':'MA','Commonwealth of Pennsylvania':'PA',
        'State of Rhode Island and Providence Plantations':'RI'}
import matplotlib.pyplot as plt
%matplotlib inline

from mpl_toolkits.basemap import Basemap

# hex16进制表示的颜色
from matplotlib.colors import rgb2hex
from matplotlib.patches import Polygon

'''llcrnrlon	所需地图域(度)的左下角经度。
llcrnrlat	所需地图域左下角的纬度(度)。
urcrnrlon	所需地图域(度)的右上角经度。
urcrnrlat	所需地图域右上角的纬度(度)。
'''

plt.figure(figsize=(12,9))
m = Basemap(llcrnrlon = -122,
           llcrnrlat = 23.41,
           urcrnrlon = -64,
           urcrnrlat = 45,
           projection = 'lcc',
           lat_1 = 30,
           lon_0 = -100
            )

m.drawcoastlines(linewidth=1.5)
m.drawcountries(linewidth = 1.5)

# 地图读取了美国地图形状,m中就有各州的形状,数据
m.readshapefile('./USA/gadm36_USA_1',name = 'states')

colors = []
states = []

cmap = plt.cm.Reds

for shapeinfo in m.states_info:
    a = shapeinfo['VARNAME_1']
#     州的缩写
    s = a.split('|')[0]
    try:
        rate = oba[s]
        colors.append(cmap(rate))
        states.append(s)
    except:
        colors.append(cmap(oba[abbr[s]]))
        states.append(s)
# 州填充颜色
# seg州中的一部分区域,多边形
ax = plt.gca()
for n,seg in enumerate(m.states):
    c = rgb2hex(colors[n])
    poly = Polygon(seg,color = c )
    ax.add_patch(poly)
plt.savefig('./usa_states_rate.jpg')
plt.show()

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/qq_42166308/article/details/103301318