字典统计词频

版权声明:本文为博主原创文章,未经博主允许不得转载。https://blog.csdn.net/yukyin https://blog.csdn.net/yukyin/article/details/83030804
import pandas as pd
import numpy as np

# Column B holds several values per row, so each cell must be one comma-joined
# string (e.g. '4,2,1') that str.split can break apart later. A nested list
# ([[...], [...], [...]]) would not work here: a list has no split() method.
temp = pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': ['4,2,1', '5,3,2', '6,4,3'],
    },
    index=['a', 'b', 'c'],
)
print(temp)
# Expected output:
#    A      B
# a  1  4,2,1
# b  2  5,3,2
# c  3  6,4,3


# iterrows() yields one (index label, row) pair per DataFrame row; each row
# arrives as a pandas Series keyed by column name. Every iteration emits four
# prints in sequence: the label, the whole row, then cell A and cell B.
for label, record in temp.loc[:, ['A', 'B']].iterrows():
    print(label)
    # Labels over the three iterations: a, b, c
    print(record)
    # The first iteration prints the Series for row 'a':
    #   A        1
    #   B    4,2,1
    #   Name: a, dtype: object
    # Rows 'b' and 'c' use the same layout with (2, '5,3,2') and (3, '6,4,3').
    cell_a, cell_b = record['A'], record['B']
    print(cell_a)
    # 1, then 2, then 3
    print(cell_b)
    # 4,2,1, then 5,3,2, then 6,4,3


# Word-frequency count
# Approach 1 (the simpler one): defaultdict. A missing key is created with a
# default value on first access, so no existence check is needed before
# incrementing.
from collections import defaultdict

# int() returns 0, which makes `int` the idiomatic zero-default factory for
# counters. Unlike a lambda it gives a stable repr and keeps the dict picklable.
back = defaultdict(int)
# Only column B is needed, so iterate its cells directly instead of building a
# Series per row with iterrows().
for cell in temp['B']:
    # Column B is comma-separated, hence split(','). For a whitespace-separated
    # column, call split() with no argument instead.
    for word in cell.split(','):
        # Reading back[word] before the increment would yield 0 (the default).
        back[word] += 1
        print(back[word])
        # Running counts printed in order: 1 1 1 1 1 2 1 2 2
print(back)
# With insertion-ordered dicts (Python 3.7+) this prints:
# defaultdict(<class 'int'>, {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1})


# Approach 2: plain dict. A key must exist before it can be read, so the first
# occurrence of each word has to be handled explicitly.
back = {}
# Only column B is needed, so iterate its cells directly instead of building a
# Series per row with iterrows().
for cell in temp['B']:
    for word in cell.split(','):
        # Reading back[word] here for an unseen word raises KeyError.
        try:
            back[word] += 1
        except KeyError:
            # First occurrence of this word. Catch only the missing-key case so
            # that unrelated errors (or Ctrl-C) are not silently swallowed.
            back[word] = 1
        print(back[word])
        # Running counts printed in order: 1 1 1 1 1 2 1 2 2
print(back)
# With insertion-ordered dicts (Python 3.7+) this prints:
# {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1}

猜你喜欢

转载自blog.csdn.net/yukyin/article/details/83030804