# Also add the following code so that, each time the environment (kernel)
# starts, only the code below needs to be run:
import json
import sys

import matplotlib.pyplot as plt
import pandas as pd
# Extend the module search path with the external-libraries directory.
sys.path.append('/home/aistudio/external-libraries')

# Read the arXiv metadata file line by line (each line is parsed as one JSON
# document) and keep only the author/category fields of every record.
records = []
with open("/home/aistudio/data/data67990/arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for line in f:
        d = json.loads(line)
        records.append({
            'authors': d['authors'],
            'categories': d['categories'],
            'authors_parsed': d['authors_parsed'],
        })

# One row per parsed line; the columns are the three keys kept above.
data = pd.DataFrame(records)
# Keep only the papers whose 'categories' value contains 'cs.CV'.
data2 = data[data['categories'].apply(lambda x: 'cs.CV' in x)]

# Flatten the per-paper author lists into one list of authors.  A nested
# comprehension is linear, whereas sum(lists, []) re-copies the accumulated
# list on every addition.
all_authors = [
    author
    for paper_authors in data2['authors_parsed']
    for author in paper_authors
]

# Join the parts of each parsed author entry into one space-separated name and
# store the names in a single-column DataFrame (column 0).
authors_names = [' '.join(x) for x in all_authors]
authors_names = pd.DataFrame(authors_names)
# Draw the ten most frequent author names as a horizontal bar chart.
plt.figure(figsize=(10, 6))

# Count the names once and reuse the result for both the bars and the labels
# instead of running value_counts() over the whole column twice.
top_names = authors_names[0].value_counts().head(10)
top_names.plot(kind='barh')

# Adjust the figure: put the author names on the y ticks and label the axes.
names = top_names.index.values
_ = plt.yticks(range(len(names)), names)
plt.ylabel('Author')
plt.xlabel('Count')
# Take the first element of every parsed author entry (treated here as the
# last name) and store them in a single-column DataFrame (column 0).
authors_lastnames = [x[0] for x in all_authors]
authors_lastnames = pd.DataFrame(authors_lastnames)

# Draw the ten most frequent last names as a horizontal bar chart.
plt.figure(figsize=(10, 6))

# Count once and reuse the result for both the bars and the tick labels
# instead of running value_counts() over the whole column twice.
top_lastnames = authors_lastnames[0].value_counts().head(10)
top_lastnames.plot(kind='barh')

# Put the names on the y ticks and label the axes.
names = top_lastnames.index.values
_ = plt.yticks(range(len(names)), names)
plt.ylabel('Author')
plt.xlabel('Count')