习题一:统计一个文件中各单词出现的次数,并对实现代码进行优化。
素材要求: 一篇正常的英文文档。
正常实现的代码:
# Punctuation treated as word separators. Built once at import time instead of
# on every call; the raw string keeps the backslash as a literal character.
_SEPARATORS = frozenset(r"""!'"#./\()[],*-""")


# Approach 1: straightforward but inefficient -- every character is copied,
# joined back into one string, and that string is then scanned again by split().
def makekey(s: str) -> list:
    """Split *s* into lower-case words.

    Each separator character is replaced by a space and the result is split
    on whitespace, so whitespace inside *s* also separates words.
    """
    lowered = s.lower()
    return ''.join(' ' if c in _SEPARATORS else c for c in lowered).split()


# Approach 2: a single pass that slices the words straight out of the string.
# (It can still be optimised further, e.g. by turning it into a generator.)
def makekey1(s: str) -> list:
    """Split *s* into lower-case words at separator characters only.

    Unlike ``makekey``, whitespace is not a separator here; the script below
    feeds it the whitespace-free tokens produced by ``line.split()``.
    """
    key = s.lower()
    ret = []
    start = 0  # index at which the current word begins
    for i, c in enumerate(key):
        if c not in _SEPARATORS:
            continue
        # start == i means the previous character was a separator too (or we
        # are at the very beginning), so there is no word to emit.
        if start < i:
            ret.append(key[start:i])
        start = i + 1  # step over the separator itself
    # Anything left after the last separator is a trailing word.
    if start < len(key):
        ret.append(key[start:])
    return ret


# ------------------------------------------ #
# Script entry: count the words of sample.txt and print the ten most frequent.
# Guarded so that importing this module does not touch the file system.
if __name__ == "__main__":
    d = {}
    with open('sample.txt', encoding='utf-8') as f:
        for line in f:
            for token in line.split():
                for word in makekey1(token):
                    d[word] = d.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    for i, (k, v) in enumerate(ranked[:10], 1):
        print(i, k, v)
执行结果:
1 path 138
2 the 136
3 is 60
4 a 59
5 os 49
6 if 43
7 and 40
8 to 34
9 on 33
10 of 33

Process finished with exit code 0
下面是代码的优化:
# =============== Optimised version of makekey1 =============== #
# The first approach (makekey) is dropped; the second one becomes a generator.
#
# Separator characters, including whitespace so that whole lines can be fed in.
# The set is defined once at module level and shared as the default argument;
# the backslash is escaped explicitly so the literal has no invalid escapes.
CHARS = frozenset("""!'"#./\\()[],*- \r\n\t""")


def _makekey2(key: str, chars=CHARS):
    """Yield the substrings of *key* that lie between characters in *chars*.

    Runs of adjacent separators produce no empty strings. Case is left
    untouched; callers lower-case the words themselves if they need to.
    """
    start = 0  # index at which the current word begins
    for i, c in enumerate(key):
        if c not in chars:
            continue
        # start == i means the previous character was a separator too (or we
        # are at the very beginning), so there is no word to emit.
        if start < i:
            yield key[start:i]
        start = i + 1  # step over the separator itself
    # Anything left after the last separator runs to the end of the string.
    if start < len(key):
        yield key[start:]


def wordcount(filename: str, encoding='utf-8', ignorewords=frozenset(),
              *, ignorecase: bool = True) -> dict:
    """Count how often each word occurs in the text file *filename*.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to open the file.
        ignorewords: Words to leave out of the result. They are compared with
            the word after lower-casing when *ignorecase* is true.
        ignorecase: When true (the default) words are lower-cased before
            counting; when false the counts are case-sensitive.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    d = {}
    with open(filename, encoding=encoding) as f:
        for line in f:
            words = _makekey2(line)
            if ignorecase:
                words = map(str.lower, words)
            for word in words:
                if word not in ignorewords:
                    d[word] = d.get(word, 0) + 1
    return d


def top(d: dict, n: int = 10):
    """Yield the *n* ``(word, count)`` pairs of *d* with the highest counts.

    Pairs come out in descending count order; sorted() is stable, so ties keep
    the order in which they appear in *d*. Nothing is yielded when n <= 0.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    yield from ranked[:max(n, 0)]


# Script entry: print the ten most frequent words of sample.txt, skipping two
# very common ones. Guarded so that importing this module has no side effects.
if __name__ == "__main__":
    for k, v in top(wordcount('sample.txt', ignorewords={'the', 'is'})):
        print(k, v)
执行结果:
path 138
a 59
os 49
if 43
and 40
to 34
on 33
of 33
return 30
windows 25

Process finished with exit code 0