Chapter 3 - Processing Raw Text

Problems to solve:

Downloading plain-text documents from the web

Tokenization
Creating an nltk.Text
Locating the start and end of the text by content
Downloading HTML
Parsing HTML
Filtering out irrelevant content

Reading local files

Regular expressions

Finding words ending in "ed"

Word puzzle: 8 letters, the 3rd letter is "j", the 6th letter is "t"
T9 keypad input
The "+" operator in regular expressions
Extracting character blocks
Finding word stems
Searching tokenized text
Normalizing text
Stemmers
Lemmatization
Segmentation
Lists and strings
Strings and formatting
Lining things up

--------------------------------------------------------------------------------------------------------------------------



# coding: utf-8


# In[1]:


from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
from nltk.data import PathPointer, ZipFilePathPointer, find




# In[2]:


# Download a plain-text document from the web (Project Gutenberg)
from urllib import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = urlopen(url)
raw = response.read().decode('utf8')
print type(raw)
print len(raw)
print raw[:75]
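# Note: this notebook targets Python 2 (see the __future__ import above). On
# Python 3 the same download would look roughly like this (a minimal sketch,
# assuming NLTK 3 and network access):
#   from urllib.request import urlopen
#   raw = urlopen("http://www.gutenberg.org/files/2554/2554.txt").read().decode('utf8')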




# In[3]:


# Tokenization
tokens = word_tokenize(raw)
print type(tokens)
print len(tokens)
print tokens[:10]




# In[4]:


# Create an nltk.Text object from the tokens
text = nltk.Text(tokens)
print type(text)
print text[1024:1062]
text.collocations()  # collocations() prints its result itself, so no print is needed




# In[10]:


# Locate the start and end of the body text by its content
print raw.find("PART I")
print raw.rfind("End of Project Gutenberg's Crime")
raw = raw[5338:1157746]
# raw=raw[raw.find("PART I"):raw.rfind("End of Project Gutenberg's Crime")]
print raw.find("PART I")




# In[5]:


# Downloading HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')
html[:60]




# In[13]:


print html




# In[7]:


# Parsing HTML with BeautifulSoup
from bs4 import BeautifulSoup
raw = BeautifulSoup(html,'lxml').get_text()
tokens = word_tokenize(raw)
tokens




# In[35]:


bs = BeautifulSoup(html,'lxml')
print bs.find("div",class_='bodytext').get_text()




# In[8]:


# Filter out irrelevant content
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')




# In[9]:


print text




# In[14]:


# Reading local files
f = open('document.txt')  # raises IOError if the file is not in the current directory




# In[15]:


f = open('d:/data/document.txt')
f.read()




# In[16]:


f = open('d:/data/document.txt','rU')
for line in f:
    print(line.strip())




# In[13]:


raw = open('d:/data/document.txt').read()
print type(raw)
tokens = word_tokenize(raw)
print type(tokens)
words = [w.lower() for w in tokens]
print type(words)
vocab = sorted(set(words))
print type(vocab)




# In[28]:


# A list can be appended to, but a string cannot: the second line raises AttributeError
vocab.append('blog')
raw.append('blog')




# In[29]:


# Concatenating a string with a list is also an error: the last line raises TypeError
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
query + beatles




# In[24]:


# Unicode characters
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f= path.open(encoding='latin2')
for line in f:
    line = line.strip()
    print(line)




# In[46]:


f= path.open()
for line in f:
    line = line.strip()
    print(line)




# In[47]:


ord('a')




# In[48]:


a=u'\u0061'
a




# In[49]:


print a




# In[18]:


ord(u'ń')




# In[20]:


nacute =u'\u0144'
nacute




# In[21]:


nacute.encode('utf8')




# In[22]:


print nacute.encode('utf8')
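# Encoding and decoding are inverses; decoding the UTF-8 bytes recovers the
# original Unicode string (a small added check, nothing beyond the standard library).
print nacute.encode('utf8').decode('utf8') == nacute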




# In[25]:


import unicodedata
lines = path.open( encoding='latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))
for c in line: 
     if ord(c) > 127:
         print('%s U+%04x %s'% (c.encode('utf8'), ord(c), unicodedata.name(c)))




# In[26]:


line.find(u'zosta\u0142y')  # the u prefix is needed on Python 2 so that \u0142 is interpreted
line = line.lower()
line




# In[27]:


line.encode('unicode_escape')




# In[76]:


import re
m = re.search(u'\u015b\w*', line)
m.group()




# In[77]:


word_tokenize(line)




# In[28]:


# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]




# In[79]:


# Find words ending in "ed"
[w for w in wordlist if re.search('ed$', w)]




# In[29]:


# Word puzzle: 8 letters, the 3rd letter is "j", the 6th letter is "t"
[w for w in wordlist if re.search('^..j..t..$', w)]




# In[81]:


# T9 keypad: words that the key sequence 4-6-5-3 could produce
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]




# In[30]:


# The "+" operator in regular expressions
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]




# In[31]:


[w for w in chat_words if re.search('^[ha]+$', w)]




# In[32]:


wsj = sorted(set(nltk.corpus.treebank.words()))
[w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)]




# In[84]:


[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]




# In[33]:


[w for w in wsj if re.search('^[0-9]{4}$', w)]




# In[34]:


[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]




# In[87]:


[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]




# In[35]:


[w for w in wsj if re.search('(ed|ing)$', w)]




# In[36]:


# Extracting character blocks (vowels here)
word = 'supercalifragilisticexpialidocious'
print re.findall(r'[aeiou]', word)
print len(re.findall(r'[aeiou]', word))




# In[90]:


wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)




# In[91]:


regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)


english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))




# In[38]:


rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()




# In[39]:


cv_word_pairs = [(cv, w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print cv_index['su']
print cv_index['po']




# In[40]:


# Finding word stems by stripping suffixes
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
             return word[:-len(suffix)]
    return word
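

# A quick illustration of the suffix-stripper above (added example): it simply
# removes the first matching suffix, so 'flies' is over-stripped to 'fl'.
print stem('processing'), stem('women'), stem('flies')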




# In[95]:


re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')




# In[96]:


re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')




# In[97]:


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')




# In[98]:


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')




# In[99]:


re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')




# In[100]:


re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')




# In[41]:


def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem


raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government.  Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
[stem(t) for t in tokens]




# In[102]:


# Searching tokenized text
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")  # findall() prints its matches directly
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")




# In[103]:


from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")




# In[42]:


### Normalizing text ###
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government.  Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)




# In[43]:


# Stemmers
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]




# In[45]:


[lancaster.stem(t) for t in tokens]




# In[108]:


# Lemmatization with the WordNet lemmatizer
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]




# In[121]:


#### Segmentation ####




# In[123]:


len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())




# In[124]:


text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])




# In[125]:


# Word segmentation




# In[126]:


def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words




# In[127]:


text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
segment(text, seg1)




# In[128]:


segment(text, seg2)
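# Sanity check (added illustration): segmentation only inserts word boundaries,
# so joining the pieces reconstructs the original character stream.
print ''.join(segment(text, seg2)) == text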




# In[134]:


def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size




# In[136]:


text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print evaluate(text, seg3)
print evaluate(text, seg2)
print evaluate(text, seg1)




# In[50]:


from random import randint


def flip(segs, pos):
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]


def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs


def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature,0)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs




# In[151]:


text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)




# In[152]:


#### Lists and strings ####




# In[155]:


# From a list to a string
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print ' '.join(silly)
print ';'.join(silly)
print ''.join(silly)




# In[51]:


# Strings and formatting
word = 'cat'
sentence = """hello 
                world"""
print(word)
print(sentence)




# In[52]:


word




# In[159]:


sentence




# In[53]:


fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print(word, '->', fdist[word], '; ')




# In[54]:


for word in sorted(fdist):
    print '%s->%d;' % (word, fdist[word]),
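# The trailing comma keeps the output on one line in Python 2; the print-function
# spelling would be roughly: print('%s->%d;' % (word, fdist[word]), end=' ')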




# In[55]:


'%s->%d;' % ('cat',3)




# In[56]:


'%s->%d;' % 'cat'  # too few arguments for the format string: raises TypeError




# In[57]:


'%s->'%'cat'




# In[58]:


'%d'% 3




# In[59]:


'I want a %s right now' % 'coffee'




# In[60]:


print '%s wants a %s %s'%('Lee', 'sandwich', 'for lunch')




# In[67]:


template = 'Lee wants a %s right now'
menu = ('sandwich', 'spam fritter', 'pancake')
for snack in menu:
     print template % snack




# In[69]:


# Lining things up: field widths and padding
'%6s' % 'dog'




# In[179]:


'%-6s' % 'dog'




# In[70]:


width = 6
'%-*s' % (width, 'dog')




# In[181]:


count, total = 3205, 9375
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)




# In[183]:


def tabulate(cfdist, words, categories):
    print '%-16s' % 'Category',
    for word in words: # column headings
        print '%6s' % word,
    print
    for category in categories:
        print '%-16s' % category, # row heading
        for word in words: # for each word
            print '%6d' % cfdist[category][word], # print table cell
        print                                              # end the row
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)




# In[71]:


output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")




# In[185]:


len(words)




# In[186]:


str(len(words))




# In[72]:


output_file.write(str(len(words)) + "\n")
output_file.close()
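# Optional check (added sketch): read the file back; the last token written is the vocabulary size.
print open('output.txt').read().split()[-1]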




# In[ ]:









Reposted from blog.csdn.net/qq_35394891/article/details/80634593