Reading Web Documents with Python

Web Scraping with Python (《Python网络数据采集》), Chapter 7: Data Cleaning
The code below returns a list of 2-grams for the Wikipedia article "Python (programming language)".
In linguistics, an n-gram is a model representing a sequence of n consecutive words in text or speech.
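
As a quick illustration of the idea, the 2-grams of a four-word sentence can be listed by hand:

words = "Python is a language".split()
print([words[i:i+2] for i in range(len(words) - 1)])
# prints [['Python', 'is'], ['is', 'a'], ['a', 'language']]

The full script below does the same thing on real page text, after cleaning it first: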

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, string

def cleanInput(input):
    # collapse newline runs into single spaces
    input = re.sub(r'\n+', " ", input)
    # strip Wikipedia citation markers such as [1] or [23]
    input = re.sub(r'\[[0-9]*\]', "", input)
    # collapse runs of spaces
    input = re.sub(r' +', " ", input)
    # drop non-ASCII characters
    input = bytes(input, "UTF-8")
    input = input.decode("ascii", "ignore")
    cleanInput = []
    input = input.split(' ')

    for item in input:
        # trim leading/trailing punctuation
        item = item.strip(string.punctuation)
        # keep words longer than one character, plus "a" and "i"
        if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
            cleanInput.append(item)
    return cleanInput

def ngrams(input, n):
    input = cleanInput(input)
    output = []
    # every window of n consecutive words is one n-gram
    for i in range(len(input) - n + 1):
        output.append(input[i:i+n])
    return output

html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
ngramList = ngrams(content, 2)   # avoid shadowing the ngrams function
print(ngramList)
print("2-grams count is: " + str(len(ngramList)))

Output: the full list of 2-gram pairs, followed by the total count ("2-grams count is: ...").

re.sub(pattern, repl, string, count=0, flags=0)
Returns the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string with the replacement repl. repl can be a string or a function. For example:

>>> re.sub(r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
...        r'static PyObject*\npy_\1(void)\n{',
...        'def myfunc():')
'static PyObject*\npy_myfunc(void)\n{'
If repl is a function, it is called for every non-overlapping occurrence of pattern. The function takes a single match object argument and returns the replacement string. For example:

>>> def dashrepl(matchobj):
...     if matchobj.group(0) == '-': return ' '
...     else: return '-'
>>> re.sub('-{1,2}', dashrepl, 'pro----gram-files')
'pro--gram files'
>>> re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE)
'Baked Beans & Spam'
pattern may be a string or a compiled pattern object.
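
For example, a precompiled pattern can be passed to its own sub method, equivalent to the flags example above:

>>> pattern = re.compile(r'\sAND\s', re.IGNORECASE)
>>> pattern.sub(' & ', 'Baked Beans And Spam')
'Baked Beans & Spam'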

The optional argument count is the maximum number of pattern occurrences to be replaced; count must be a non-negative integer. If omitted or zero, all occurrences are replaced. Empty matches for the pattern are replaced only when not adjacent to a previous empty match, so sub('x*', '-', 'abxd') returns '-a-b--d-'.
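
A quick demonstration of count limiting the number of replacements:

>>> re.sub('a', 'X', 'banana', count=2)
'bXnXna'
>>> re.sub('a', 'X', 'banana')
'bXnXnX'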

In string-type repl arguments, in addition to the character escapes and backreferences described above, \g<name> will use the substring matched by the group named name, as defined by the (?P<name>...) syntax. \g<number> uses the corresponding group number; \g<2> is therefore equivalent to \2, but is not ambiguous in a replacement such as \g<2>0 (\20 would be interpreted as a reference to group 20, not a reference to group 2 followed by the literal character '0'). The backreference \g<0> substitutes in the entire substring matched by the RE.
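
For instance, named groups make a swap-the-words replacement readable:

>>> re.sub(r'(?P<first>\w+) (?P<last>\w+)', r'\g<last> \g<first>', 'Monty Python')
'Python Monty'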

Plain text

from urllib.request import urlopen

textPage = urlopen("txt URL")
print(textPage.read())
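
Note that read() returns raw bytes here, printed with a b'...' prefix. To work with the content as a string, decode it explicitly; a minimal sketch, assuming the file is UTF-8 encoded and reusing the placeholder URL above:

textPage = urlopen("txt URL")
print(textPage.read().decode('utf-8'))   # bytes -> str; assumes UTF-8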

Reading with UTF-8

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("url")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
content = bytes(content, "UTF-8")   # encode the str into UTF-8 bytes
content = content.decode("UTF-8")   # decode it back, making the encoding explicit
print(content)
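
Rather than hard-coding UTF-8, the charset declared in the HTTP response headers can be checked first. A minimal sketch using only the standard library, falling back to UTF-8 when no charset is declared:

from urllib.request import urlopen

response = urlopen("url")
charset = response.headers.get_content_charset() or "UTF-8"   # charset from the Content-Type header, if any
content = response.read().decode(charset)
print(content)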

Reading CSV files

from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("csv URL").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)

print(dictReader.fieldnames)

for row in dictReader:
    print(row)
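
Each row yielded by DictReader is a dictionary keyed by the CSV header row, so a single column can be read by name. A minimal sketch, assuming a hypothetical header "Name" (substitute a real field from dictReader.fieldnames):

dataFile.seek(0)                        # rewind the stream; the loop above consumed it
dictReader = csv.DictReader(dataFile)   # recreate the reader
for row in dictReader:
    print(row["Name"])                  # "Name" is a hypothetical column header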

PDF files (this requires pip3 install PDFMiner3K)

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()                 # extracted text accumulates here
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    process_pdf(rsrcmgr, device, pdfFile)   # run the PDF through the text converter
    device.close()

    content = retstr.getvalue()
    retstr.close()
    return content

pdfFile = urlopen("PDF url")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
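
The same readPDF function should also work on a local PDF opened in binary mode; a sketch, assuming a hypothetical local path:

pdfFile = open("example.pdf", "rb")   # hypothetical local file; binary mode is required
print(readPDF(pdfFile))
pdfFile.close()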

DOCX documents

from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen("http://pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')

# filter the text by its paragraph style
wordObj = BeautifulSoup(xml_content.decode('utf-8'), "lxml-xml")
textStrings = wordObj.findAll("w:t")
for textElem in textStrings:
    closeTag = ""
    try:
        style = textElem.parent.previousSibling.find("w:pStyle")
        if style is not None and style["w:val"] == "Title":
            print("<h1>")
            closeTag = "</h1>"
    except AttributeError:   # no style information available; print without a tag
        pass
    print(textElem.text)
    print(closeTag)
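
A .docx file is really a ZIP archive of XML parts, which is why ZipFile can open it; the body text lives in word/document.xml. The other parts of the archive can be listed directly:

print(document.namelist())   # lists every part in the archive, e.g. 'word/document.xml'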

Reposted from blog.csdn.net/qq_40965177/article/details/81366336