Installez les modules nécessaires :
pip install pdfplumber
pip install PyPDF2
pip install pywin32
Lecture du texte d'un fichier PDF
"""
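path: path to the PDF file
pdfplumber.open(path): opens the PDF file for reading
PDF.pages: list of the pages of the opened PDF (its length is the page count)
Page.extract_text(): returns the text content of one page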
"""
def get_pdf_content(path):
    """Extract the text of every page of a PDF file.

    :param path: path of the PDF file to read
    :return: list with one entry per page, in page order, holding whatever
        ``extract_text()`` returned for that page
    """
    # The context manager closes the PDF once the pages have been read;
    # the previous version never closed it.
    with pdfplumber.open(path) as pdf:
        return [page.extract_text() for page in pdf.pages]
if __name__ == '__main__':
    # Demo: print the text extracted from the first page of test.pdf.
    print(get_pdf_content('test.pdf')[0])
Lire les données des tableaux d'un PDF
def get_pdf_tables(path):
    """Collect every table found in a PDF file.

    Rows that contain at least one empty-string cell are dropped.

    :param path: path of the PDF file to read
    :return: list of tables across all pages, in page order; each table is
        the list of its rows that were kept
    """
    all_table = []
    # The context manager closes the PDF once the tables have been read;
    # the previous version never closed it.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                all_table.append([row for row in table if '' not in row])
    return all_table
Récupérez les données des tableaux du PDF et écrivez-les dans un fichier Excel
import pdfplumber
import pandas as pd
def get_pdf_tables(path):
    """Collect every table found in a PDF file.

    Rows that contain at least one empty-string cell are dropped.

    :param path: path of the PDF file to read
    :return: list of tables across all pages, in page order; each table is
        the list of its rows that were kept
    """
    all_table = []
    # The context manager closes the PDF once the tables have been read;
    # the previous version never closed it.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                all_table.append([row for row in table if '' not in row])
    return all_table
def write_to_excel(all_tables):
    """Write each table to its own sheet of the workbook "表格数据.xlsx".

    Sheets are named ``pd_0``, ``pd_1``, ... following the order of
    ``all_tables``.

    :param all_tables: iterable of tables, each one a value accepted by
        ``pd.DataFrame`` (e.g. a list of rows)
    """
    # Used as a context manager, ExcelWriter saves and closes the workbook
    # on exit. This replaces the explicit writer.save() call and the
    # ``encoding`` argument of to_excel, both removed in pandas 2.0.
    with pd.ExcelWriter("表格数据.xlsx") as writer:
        for index, content in enumerate(all_tables):
            name = "pd_{}".format(index)
            pd.DataFrame(content).to_excel(writer, sheet_name=name)
if __name__ == '__main__':
    # Demo: extract the tables of test.pdf and export them to Excel.
    write_to_excel(get_pdf_tables('test.pdf'))
Convertir un document Word en PDF
from win32com.client import constants, gencache
def word_to_pdf(word_path, pdf_path):
    """Convert a Word document to PDF through the Word COM application.

    The document is opened read-only and exported with markup and with
    bookmarks created from its headings.

    :param word_path: path of the Word document to convert
    :param pdf_path: path of the PDF file to generate
    """
    # NOTE(review): Word presumably resolves relative paths against its own
    # working directory rather than Python's -- confirm, and prefer passing
    # absolute paths.
    word = gencache.EnsureDispatch('Word.Application')
    try:
        doc = word.Documents.Open(word_path, ReadOnly=1)
        doc.ExportAsFixedFormat(
            pdf_path,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    finally:
        # Always quit Word, even when opening or exporting fails; otherwise
        # the Word process started above is left running.
        word.Quit(constants.wdDoNotSaveChanges)
Ajouter un filigrane à un PDF
from PyPDF2 import PdfFileReader, PdfFileWriter
def water_to_file(pdf_path, water_path, output_path='new.pdf'):
    """Merge the first page of a watermark PDF onto every page of a PDF.

    :param pdf_path: PDF to be watermarked
    :param water_path: PDF whose first page is used as the watermark
    :param output_path: where the watermarked PDF is written; defaults to
        ``'new.pdf'``, the location that used to be hard-coded
    :return: None
    """
    # NOTE(review): PdfFileReader/PdfFileWriter and the camelCase methods are
    # the legacy PyPDF2 API -- confirm the installed PyPDF2 version still
    # provides them.
    pdf = PdfFileReader(pdf_path)
    water = PdfFileReader(water_path).getPage(0)
    writer = PdfFileWriter()
    for i in range(pdf.getNumPages()):
        page = pdf.getPage(i)
        page.mergePage(water)
        writer.addPage(page)
    # Write inside a with-block so the output handle is flushed and closed;
    # the previous version never closed it.
    with open(output_path, 'wb') as output_file:
        writer.write(output_file)