import re from bs4 import BeautifulSoup, NavigableString, Tag, Comment import urllib.request
# Fetch the whole HTML page and parse it.
def getAllHtml(url):
    """Download ``url`` and return it parsed, with comments and ``<hr>`` removed.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The ``BeautifulSoup`` tree built by the ``html5lib`` parser, after
        every HTML comment node and every ``<hr>`` tag has been extracted
        from it.

    Raises:
        Whatever ``urllib.request.urlopen`` raises; errors are not caught here.
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the body has been read, even if reading fails.
    with urllib.request.urlopen(url) as response1:
        html_doc = response1.read()

    # Create the BeautifulSoup parser object; the raw bytes are decoded as
    # ISO-8859-1.
    soup = BeautifulSoup(html_doc, "html5lib", from_encoding="iso-8859-1")

    # Drop HTML comments so they never show up in extracted text.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    # Drop horizontal rules.
    for s in soup(["hr"]):
        s.extract()

    # Hand the cleaned tree back to the caller instead of discarding it.
    return soup
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Image, Table, TableStyle from reportlab.lib.colors import white, black, blue from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.rl_config import defaultPageSize from reportlab.lib.units import inch from reportlab.lib.enums import TA_RIGHT, TA_CENTER, TA_LEFT from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont
# Put the table-of-contents entries gathered so far into the PDF story.

# Index the section titles once by the text before their first ".", so each
# chapter can look up its own sections instead of rescanning the whole list.
sections_by_prefix = {}
for section in allSectionToc:
    section_text = section.get_text()
    sections_by_prefix.setdefault(section_text.split(".")[0], []).append(section_text)

# Regular chapters first, then appendices: both kinds of entry are rendered
# the same way, so one loop handles the two lists in their original order.
for toc_entries in (allChapter, allAppendixToc):
    for chapter in toc_entries:
        chapter_text = chapter.get_text()
        p = Paragraph(chapter_text + " " + chapter.a.get_text(), chapterStyle)
        Story.append(p)

        # The first whitespace-separated token of the chapter text is the key
        # its sections are matched on; an entry with no text has no sections.
        chapter_words = chapter_text.split()
        if not chapter_words:
            continue

        # A fresh Paragraph is created for every appended section line.
        for section_text in sections_by_prefix.get(chapter_words[0], ()):
            p = Paragraph(section_text, sectionStyle)
            Story.append(p)