目录
- Python提取html中文本到txt
- 正则去标签方式
- nltk
- htmlParser
- Python提取txt正则内容
- 总结
Python提取html中文本到txt
正则去标签方式
# -*- coding: utf-8 -*-
import re

# Matches one tag-like span: "<", one or more characters that are not ">",
# then ">". Compiled once at import time instead of on every call.
_TAG_RE = re.compile(r'<[^>]+>', re.S)


def html_tag_rm(content: str) -> str:
    """Return *content* with every ``<...>`` span removed.

    Only the markup itself is deleted. Text between tags is kept as-is
    (including the bodies of ``<script>``/``<style>`` elements), and
    character references such as ``&amp;`` are not decoded.

    Args:
        content: HTML source text.

    Returns:
        The text with all tag-like spans replaced by the empty string.
    """
    return _TAG_RE.sub('', content)
nltk
比较笨重
需要安装依赖 nltk, numpy, pyyaml
# -*- coding: utf-8 -*-
import re

try:
    import nltk
except ImportError:
    # NLTK is optional here: without it the regex fallback below is used.
    nltk = None


def _strip_markup(content: str) -> str:
    """Regex-only tag stripper used when ``nltk.clean_html`` is unusable."""
    # Drop script/style elements together with their bodies.
    text = re.sub(r'(?is)<(script|style)\b.*?>.*?</\1>', '', content)
    # Drop comments before generic tags, since a comment may contain '>'.
    text = re.sub(r'(?s)<!--.*?-->', '', text)
    # Replace each remaining tag with a space so adjacent words stay apart.
    text = re.sub(r'(?s)<[^>]+>', ' ', text)
    # Collapse the whitespace left behind by the removed markup.
    return re.sub(r'\s+', ' ', text).strip()


def html_tag_rm(content: str) -> str:
    """Return the text of *content* with HTML markup removed.

    ``nltk.clean_html`` is tried first. In NLTK 3.x that function is a stub
    that raises ``NotImplementedError``, so in that case (or when NLTK is not
    installed, or the attribute is missing) a standard-library regex fallback
    is used instead.

    Args:
        content: HTML source text.

    Returns:
        The markup-free text.
    """
    if nltk is not None:
        try:
            return nltk.clean_html(content)
        except (NotImplementedError, AttributeError):
            # clean_html is unavailable in this NLTK version; fall through.
            pass
    return _strip_markup(content)
htmlParser
import re
from sys import stderr
from traceback import print_exc

try:
    from html.parser import HTMLParser  # Python 3 location
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 location


class _DeHTMLParser(HTMLParser):
    """HTML parser that accumulates a plain-text rendering of the input."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; joined by text().
        self.__text = []

    def handle_data(self, data):
        """Store a text node with whitespace runs collapsed to one space."""
        text = data.strip()
        if text:
            text = re.sub('[ \t\r\n]+', ' ', text)
            # Trailing space keeps consecutive text nodes from running together.
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit a blank line for <p> and a single newline for <br>."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Emit a blank line for the self-closing form <br/>."""
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        """Return everything collected so far as one stripped string."""
        return ''.join(self.__text).strip()


def dehtml(text):
    """Convert HTML markup to plain text.

    Args:
        text: HTML source text.

    Returns:
        The extracted plain text. If parsing raises, the traceback is printed
        to stderr and the original *text* is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Best-effort conversion: report the problem but hand back the input.
        print_exc(file=stderr)
        return text


def main():
    """Run dehtml() on a small sample document and print the result."""
    text = r'''
        <html>
            <body>
                Project: DeHTML<br>
                Description:<br>
                This small script is intended to allow conversion from HTML
                markup to plain text.
            </body>
        </html>
    '''
    print(dehtml(text))


if __name__ == '__main__':
    main()
Python提取txt正则内容
其中:
pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')
为修改的正则匹配部分
import re

import pandas as pd

INPUT_PATH = "C:/data1.txt"
OUTPUT_PATH = "C:/tol2.csv"

# Captures the bracketed value that follows the [""subject""] marker.
# For a line such as
#   ""i;octet"" [""subject""] [""小木虫""] ,accounts :in_main ...
# the captured group is ""小木虫"".  Edit this pattern to extract other fields.
SUBJECT_PATTERN = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')


def extract_subjects(lines, pattern=SUBJECT_PATTERN):
    """Collect the regex captures found in each line.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line with
            ``findall``.

    Returns:
        A list with one entry per matching line; each entry is the list
        returned by ``pattern.findall`` for that line. Lines without a match
        are skipped.
    """
    matches = []
    for line in lines:
        found = pattern.findall(line)
        # findall returns a list (never None); an empty list means no match.
        if found:
            matches.append(found)
    return matches


def main():
    """Read INPUT_PATH, extract the subject fields and write them to OUTPUT_PATH."""
    with open(INPUT_PATH, 'r', encoding='UTF-8') as f:
        tol = extract_subjects(f)
    print(tol)
    pd.DataFrame(tol).to_csv(OUTPUT_PATH)


if __name__ == '__main__':
    main()
总结
以上为个人经验,希望能给大家一个参考,也希望大家多多支持我们。
精彩评论