开发者

Python如何提取html中文本到txt

开发者 https://www.devze.com 2023-01-04 09:38 出处:网络 作者: 彳亍261
目录python提取html中文本到txt正则去标签方式nltkhtmlParserPython提取txt正则内容总结Python提取html中文本到txt
目录
  • python提取html中文本到txt
    • 正则去标签方式
    • nltk
    • htmlParser
  • Python提取txt正则内容
    • 总结

      Python提取html中文本到txt

      正则去标签方式

      # -*- coding: utf-8 -*-
      import re
      
      def html_tag_rm(content: str):
          """Return ``content`` with every ``<...>`` tag removed.

          A tag is a ``<``, one or more characters other than ``>``, and the
          closing ``>``. Text outside such spans is returned untouched.
          """
          return re.sub(r'<[^>]+>', '', content, flags=re.S)
      

      nltk

      比较笨重

      需要安装依赖 nltk, numpy, pyyaml

      # -*- coding: utf-8 -*-
      import nltk
      
      
      # Strip HTML markup from `content` by delegating to nltk.clean_html and
      # return whatever that call returns.
      # NOTE(review): newer NLTK releases are believed to have dropped the
      # implementation of clean_html (the call raises NotImplementedError) --
      # confirm against the installed NLTK version before relying on this.
      def html_tag_rm(content: str):
      	return nltk.clean_html(content)
      
      

      htmlParser

      import re
      from sys import stderr
      from traceback import print_exc

      try:
          # Python 3 location of the parser class.
          from html.parser import HTMLParser
      except ImportError:
          # Python 2 fallback (the original import).
          from HTMLParser import HTMLParser
      
       
      class _DeHTMLParser(HTMLParser): 
          def __init__(self): 
              HTMLParser.__init__(self) 
              self.__text = [] 
       
          def handle_data(self, data): 
              text = data.strip() 
              if len(text) > 0: 
                  text = re.sub('[ \t\r\n]+', ' ', text) 
                  self.__text.append(text + ' ') 
       
          def handle_starttag(self, tag, attrs): 
              if tag == 'p': 
                  self.__text.append('\n\n') 
              elif tag == 'br': 
                  self.__text.append('\n') 
       
          def handle_startendtag(self, tag, attrs): 
              if tag == 'br': 
                  self.__text.append('\n\n') 
       
          def text(self): 
              return ''.join(self.__text).strip() 
       
       
      def dehtml(text):
          """Convert HTML markup to plain text on a best-effort basis.

          Args:
              text: HTML source to convert.

          Returns:
              The plain text extracted by ``_DeHTMLParser``. If conversion
              raises, the traceback is printed to stderr and ``text`` is
              returned unchanged.
          """
          try:
              parser = _DeHTMLParser()
              parser.feed(text)
              parser.close()
              return parser.text()
          except Exception:
              # Best-effort fallback for ordinary failures only; a bare
              # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
              print_exc(file=stderr)
              return text
       
       
      def main():
          """Demo entry point: convert a sample HTML document and print the text."""
          # Raw triple-quoted literal so the sample markup can span several lines.
          # The literal is restored to clean markup: no stray quote characters at
          # the start and no foreign text injected into the sentence.
          text = r'''
              <html>
                  <body>
                      Project: DeHTML<br>
                      Description:<br>
                      This small script is intended to allow conversion from HTML markup to 
                      plain text.
                  </body>
              </html>
          '''
          print(dehtml(text))
       
       
      # Run the demo only when executed as a script, not when imported.
      if __name__ == '__main__':
          # main() takes no arguments.
          main()
      

      Python提取txt正则内容

      其中:

      pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')

      为修改的正则匹配部分

      import re
      import pandas as pd
      # Extract the bracketed value that follows the [""subject""] field from
      # every line of the input file and save the results as CSV.

      # Compiled once, outside the loop; the single capture group is the text
      # inside the brackets right after [""subject""].
      pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')
      tol = []
      # The with-block closes the file on exit, so no explicit close() is needed.
      with open("C:/data1.txt", 'r', encoding='UTF-8') as f:
          for line in f:
              ##s = re.findall('[\u4e00-\u9fa5]', data) print(s)
              url = pattern.findall(line)
              # findall returns a list; keep it only when something matched.
              if url:
                  tol.append(url)
      print(tol)
      pd.DataFrame(tol).to_csv('C:/tol2.csv')
      ##f1 = open("url.txt", "a+", encoding='utf-8')
      ##for urls in url:
      ##    f1.write(urls + '\n')
      ##f1.close()
      ##reg = re.compile(r'^.*\[\"\"subject\"\"\] \[(.*)\]')
      ##msg = '""i;octet""  [""subject""] [""小木虫""] ,accounts :in_main [""2012207469@tju.edu.c'
      ##mtch = reg.match(msg)
      ##print(mtch.group(1))
      

      总结

      以上为个人经验,希望能给大家一个参考,也希望大家多多支持我们。

      0

      精彩评论

      暂无评论...
      验证码 换一张
      取 消

      关注公众号