My question is slightly related to: Strip HTML from strings in Python
I am looking for a simple way to strip HTML code from text. For example:
string = 'foo <SOME_VALID_HTML_TAG> something </SOME_VALID_HTML_TAG> bar'
stripIt(string)
Would then yield foo bar.
Is there any simple tool to achieve this in Python? The HTML code could be nested.
import lxml.html
import re
def stripIt(s):
    """Return the top-level text of HTML fragment *s* with whitespace collapsed.

    The fragment is parsed with lxml, only the text nodes that are direct
    children of the parsed root are kept (text inside any element is
    dropped), and every run of whitespace is reduced to a single space.
    """
    doc = lxml.html.fromstring(s)  # parse html string
    txt = doc.xpath('text()')  # ['foo ', ' bar']
    txt = ' '.join(txt)  # 'foo   bar'
    # Raw string: '\s' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', txt)  # 'foo bar'
# Example: run stripIt on a sample string that contains one placeholder element.
s = 'foo <SOME_VALID_HTML_TAG> something </SOME_VALID_HTML_TAG> bar'
stripIt(s)
returns
foo bar
from BeautifulSoup import BeautifulSoup
def removeTags(html, *tags):
    """Parse *html* and blank out every element whose name is in *tags*.

    Each matching element is replaced by an empty string inside the parsed
    tree, one tag name at a time in the order given. The modified
    BeautifulSoup object itself (not a plain string) is returned.
    """
    soup = BeautifulSoup(html)
    for name in tags:
        for element in soup.findAll(name):
            element.replaceWith("")
    return soup
# Sample document used to demonstrate removeTags.
testhtml = '''
<html>
<head>
<title>Page title</title>
</head>
<body>text here<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>'''
# Print the parsed document after every <b> and <p> element has been replaced by "".
print removeTags(testhtml, 'b', 'p')
You could use regex:
def stripIt(s):
    """Remove HTML elements (tags together with their content) from *s*.

    An element is an opening tag, anything up to the nearest closing tag
    with the same name, and that closing tag. After removal every run of
    whitespace is collapsed to a single space.

    Limitations of the regex approach: elements nested inside an element of
    the *same* name are not balanced, and tags without a matching closing
    tag (for example ``<br>``) are left in place.
    """
    element = (
        r'<([A-Za-z][^\s<>/]*)(?=[\s/>])[^<>]*>'  # opening tag; group 1 is its name
        r'.*?'                                     # content, as little as possible
        r'</\1\s*>'                                # closing tag with the same name
    )
    # DOTALL lets the content span line breaks; IGNORECASE pairs <P> with </p>.
    txt = re.sub(element, '', s, flags=re.DOTALL | re.IGNORECASE)  # Remove html tags
    return re.sub(r'\s+', ' ', txt)  # Normalize whitespace
However, I would prefer Hugh Bothwell's solution as it would be more robust than pure regex.
Try this solution:
from BeautifulSoup import BeautifulSoup
def stripIt(string, tag):
    """Delete from *string* the text of every *tag* element BeautifulSoup finds.

    Each matching element is serialized with ``str()`` and that serialized
    text is removed from the original string, so an element disappears only
    when its serialization occurs verbatim in the input.
    """
    for element in BeautifulSoup(string).findAll(tag):
        string = string.replace(str(element), '')
    return string
# Example: remove a single <p> element from a sample string.
string = 'foo <p> something </p> bar'
print stripIt(string, 'p')
>>> foo bar
# Example: remove two separate <a> elements from one string.
string = 'foo <a>bar</a> baz <a>quux</a>'
print stripIt(string, 'a')
>>> foo baz
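Edit: This only works on validly nested tags. For example, nested <div> elements are removed completely, but nested <a> elements (which HTML does not allow) are not: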
# Example: a <div> nested inside another <div>.
string = 'blaz <div>baz <div>quux</div></div>'
print stripIt(string, 'div')
>>> blaz
# Example: an <a> nested inside another <a>.
string = 'blaz <a>baz <a>quux</a></a>'
print stripIt(string, 'a')
>>> blaz <a>baz </a>
If anyone has this problem and is already working with the Jinja templating language: you can use the striptags filter in templates and the function jinja2.filters.do_striptags() in your code.
You can take advantage of HTMLParser by overriding methods accordingly:
from HTMLParser import HTMLParser
class HTMLStripper(HTMLParser):
    """Collect the text of an HTML document that lies outside every element.

    ``depth`` counts the elements that are currently open; character data
    and character/entity references are recorded only while it is zero.
    Recorded chunks are whitespace-trimmed, empty chunks are skipped, and
    ``get_stripped_text`` joins the chunks with single spaces.
    """

    # Elements that never have a closing tag. Counting them would leave
    # ``depth`` stuck above zero and hide all text that follows.
    VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    # ``unichr`` exists only on Python 2; ``chr`` is the Python 3 equivalent.
    try:
        _to_char = staticmethod(unichr)
    except NameError:
        _to_char = staticmethod(chr)

    # Entity-name -> code-point table; the module was renamed in Python 3.
    try:
        from htmlentitydefs import name2codepoint as _ENTITY_CODEPOINTS
    except ImportError:
        from html.entities import name2codepoint as _ENTITY_CODEPOINTS

    def __init__(self):
        """Create a parser with its own, empty text buffer."""
        # HTMLParser is an old-style class on Python 2, so the base
        # initializer is called directly rather than through super().
        HTMLParser.__init__(self)
        # Per-instance state: a class-level list would be shared by every
        # parser and leak text from one document into the next.
        self.text_parts = []
        self.depth = 0

    def handle_data(self, data):
        """Record *data* (trimmed) when it is outside every element."""
        if self.depth == 0:
            chunk = data.strip()
            if chunk:  # skip whitespace-only data to avoid doubled spaces
                self.text_parts.append(chunk)

    def handle_charref(self, ref):
        """Handle a numeric reference: decimal (``65``) or hex (``x41``)."""
        if ref[:1] in ('x', 'X'):
            code = int(ref[1:], 16)
        else:
            code = int(ref)
        try:
            data = self._to_char(code)
        except (ValueError, OverflowError):
            # Code point outside the supported range: drop it, in the same
            # way unknown named entities are dropped.
            return
        self.handle_data(data)

    def handle_starttag(self, tag, attrs):
        """Enter an element; void elements do not change the depth."""
        if tag not in self.VOID_TAGS:
            self.depth += 1

    def handle_endtag(self, tag):
        """Leave an element; stray or void closing tags are ignored."""
        if tag not in self.VOID_TAGS and self.depth > 0:
            self.depth -= 1

    def handle_entityref(self, ref):
        """Handle a named reference such as ``amp``; unknown names are dropped."""
        try:
            code = self._ENTITY_CODEPOINTS[ref]
        except KeyError:
            return
        self.handle_data(self._to_char(code))

    def get_stripped_text(self):
        """Return the recorded text chunks joined by single spaces."""
        return ' '.join(self.text_parts)
def strip_html_from_text(html):
    """Feed *html* to a fresh HTMLStripper and return the text it collected."""
    stripper = HTMLStripper()
    stripper.feed(html)
    return stripper.get_stripped_text()
def main():
    """Read HTML from standard input and print its stripped text."""
    import sys
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print(strip_html_from_text(sys.stdin.read()))


# Run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
精彩评论