i want the following functionality.
input : this is test <b> bold text </b> normal text
expected output: this is test normal text
i.e. remove the content of the specified tag
Solution using BeautifulSoup:
from BeautifulSoup import BeautifulSoup
def removeTag(soup, tagname):
    """Remove every ``tagname`` element, content included, from ``soup``.

    The document is modified in place and nothing is returned.

    soup    -- parsed document exposing ``findAll(name)``; the usage example
               that follows passes a BeautifulSoup object.
    tagname -- tag name handed to ``soup.findAll`` to select the elements.
    """
    for tag in soup.findAll(tagname):
        # extract() takes the element out of the tree; per the sample output
        # in this answer, the text nested inside it disappears with it.
        tag.extract()
# Demo: strip the <b> elements first, then the <d> elements, printing the
# document after each pass.
s = BeautifulSoup("abcd <b> btag </b> hello <d>dtag</d>")
for name in ("b", "d"):
    removeTag(s, name)
    print(s)
returns:
>>>
abcd hello <d>dtag</d>
abcd hello
With BeautifulSoup:
from BeautifulSoup import BeautifulSoup
# Joins everything findAll(text=True) returns into a single string.
# `page` is the HTML to process and must already be defined by the caller.
''.join(BeautifulSoup(page).findAll(text=True))
Found at http://www.ghastlyfop.com/blog/2008/12/strip-html-tags-from-string-python.html
If you don't mind Python (although regexps are fairly generic), you can take some inspiration from Django's strip_tags filter.
Reproduced here for completeness -
def strip_tags(value):
    """Return the given HTML with every ``<...>`` span deleted.

    ``value`` is first passed through ``force_unicode``; the substitution is
    then applied to that result.
    """
    text = force_unicode(value)
    return re.sub(r'<[^>]*?>', '', text)
EDIT: If you're using this, or any other regexp solution, please keep in mind that it lets through carefully-crafted HTML (see comment) as well as HTML comments and hence should not be used with untrusted input. Consider using some of the beautifulsoup, html5lib or lxml answers for untrusted input instead.
Try with:
import re
# Demo: delete every "<...>" span from the sample string and print what is left.
input = 'this is test <b> bold text </b> normal text'
output = re.sub(r'<[^<]*?/?>', '', input)
print(output)
Looks like you want HTMLParser (html.parser in Python 3).
from HTMLParser import HTMLParser
from sys import stdout
class Filter(HTMLParser):
    """Echo HTML while dropping selected elements together with their content.

    Everything fed to the parser is written back out unchanged, except that
    any element whose name is in ``ignored_tags`` is removed along with all
    markup and text nested inside it.

    ignored_tags -- container of tag names to drop. HTMLParser reports tag
                    names in lower case, so the names should be lower case.
    out          -- object with a ``write(text)`` method that receives the
                    filtered markup; defaults to ``sys.stdout``.

    Limitation: an ignored element that is opened but never closed (and is
    not one of the void elements below) suppresses the rest of the input.
    """

    # Elements that never have an end tag. Dropping one of these must not
    # switch the filter into "ignoring" mode, because nothing would ever
    # switch it back.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, ignored_tags, out=None):
        # HTMLParser is an old-style class on Python 2, where super() raises
        # TypeError, so the base initializer is called directly.
        HTMLParser.__init__(self)
        # On Python 3 the parser would otherwise decode character references
        # and hand them to handle_data(); turning that off keeps them going
        # through handle_charref()/handle_entityref() so they are echoed
        # verbatim. On Python 2 this is just an unused attribute.
        self.convert_charrefs = False
        # Number of currently open ignored elements; output is suppressed
        # while this is greater than zero.
        self.ignorelevel = 0
        self.ignored_tags = ignored_tags
        self.out = stdout if out is None else out

    def _emit(self, text):
        """Write ``text`` unless the parser is inside an ignored element."""
        if self.ignorelevel == 0:
            self.out.write(text)

    def handle_starttag(self, tag, attrs):
        if tag in self.ignored_tags:
            if tag not in self._VOID_TAGS:
                self.ignorelevel += 1
            return
        # get_starttag_text() is the tag exactly as it appeared in the input,
        # angle brackets included.
        self._emit(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        # A self-closing tag has no content, so it never changes ignorelevel.
        if tag not in self.ignored_tags:
            self._emit(self.get_starttag_text())

    def handle_endtag(self, tag):
        if tag in self.ignored_tags:
            # The closing tag of an ignored element is dropped as well.
            if self.ignorelevel > 0 and tag not in self._VOID_TAGS:
                self.ignorelevel -= 1
            return
        self._emit('</' + tag + '>')

    def handle_data(self, data):
        self._emit(data)

    def handle_charref(self, name):
        self._emit('&#' + name + ';')

    def handle_entityref(self, name):
        self._emit('&' + name + ';')

    def handle_comment(self, data):
        # ``data`` already carries the comment's own whitespace; adding more
        # would alter the comment on every pass.
        self._emit('<!--' + data + '-->')

    def handle_decl(self, data):
        self._emit('<!' + data + '>')

    def handle_pi(self, data):
        self._emit('<?' + data + '>')
I would use http://code.google.com/p/html5lib/ if you want to include some safe tags.
See the "Sanitizing Tokenizer" section at http://code.google.com/p/html5lib/wiki/UserDocumentation.
Remember to test for vulnerabilities if it's an important service: http://ha.ckers.org/xss.html.
This is working code taken from my project Supybot, so it's fairly well tested:
class HtmlToText(sgmllib.SGMLParser):
    """Collect the character data of a document, putting a filler string
    wherever the parser reports an unknown start or end tag.

    Taken from some eff-bot code on c.l.p.
    """

    # Private copy of the stock entity table with 'nbsp' mapped to a space.
    entitydefs = htmlentitydefs.entitydefs.copy()
    entitydefs['nbsp'] = ' '

    def __init__(self, tagReplace=' '):
        self.data = []
        self.tagReplace = tagReplace
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attr):
        self.data.append(self.tagReplace)

    def unknown_endtag(self, tag):
        self.data.append(self.tagReplace)

    def handle_data(self, data):
        self.data.append(data)

    def getText(self):
        """Return the collected pieces joined, stripped and passed through
        normalizeWhitespace()."""
        return normalizeWhitespace(''.join(self.data).strip())


def htmlToText(s, tagReplace=' '):
    """Turn the HTML string ``s`` into text.

    tagReplace is the string put in place of each HTML tag.
    """
    parser = HtmlToText(tagReplace)
    parser.feed(s)
    return parser.getText()
As the docstring notes, it originated with Fredrik Lundh, not me. As they say, great authors steal :)
Sam's answer should do what's wanted fairly well as far as I can tell, but it may pay to make sure that any left over <> characters are replaced with &lt; and &gt; respectively to prevent misuse/invalid HTML.
This approach has the advantage that it can accept incredibly malformed HTML references/tags. BeautifulSoup also handles malformed tags fairly well but html5lib, sgmllib and htmllib can choke on invalid code, some more than others if I remember correctly.
The following code also validates HTML character references (the `&...;` sequences):
import re
from htmlentitydefs import name2codepoint, codepoint2name
# The sixteen hexadecimal digits; DHex holds them in both letter cases as
# keys (the values are unused) so membership tests are a single lookup.
S = '1234567890ABCDEF'
DHex = dict.fromkeys(S.lower() + S.upper())


def IsHex(S):
    """Return True when ``S`` is non-empty and made up only of hex digits.

    An empty (or otherwise falsy) ``S`` yields False.
    """
    return bool(S) and all(ch in DHex for ch in S)
def UnEscape(S, LReEscape=None):
    """Convert the HTML character references in ``S`` into characters.

    Recognised forms are named references listed in ``name2codepoint``
    (e.g. ``&amp;``), decimal references (e.g. ``&#52;``) and hexadecimal
    references (e.g. ``&#x34;``). Anything else that follows an ``&`` is
    left in the output untouched, ``&`` included.

    S         -- text that may contain character references.
    LReEscape -- optional list. When given, a ``(position, character)``
                 tuple is appended for every reference that was decoded,
                 ``position`` being the index of the character in the
                 returned string. ReEscape() uses this to turn the result
                 back into HTML, which keeps references such as ``&nbsp;``
                 from being lost.

    Returns the decoded string.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() already covers all of Unicode

    record = LReEscape is not None
    LRtn = []
    pos = 0  # length of the output assembled so far

    for index, segment in enumerate(S.split('&')):
        if index == 0:
            # Text in front of the first '&' cannot hold a reference.
            piece = segment
        else:
            LSplit = segment.split(';')
            name = LSplit[0]
            char = None
            try:
                if name in name2codepoint:
                    # Exact match first: entity names are case-sensitive
                    # ('Eacute' and 'eacute' are different characters).
                    char = to_char(name2codepoint[name])
                elif name.lower() in name2codepoint:
                    # Lenient fallback for input such as '&AMP;'.
                    char = to_char(name2codepoint[name.lower()])
                elif name[:1] == '#' and name[1:].isdigit():
                    # Decimal character number.
                    char = to_char(int(name[1:]))
                elif (name[:1] == '#' and name[1:2].lower() == 'x'
                        and IsHex(name[2:])):
                    # Hexadecimal character number.
                    char = to_char(int(name[2:], 16))
            except (ValueError, OverflowError):
                # Number outside the range of valid code points (or digits
                # int() cannot parse): treat it as an invalid reference.
                char = None

            if char is None:
                # Not a valid reference: put the text back as it was.
                piece = '&' + segment
            else:
                piece = char + ';'.join(LSplit[1:])
                if record:
                    LReEscape.append((pos, char))

        LRtn.append(piece)
        pos += len(piece)

    return ''.join(LRtn)
def ReEscape(LReEscape, S, EscFn):
    """Turn the output of UnEscape() back into HTML.

    Each recorded character is written as a reference again (named when
    ``codepoint2name`` knows it, numeric otherwise) so that e.g. ``&nbsp;``
    is not stripped at browser level; the text between the recorded
    characters is passed through ``EscFn``.

    LReEscape -- ``(position, character)`` tuples as collected by UnEscape().
    S         -- the string UnEscape() returned.
    EscFn     -- callable applied to every stretch of text between the
                 recorded characters.
    """
    pieces = []
    cursor = 0
    for start, char in LReEscape:
        if start != cursor:
            pieces.append(EscFn(S[cursor:start]))
        code = ord(char)
        entity = codepoint2name.get(code)
        if entity is None:
            pieces.append('&#%s;' % code)
        else:
            pieces.append('&%s;' % entity)
        cursor = start + len(char)
    pieces.append(EscFn(S[cursor:]))
    return ''.join(pieces)
def escape(value):
    """Return ``value`` with ``&``, ``>`` and ``<`` replaced by the
    references ``&amp;``, ``&gt;`` and ``&lt;``."""
    # '&' has to go first; otherwise the ampersands introduced by the other
    # two replacements would themselves be escaped.
    value = value.replace('&', '&amp;')
    value = value.replace('>', '&gt;')
    value = value.replace('<', '&lt;')
    return value
def strip_tags(value):
    """Strip HTML tags from ``value`` and validate its character references.

    Every ``<...>`` span is deleted, then the text is run through
    UnEscape() and ReEscape() with escape() as the escaping function. Both
    intermediate results are printed. Returns the final string.
    """
    # Strip HTML tags
    value = re.sub(r'<[^>]*?>', '', value)
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('No Tags: %s' % value)

    # Validate & references
    LReEscape = []
    value = UnEscape(value, LReEscape)
    value = ReEscape(LReEscape, value, EscFn=escape)
    print('References Validated: %s' % value)
    return value
if __name__ == '__main__':
    # Demo run on a sample holding tags, stray angle brackets and ampersands;
    # strip_tags() prints both intermediate results itself.
    # Intended output, assuming escape() emits &amp;/&gt;/&lt; references:
    #   No Tags: this is test  bold text  normal text >< &blah & &
    #   References Validated: this is test  bold text  normal text &gt;&lt; &amp;blah &amp; &amp;
    strip_tags('this is test <b> bold text </b> normal text >< &blah & &')
Use the webob.exc module:
from webob.exc import strip_tags
And then use it:
# Prints what webob's strip_tags returns for a string containing one <br/> tag.
print strip_tags('a<br/>b')
>> ab
精彩评论