I want to strip all the html away from the left and right开发者_JS百科 of the textual value:
I have this...
<option value="41">GECommonUI</option>
I want to get this...
GECommonUI
in excel
select all
open replace (ctrl+f)
replace <*>
with nothing
or use perl regex
$line = "This is some text with HTML and words";
$line =~ s/<(.*?)>//gi;
Set a reference to MS Forms 2.0 to use the DataObject object.
Public Function StripHTML(sInput As String) As String
Dim rTemp As Range
Dim oData As DataObject
Set oData = New DataObject
oData.SetText "<html><style>br{mso-data-placement:same-cell;}</style>" & sInput & "</html>"
oData.PutInClipboard
Set rTemp = Workbooks.Add.Worksheets(1).Range("a1")
rTemp.Parent.PasteSpecial "Unicode Text"
StripHTML = rTemp.Text
rTemp.Parent.Parent.Close False
Set rTemp = Nothing
Set oData = Nothing
End Function
See http://www.dailydoseofexcel.com/archives/2005/02/23/html-in-cells-ii/ for more info.
You can download an Excel file with a working user defined function =stripHTML() on the page below. The file has a working example and notes. This will solve your problem. Please note, you will have to allow macros on this Excel document for the function to work.
http://jfrancisconsulting.com/how-to-strip-html-tags-in-excel/
How is my function different?
Most functions I found did a good job of doing a global find and replace to delete all HTML tags that are enclosed in brackets <>.
However this deletes certain line breaks which causes the end result to look a smashed blob of text. I modified an existing function to add back line spaces so the end result keeps appropriate line breaks.
Secondly, the global HTML replaces attributes in brackets only <> All HTML special character notations will remain. Ex. instead of an ampersand &, you'll be left with the HTML version, &. To solve this problem, I inserted 87 replace lines to replace all HTML notations with the characters.
Function StripHTML(cell As Range) As String
Dim RegEx As Object
Set RegEx = CreateObject(“vbscript.regexp”)
Dim sInput As String
Dim sOut As String
sInput = cell.Text
sInput = Replace(sInput, “\x0D\x0A”, Chr(10))
sInput = Replace(sInput, “\x00″, Chr(10))
‘replace HTML breaks and end of paragraphs with line breaks
sInput = Replace(sInput, “</P>”, Chr(10) & Chr(10))
sInput = Replace(sInput, “<BR>”, Chr(10))
‘replace bullets with dashes
sInput = Replace(sInput, “<li>”, “-”)
‘add back all of the special characters
sInput = Replace(sInput, “–”, “–”)
sInput = Replace(sInput, “—”, “—”)
sInput = Replace(sInput, “¡”, “¡”)
sInput = Replace(sInput, “¿”, “¿”)
sInput = Replace(sInput, “"”, “”)
sInput = Replace(sInput, ““”, ““”)
sInput = Replace(sInput, “””, “””)
sInput = Replace(sInput, “”, “‘”)
sInput = Replace(sInput, “‘”, “‘”)
sInput = Replace(sInput, “’”, “’”)
sInput = Replace(sInput, “«”, “«”)
sInput = Replace(sInput, “»”, “»”)
sInput = Replace(sInput, “ ”, ” “)
sInput = Replace(sInput, “&”, “&”)
sInput = Replace(sInput, “¢”, “¢”)
sInput = Replace(sInput, “©”, “©”)
sInput = Replace(sInput, “÷”, “÷”)
sInput = Replace(sInput, “>”, “>”)
sInput = Replace(sInput, “<”, “<”)
sInput = Replace(sInput, “µ”, “µ”)
sInput = Replace(sInput, “·”, “·”)
sInput = Replace(sInput, “¶”, “¶”)
sInput = Replace(sInput, “±”, “±”)
sInput = Replace(sInput, “€”, “€”)
sInput = Replace(sInput, “£”, “£”)
sInput = Replace(sInput, “®”, “®”)
sInput = Replace(sInput, “§”, “§”)
sInput = Replace(sInput, “™”, “™”)
sInput = Replace(sInput, “¥”, “¥”)
sInput = Replace(sInput, “á”, “á”)
sInput = Replace(sInput, “Á”, “Á”)
sInput = Replace(sInput, “à”, “à”)
sInput = Replace(sInput, “À”, “À”)
sInput = Replace(sInput, “â”, “â”)
sInput = Replace(sInput, “”, “”)
sInput = Replace(sInput, “å”, “å”)
sInput = Replace(sInput, “Å”, “Å”)
sInput = Replace(sInput, “ã”, “ã”)
sInput = Replace(sInput, “Ô, “Ô)
sInput = Replace(sInput, “ä”, “ä”)
sInput = Replace(sInput, “Ä”, “Ä”)
sInput = Replace(sInput, “æ”, “æ”)
sInput = Replace(sInput, “Æ”, “Æ”)
sInput = Replace(sInput, “ç”, “ç”)
sInput = Replace(sInput, “Ç”, “Ç”)
sInput = Replace(sInput, “é”, “é”)
sInput = Replace(sInput, “É”, “É”)
sInput = Replace(sInput, “è”, “è”)
sInput = Replace(sInput, “È”, “È”)
sInput = Replace(sInput, “ê”, “ê”)
sInput = Replace(sInput, “Ê”, “Ê”)
sInput = Replace(sInput, “ë”, “ë”)
sInput = Replace(sInput, “Ë”, “Ë”)
sInput = Replace(sInput, “í”, “í”)
sInput = Replace(sInput, “Í”, “Í”)
sInput = Replace(sInput, “ì”, “ì”)
sInput = Replace(sInput, “Ì”, “Ì”)
sInput = Replace(sInput, “î”, “î”)
sInput = Replace(sInput, “Δ, “Δ)
sInput = Replace(sInput, “ï”, “ï”)
sInput = Replace(sInput, “Ï”, “Ï”)
sInput = Replace(sInput, “ñ”, “ñ”)
sInput = Replace(sInput, “Ñ”, “Ñ”)
sInput = Replace(sInput, “ó”, “ó”)
sInput = Replace(sInput, “Ó”, “Ó”)
sInput = Replace(sInput, “ò”, “ò”)
sInput = Replace(sInput, “Ò”, “Ò”)
sInput = Replace(sInput, “ô”, “ô”)
sInput = Replace(sInput, “Ô”, “Ô”)
sInput = Replace(sInput, “ø”, “ø”)
sInput = Replace(sInput, “Ø”, “Ø”)
sInput = Replace(sInput, “õ”, “õ”)
sInput = Replace(sInput, “Õ”, “Õ”)
sInput = Replace(sInput, “ö”, “ö”)
sInput = Replace(sInput, “Ö”, “Ö”)
sInput = Replace(sInput, “ß”, “ß”)
sInput = Replace(sInput, “ú”, “ú”)
sInput = Replace(sInput, “Ú”, “Ú”)
sInput = Replace(sInput, “ù”, “ù”)
sInput = Replace(sInput, “Ù”, “Ù”)
sInput = Replace(sInput, “û”, “û”)
sInput = Replace(sInput, “Û”, “Û”)
sInput = Replace(sInput, “ü”, “ü”)
sInput = Replace(sInput, “Ü”, “Ü”)
sInput = Replace(sInput, “ÿ”, “ÿ”)
sInput = Replace(sInput, “”, “´”)
sInput = Replace(sInput, “”, “`”)
‘replace all the remaining HTML Tags
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = “<[^>]+>” ‘Regular Expression for HTML Tags.
End With
sOut = RegEx.Replace(sInput, “”)
StripHTML = sOut
Set RegEx = Nothing
End Function
You can try VBScripts Regular Expression Support.
http://www.regular-expressions.info/vbscriptexample.html
guys this is a simple set of string functions. as i am not up to speed on excel string functions this is a pain for me to figure out...
in a programing language this would go somethign like mid(start, length) to get the value from the full html string- the trick is to get the length as the start possition minus the lenth of th ending html lenght)
mid( pos(">")+2, len(string)-pos(">") )
精彩评论