开发者

excel formula to strip html

开发者 https://www.devze.com 2023-01-05 04:31 出处:网络
I want to strip all the html away from the left and right开发者_JS百科 of the textual value: I have this...

I want to strip all the html away from the left and right开发者_JS百科 of the textual value:

I have this...

<option value="41">GECommonUI</option>

I want to get this...

GECommonUI


in excel select all
open replace (ctrl+f)
replace <*>
with nothing

or use perl regex
$line = "This is some text with HTML and words";
$line =~ s/<(.*?)>//gi;


Set a reference to MS Forms 2.0 to use the DataObject object.

Public Function StripHTML(sInput As String) As String

    Dim rTemp As Range
    Dim oData As DataObject

    Set oData = New DataObject
    oData.SetText "<html><style>br{mso-data-placement:same-cell;}</style>" & sInput & "</html>"
    oData.PutInClipboard

    Set rTemp = Workbooks.Add.Worksheets(1).Range("a1")
    rTemp.Parent.PasteSpecial "Unicode Text"
    StripHTML = rTemp.Text

    rTemp.Parent.Parent.Close False
    Set rTemp = Nothing
    Set oData = Nothing

End Function

See http://www.dailydoseofexcel.com/archives/2005/02/23/html-in-cells-ii/ for more info.


You can download an Excel file with a working user defined function =stripHTML() on the page below. The file has a working example and notes. This will solve your problem. Please note, you will have to allow macros on this Excel document for the function to work.

http://jfrancisconsulting.com/how-to-strip-html-tags-in-excel/

How is my function different?

Most functions I found did a good job of doing a global find and replace to delete all HTML tags that are enclosed in brackets <>.

However this deletes certain line breaks which causes the end result to look a smashed blob of text. I modified an existing function to add back line spaces so the end result keeps appropriate line breaks.

Secondly, the global HTML replaces attributes in brackets only <> All HTML special character notations will remain. Ex. instead of an ampersand &, you'll be left with the HTML version, &. To solve this problem, I inserted 87 replace lines to replace all HTML notations with the characters.

Function StripHTML(cell As Range) As String

Dim RegEx As Object
Set RegEx = CreateObject(“vbscript.regexp”)
Dim sInput As String
Dim sOut As String
sInput = cell.Text

sInput = Replace(sInput, “\x0D\x0A”, Chr(10))
sInput = Replace(sInput, “\x00″, Chr(10))

‘replace HTML breaks and end of paragraphs with line breaks
sInput = Replace(sInput, “</P>”, Chr(10) & Chr(10))
sInput = Replace(sInput, “<BR>”, Chr(10))

‘replace bullets with dashes
sInput = Replace(sInput, “<li>”, “-”)

‘add back all of the special characters
sInput = Replace(sInput, “&ndash;”, “–”)
sInput = Replace(sInput, “&mdash;”, “—”)
sInput = Replace(sInput, “&iexcl;”, “¡”)
sInput = Replace(sInput, “&iquest;”, “¿”)
sInput = Replace(sInput, “&quot;”, “”)
sInput = Replace(sInput, “&ldquo;”, ““”)
sInput = Replace(sInput, “&rdquo;”, “””)
sInput = Replace(sInput, “”, “‘”)
sInput = Replace(sInput, “&lsquo;”, “‘”)
sInput = Replace(sInput, “&rsquo;”, “’”)
sInput = Replace(sInput, “&laquo;”, “«”)
sInput = Replace(sInput, “&raquo;”, “»”)
sInput = Replace(sInput, “&nbsp;”, ” “)
sInput = Replace(sInput, “&amp;”, “&”)
sInput = Replace(sInput, “&cent;”, “¢”)
sInput = Replace(sInput, “&copy;”, “©”)
sInput = Replace(sInput, “&divide;”, “÷”)
sInput = Replace(sInput, “&gt;”, “>”)
sInput = Replace(sInput, “&lt;”, “<”)
sInput = Replace(sInput, “&micro;”, “µ”)
sInput = Replace(sInput, “&middot;”, “·”)
sInput = Replace(sInput, “&para;”, “¶”)
sInput = Replace(sInput, “&plusmn;”, “±”)
sInput = Replace(sInput, “&euro;”, “€”)
sInput = Replace(sInput, “&pound;”, “£”)
sInput = Replace(sInput, “&reg;”, “®”)
sInput = Replace(sInput, “&sect;”, “§”)
sInput = Replace(sInput, “&trade;”, “™”)
sInput = Replace(sInput, “&yen;”, “¥”)
sInput = Replace(sInput, “&aacute;”, “á”)
sInput = Replace(sInput, “&Aacute;”, “Á”)
sInput = Replace(sInput, “&agrave;”, “à”)
sInput = Replace(sInput, “&Agrave;”, “À”)
sInput = Replace(sInput, “&acirc;”, “â”)
sInput = Replace(sInput, “&Acirc;”, “”)
sInput = Replace(sInput, “&aring;”, “å”)
sInput = Replace(sInput, “&Aring;”, “Å”)
sInput = Replace(sInput, “&atilde;”, “ã”)
sInput = Replace(sInput, “&Atilde;”, “Ô)
sInput = Replace(sInput, “&auml;”, “ä”)
sInput = Replace(sInput, “&Auml;”, “Ä”)
sInput = Replace(sInput, “&aelig;”, “æ”)
sInput = Replace(sInput, “&AElig;”, “Æ”)
sInput = Replace(sInput, “&ccedil;”, “ç”)
sInput = Replace(sInput, “&Ccedil;”, “Ç”)
sInput = Replace(sInput, “&eacute;”, “é”)
sInput = Replace(sInput, “&Eacute;”, “É”)
sInput = Replace(sInput, “&egrave;”, “è”)
sInput = Replace(sInput, “&Egrave;”, “È”)
sInput = Replace(sInput, “&ecirc;”, “ê”)
sInput = Replace(sInput, “&Ecirc;”, “Ê”)
sInput = Replace(sInput, “&euml;”, “ë”)
sInput = Replace(sInput, “&Euml;”, “Ë”)
sInput = Replace(sInput, “&iacute;”, “í”)
sInput = Replace(sInput, “&Iacute;”, “Í”)
sInput = Replace(sInput, “&igrave;”, “ì”)
sInput = Replace(sInput, “&Igrave;”, “Ì”)
sInput = Replace(sInput, “&icirc;”, “î”)
sInput = Replace(sInput, “&Icirc;”, “Δ)
sInput = Replace(sInput, “&iuml;”, “ï”)
sInput = Replace(sInput, “&Iuml;”, “Ï”)
sInput = Replace(sInput, “&ntilde;”, “ñ”)
sInput = Replace(sInput, “&Ntilde;”, “Ñ”)
sInput = Replace(sInput, “&oacute;”, “ó”)
sInput = Replace(sInput, “&Oacute;”, “Ó”)
sInput = Replace(sInput, “&ograve;”, “ò”)
sInput = Replace(sInput, “&Ograve;”, “Ò”)
sInput = Replace(sInput, “&ocirc;”, “ô”)
sInput = Replace(sInput, “&Ocirc;”, “Ô”)
sInput = Replace(sInput, “&oslash;”, “ø”)
sInput = Replace(sInput, “&Oslash;”, “Ø”)
sInput = Replace(sInput, “&otilde;”, “õ”)
sInput = Replace(sInput, “&Otilde;”, “Õ”)
sInput = Replace(sInput, “&ouml;”, “ö”)
sInput = Replace(sInput, “&Ouml;”, “Ö”)
sInput = Replace(sInput, “&szlig;”, “ß”)
sInput = Replace(sInput, “&uacute;”, “ú”)
sInput = Replace(sInput, “&Uacute;”, “Ú”)
sInput = Replace(sInput, “&ugrave;”, “ù”)
sInput = Replace(sInput, “&Ugrave;”, “Ù”)
sInput = Replace(sInput, “&ucirc;”, “û”)
sInput = Replace(sInput, “&Ucirc;”, “Û”)
sInput = Replace(sInput, “&uuml;”, “ü”)
sInput = Replace(sInput, “&Uuml;”, “Ü”)
sInput = Replace(sInput, “&yuml;”, “ÿ”)
sInput = Replace(sInput, “”, “´”)
sInput = Replace(sInput, “”, “`”)

‘replace all the remaining HTML Tags
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = “<[^>]+>” ‘Regular Expression for HTML Tags.

End With
sOut = RegEx.Replace(sInput, “”)
StripHTML = sOut
Set RegEx = Nothing
End Function


You can try VBScripts Regular Expression Support.

http://www.regular-expressions.info/vbscriptexample.html


guys this is a simple set of string functions. as i am not up to speed on excel string functions this is a pain for me to figure out...

in a programing language this would go somethign like mid(start, length) to get the value from the full html string- the trick is to get the length as the start possition minus the lenth of th ending html lenght)

mid( pos(">")+2, len(string)-pos(">") )

0

精彩评论

暂无评论...
验证码 换一张
取 消