开发者

Element to string in HTMLDocument

开发者 https://www.devze.com 2022-12-31 10:25 出处:网络
I have an Element object that belongs to an HTMLDocument, and I want to get the string value of this element. I want this result:

I have an Element object that belongs to an HTMLDocument, and I want to get the string value of this element.

I want this result:

Christina Toth, Pharm. D.

=======================

Please see the code below.

/**
 * Parses the sample HTML into an {@link HTMLDocument} and prints the text of
 * every DIV, H2 and H3 element in the form {@code tag: text}.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the markup cannot be read, parsed, or extracted
 */
public static void main(String args[]) throws Exception {

    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

    // getInputStream() encodes the markup as UTF-8, so decode it the same way
    // instead of relying on the platform default charset.
    BufferedReader br = new BufferedReader(
            new InputStreamReader(Nullsoft.getInputStream(), "UTF-8"));
    try {
        HTMLEditorKit.Parser parser = new ParserDelegator();
        HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
        parser.parse(br, callback, true);
        // The document reader batches what the parser reports; flush() pushes
        // the pending batch into the document. Without this call a short page
        // never reaches htmlDoc and the iteration below finds nothing.
        callback.flush();
    } finally {
        br.close();
    }

    // Walk every element and dump the text of the block-level tags of interest.
    ElementIterator iterator = new ElementIterator(htmlDoc);
    Element element;
    while ((element = iterator.next()) != null) {
        Object name = element.getAttributes().getAttribute(StyleConstants.NameAttribute);
        if (name == HTML.Tag.DIV || name == HTML.Tag.H2 || name == HTML.Tag.H3) {
            StringBuilder text = new StringBuilder();
            int count = element.getElementCount();
            for (int i = 0; i < count; i++) {
                Element child = element.getElement(i);
                int startOffset = child.getStartOffset();
                int length = child.getEndOffset() - startOffset;
                text.append(htmlDoc.getText(startOffset, length));
            }
            System.out.println(name + ": " + text);
        }
    }
    System.exit(0);
}

/**
 * Returns the sample HTML page as a stream of UTF-8 encoded bytes.
 *
 * @return a stream over the UTF-8 bytes of the sample markup; never {@code null}
 * @throws IllegalStateException if the JVM does not provide the UTF-8 charset
 */
public static InputStream getInputStream() {

    String text = "<html>\n" +
            "<head>\n" +
            "<title>pg_0001</title>\n" +
            "\n" +
            "<style type=\"text/css\">\n" +
            ".ft3{font-style:normal;font-weight:bold;font-size:11px;font-family:Helvetica;color:#000000;}\n" +
            "</style>\n" +
            "</head>\n" +
            "<body vlink=\"#FFFFFF\" link=\"#FFFFFF\" bgcolor=\"#ffffff\">\n" +
            "\n" +
            "\n" +
            "<div style=\"position:absolute;top:597;left:252\"><nobr><span class=\"ft3\">Christina Toth, Pharm. D.</span></nobr></div>\n" +
            "\n" +
            "\n" +
            "</body>\n" +
            "</html>";
    try {
        return new ByteArrayInputStream(text.getBytes("UTF-8"));
    } catch (UnsupportedEncodingException e) {
        // Fail loudly with the cause attached rather than handing the caller a
        // null stream that would only surface later as a NullPointerException.
        throw new IllegalStateException("UTF-8 charset is not available", e);
    }
}


Try this instead.

Edited to use the read() method of HTMLEditorKit.

import java.io.StringReader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Loads a fixed HTML snippet into an {@link HTMLDocument} and prints the text
 * held by each DIV element.
 */
public class NewMain {

    /** Sample page whose single DIV wraps the text to extract. */
    private static final String TEXT
        = "<html>\n"
        + "<head>\n"
        + "<title>pg_0001</title>\n"
        + "\n"
        + "<style type=\"text/css\">\n"
        + ".ft3{font-style:normal;font-weight:bold;font-size:11px;"
        + "font-family:Helvetica;color:#000000;}\n"
        + "</style>\n"
        + "</head>\n"
        + "<body vlink=\"#FFFFFF\" link=\"#FFFFFF\" bgcolor=\"#ffffff\">\n"
        + "\n"
        + "\n"
        + "<div style=\"position:absolute;top:597;left:252\"><nobr><span "
        + "class=\"ft3\">Christina Toth, Pharm. D.</span></nobr></div>\n"
        + "\n"
        + "\n"
        + "</body>\n"
        + "</html>";

    /**
     * Reads the sample page through the editor kit and prints one
     * {@code div: text} line per DIV element found.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the markup cannot be read or its text extracted
     */
    public static void main(String args[]) throws Exception {
        HTMLEditorKit htmlKit = new HTMLEditorKit();
        HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
        htmlKit.read(new StringReader(TEXT), htmlDoc, 0);

        ElementIterator iterator = new ElementIterator(htmlDoc);
        Element element;
        while ((element = iterator.next()) != null) {
            AttributeSet as = element.getAttributes();
            Object name = as.getAttribute(StyleConstants.NameAttribute);
            if (name == HTML.Tag.DIV) {
                // Single-threaded accumulation, so the unsynchronized builder suffices.
                StringBuilder sb = new StringBuilder();
                sb.append(name).append(": ");
                int count = element.getElementCount();
                for (int i = 0; i < count; i++) {
                    Element child = element.getElement(i);
                    int startOffset = child.getStartOffset();
                    int length = child.getEndOffset() - startOffset;
                    sb.append(htmlDoc.getText(startOffset, length));
                }
                System.out.println(sb);
            }
        }
    }
}

Console:

div: Christina Toth, Pharm. D.
0

精彩评论

暂无评论...
验证码 换一张
取 消