开发者

IHTMLDocument2->documentElement->outerHTML is too slow recreating HTML from DOM, is there a faster way?

开发者 https://www.devze.com 2022-12-10 02:37 出处:网络
I've got an IE BHO plugin that sends out, via a COM call, the HTML of a page that was loaded in the window.

I've got an IE BHO plugin that sends out, via a COM call, the HTML of a page that was loaded in the window.

// Returns the HTML of the currently loaded page by serializing the DOM
// through documentElement->outerHTML.
//
// lMaxSize  - not used by this implementation.
// pbstrHTML - receives a newly allocated BSTR owned by the caller; set to
//             NULL on failure.
//
// Returns S_OK on success, E_POINTER for a null out-parameter, or the failing
// HRESULT from the browser / MSHTML calls.
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
    if (pbstrHTML == NULL)
        return E_POINTER;
    *pbstrHTML = NULL;

    try
    {
        CComPtr<IDispatch> pDispatch;
        HRESULT hr = m_spWebBrowser->get_Document(&pDispatch);
        if (FAILED(hr))
            return hr;
        if (!pDispatch)
            return E_FAIL;  // the call succeeded but there is no document object

        MSHTML::IHTMLDocument3Ptr pDocument3;
        hr = pDispatch->QueryInterface(IID_IHTMLDocument3, reinterpret_cast<void**>(&pDocument3));
        if (FAILED(hr))
            return hr;

        MSHTML::IHTMLElementPtr pRoot = pDocument3->documentElement;
        if (pRoot == NULL)
            return E_FAIL;

        // Hand the BSTR produced by MSHTML straight to the caller instead of
        // copying it through an intermediate wstring and CComBSTR.
        _bstr_t html = pRoot->outerHTML;
        *pbstrHTML = html.Detach();
        return S_OK;
    }
    catch (const _com_error& e)
    {
        // The #import smart-pointer wrappers report failures by throwing;
        // exceptions must not escape a COM method.
        return e.Error();
    }
}

However when it encounters a very large page (e.g. "http://sitemap.zillow.com/uncompressed/ForSale_Hood_MedPri_1.xml"), it takes 3 minutes to create the HTML from the DOM.

Is there a way to access the raw HTML/XML?

When you do a 'view page source' in IE, it pops up almost immediately, so internally IE must be using some API that can do what I want.

Thanks,

Shane.


It seems that in old versions of MSHTML, outerHTML had O(n^2) performance. However, in newer versions (IE8) this problem is gone. If you have a choice, use IE8 or later.

Otherwise, using IPersistStream::Save is an option. But CreateStreamOnHGlobal won't help you since its implementation is also O(n^2). You'll have to use a custom IStream for that.

Included is an IStream implementation which was made for this purpose and supports quick writes:

#include <atlbase.h>
#include <atlcom.h>
#include <vector>

// An implementation of a write-only IStream.
// Needed because the CreateStreamOnHGlobal implementation doesn't handle
// resizes well (N writes seem to take O(N^2) time).
//
// Only Write is overridden here; everything written to the stream is
// appended to `buf`.
// NOTE(review): the remaining IStream methods presumably come from ATL's
// IStreamImpl base - confirm what its defaults return.
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStreamImpl 
{
public: 

    // Accumulates every byte passed to Write, in call order.
    std::vector<char> buf;

// IStream is the only interface exposed through QueryInterface.
BEGIN_COM_MAP(MyStream)
    COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()

    // Appends cb bytes starting at pv to buf (defined out of line).
    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);
};
/*

Usage:

    CComPtr<IStream> stream;
    hr = MyStream::CreateInstance(&stream);
    // streamObj will be valid as long as IStream smart pointer lives
    MyStream *streamObj = (MyStream*)stream.p;
 */


// Appends cb bytes starting at pv to the end of buf.
//
// pv         - source bytes; may be NULL only when cb is 0.
// cb         - number of bytes to append.
// pcbWritten - optional; receives the number of bytes actually appended.
//
// Returns S_OK on success, STG_E_INVALIDPOINTER for a null source with a
// non-zero count, or E_OUTOFMEMORY if the buffer could not grow.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten) 
{
    // Callers may pass NULL for pcbWritten; when they do not, it must always
    // be given a defined value.
    if (pcbWritten != NULL)
        *pcbWritten = 0;

    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *first = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), first, first + cb);
    }
    catch (...)
    {
        // A C++ exception must not cross the COM boundary.
        return E_OUTOFMEMORY;
    }

    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}


Yes, you can QI for IPersistStream and save to a memory stream created by CreateStreamOnHGlobal. Note that the document must have finished downloading (the ready state needs to be complete).


Thanks Amnon, the following code is mostly working for me.

// An implementation of a write-only, append-only IStream.
// Needed because the CreateStreamOnHGlobal implementation doesn't handle
// resizes well (N writes seem to take O(N^2) time).
//
// Everything passed to Write is appended to `buf`. Operations that make no
// sense for a write-only sink fail explicitly, and every out-parameter is
// given a defined value, so a caller never reads uninitialized data after a
// "successful" call.
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStream
{
public:

    // Accumulates every byte passed to Write, in call order.
    std::vector<char> buf;

BEGIN_COM_MAP(MyStream)
    COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()

    // Appends cb bytes starting at pv to buf (defined out of line).
    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);

    // Reading is not supported: report zero bytes read and fail.
    STDMETHOD(Read) (void *pv, ULONG cb, ULONG *pcbRead)
    {
        if (pcbRead != NULL)
            *pcbRead = 0;
        return E_NOTIMPL;
    }

    // The write position is always the end of the buffer. The request is
    // accepted (as before) but the position does not move; the current end
    // offset is reported when the caller asks for it.
    STDMETHOD(Seek) (LARGE_INTEGER dlibMove,DWORD dwOrigin,ULARGE_INTEGER *plibNewPosition)
    {
        if (plibNewPosition != NULL)
            plibNewPosition->QuadPart = buf.size();
        return S_OK;
    }

    // Size hints are accepted and ignored; the vector grows on demand.
    STDMETHOD(SetSize) (ULARGE_INTEGER libNewSize) { return S_OK; }

    // Copying out requires reading, which is not supported.
    STDMETHOD(CopyTo) (IStream *pstm,ULARGE_INTEGER cb,ULARGE_INTEGER *pcbRead,ULARGE_INTEGER *pcbWritten)
    {
        if (pcbRead != NULL)
            pcbRead->QuadPart = 0;
        if (pcbWritten != NULL)
            pcbWritten->QuadPart = 0;
        return E_NOTIMPL;
    }

    // Writes land in buf immediately, so there is nothing to flush or undo.
    STDMETHOD(Commit) (DWORD grfCommitFlags) { return S_OK; }
    STDMETHOD(Revert) () { return S_OK; }

    // Region locking is not supported.
    STDMETHOD(LockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return STG_E_INVALIDFUNCTION; }
    STDMETHOD(UnlockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return STG_E_INVALIDFUNCTION; }

    // Reports a nameless stream whose size is the number of bytes written so
    // far; all other fields are zero.
    STDMETHOD(Stat) (__RPC__out STATSTG *pstatstg,DWORD grfStatFlag)
    {
        if (pstatstg == NULL)
            return STG_E_INVALIDPOINTER;
        *pstatstg = STATSTG();
        pstatstg->type = STGTY_STREAM;
        pstatstg->cbSize.QuadPart = buf.size();
        return S_OK;
    }

    // Cloning is not supported; make sure the caller never sees a garbage pointer.
    STDMETHOD(Clone) (__RPC__deref_out_opt IStream **ppstm)
    {
        if (ppstm != NULL)
            *ppstm = NULL;
        return E_NOTIMPL;
    }
};

// Appends cb bytes starting at pv to the end of buf.
//
// pv         - source bytes; may be NULL only when cb is 0.
// cb         - number of bytes to append.
// pcbWritten - optional; receives the number of bytes actually appended.
//
// Returns S_OK on success, STG_E_INVALIDPOINTER for a null source with a
// non-zero count, or E_OUTOFMEMORY if the buffer could not grow.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten) 
{
    // Callers may pass NULL for pcbWritten; when they do not, it must always
    // be given a defined value.
    if (pcbWritten != NULL)
        *pcbWritten = 0;

    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *first = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), first, first + cb);
    }
    catch (...)
    {
        // A C++ exception must not cross the COM boundary.
        return E_OUTOFMEMORY;
    }

    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}

// Retrieves the HTML of the current page.
//
// The document is asked to persist itself (IPersistStreamInit::Save) into a
// MyStream, which collects the bytes in a std::vector. This avoids both the
// slow outerHTML path and the slow growth of a CreateStreamOnHGlobal stream.
//
// lMaxSize  - not used by this implementation.
// pbstrHTML - receives a newly allocated BSTR owned by the caller; set to
//             NULL on failure.
//
// Returns S_OK on success, E_POINTER for a null out-parameter, E_NOINTERFACE
// if the document cannot persist to a stream, E_OUTOFMEMORY if the result
// cannot be allocated, or the failing HRESULT of an underlying call.
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
    if (pbstrHTML == NULL)
        return E_POINTER;
    *pbstrHTML = NULL;

    HRESULT hr = S_OK;
    try
    {
        CComPtr<IStream> mystream;
        hr = MyStream::CreateInstance(&mystream);
        if (FAILED(hr))
            return hr;
        // streamObj will be valid as long as the IStream smart pointer lives
        MyStream *streamObj = static_cast<MyStream*>(mystream.p);

        CComPtr<IDispatch> pDispatch;
        hr = m_spWebBrowser->get_Document(&pDispatch);
        if (FAILED(hr))
            return hr;
        if (!pDispatch)
            return E_FAIL;  // the call succeeded but there is no document object

        CComQIPtr<IPersistStreamInit> persistStream(pDispatch);
        if (!persistStream)
            return E_NOINTERFACE;

        hr = persistStream->Save(mystream, FALSE);
        if (FAILED(hr))
            return hr;

        const std::vector<char>& bytes = streamObj->buf;
        const size_t kMaxBytes = 0x7FFFFFFF;  // the Win32 calls below take int/UINT lengths
        if (bytes.size() > kMaxBytes)
            return E_OUTOFMEMORY;

        const unsigned char *raw = reinterpret_cast<const unsigned char*>(bytes.data());
        const size_t cbTotal = bytes.size();

        // The stream holds the page in whatever encoding the document was
        // saved in, so the bytes may be single-byte or UTF-16. A UTF-16LE
        // byte-order mark identifies the wide case.
        if (cbTotal >= 2 && raw[0] == 0xFF && raw[1] == 0xFE)
        {
            const UINT cch = static_cast<UINT>((cbTotal - 2) / sizeof(wchar_t));
            *pbstrHTML = ::SysAllocStringLen(reinterpret_cast<const OLECHAR*>(raw + 2), cch);
            return *pbstrHTML != NULL ? S_OK : E_OUTOFMEMORY;
        }

        // NOTE(review): data without a UTF-16 BOM is assumed to be UTF-8.
        // Pages in another code page need the document's charset
        // (e.g. IHTMLDocument2::get_charset) to pick the code page - confirm.
        size_t offset = 0;
        if (cbTotal >= 3 && raw[0] == 0xEF && raw[1] == 0xBB && raw[2] == 0xBF)
            offset = 3;  // skip the UTF-8 byte-order mark

        const int cbText = static_cast<int>(cbTotal - offset);
        if (cbText == 0)
        {
            *pbstrHTML = ::SysAllocStringLen(NULL, 0);
            return *pbstrHTML != NULL ? S_OK : E_OUTOFMEMORY;
        }

        const char *text = bytes.data() + offset;
        const int cch = ::MultiByteToWideChar(CP_UTF8, 0, text, cbText, NULL, 0);
        if (cch <= 0)
            return HRESULT_FROM_WIN32(::GetLastError());

        // Convert straight into the BSTR to avoid another full-size copy.
        BSTR converted = ::SysAllocStringLen(NULL, static_cast<UINT>(cch));
        if (converted == NULL)
            return E_OUTOFMEMORY;
        if (::MultiByteToWideChar(CP_UTF8, 0, text, cbText, converted, cch) != cch)
        {
            hr = HRESULT_FROM_WIN32(::GetLastError());
            ::SysFreeString(converted);
            return FAILED(hr) ? hr : E_FAIL;
        }

        *pbstrHTML = converted;
        return S_OK;
    }
    catch(...)
    {
        TRACE_FN("Got exception somewhere");
        // Never report success after an exception.
        hr = E_UNEXPECTED;
    }
    return hr;
}

Now the only problem left is figuring out why it returns single-byte chars most of the time and double-byte chars at other times. Any ideas?

Thanks for the help.

0

精彩评论

暂无评论...
验证码 换一张
取 消