I've got an IE BHO plugin that sends out, via a COM call, the HTML of a page that was loaded in the window.
// Retrieves the HTML of the current page by serialising the outerHTML of the
// document's root element.
//
// lMaxSize   accepted for interface compatibility; it is not applied here.
// pbstrHTML  receives a newly allocated BSTR holding the markup. It is set to
//            NULL on failure.
//
// Returns S_OK on success, E_POINTER if pbstrHTML is NULL, the failing HRESULT
// from the browser/document calls, or E_FAIL if a smart-pointer wrapper throws.
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
    if (pbstrHTML == NULL)
        return E_POINTER;
    *pbstrHTML = NULL;

    try
    {
        CComPtr<IDispatch> pDispatch;
        HRESULT hr = m_spWebBrowser->get_Document(&pDispatch);
        if (FAILED(hr))
            return hr;
        if (!pDispatch)
            return E_FAIL; // the call succeeded but produced no document object

        MSHTML::IHTMLDocument3Ptr pDocument3 = NULL;
        hr = pDispatch->QueryInterface(IID_IHTMLDocument3, (void**)&pDocument3);
        if (FAILED(hr))
            return hr;

        MSHTML::IHTMLElementPtr pRoot = pDocument3->documentElement;

        // Keep the markup in a _bstr_t rather than building a std::wstring from
        // it: an empty _bstr_t converts to a NULL pointer, which must not be
        // handed to a std::wstring constructor.
        _bstr_t html = pRoot->outerHTML;
        CComBSTR bstrHTML(static_cast<const wchar_t*>(html));
        return bstrHTML.CopyTo(pbstrHTML);
    }
    catch (...)
    {
        // The _com_ptr_t wrappers throw on failure; C++ exceptions must not
        // propagate out of a COM method.
        return E_FAIL;
    }
}
However when it encounters a very large page (e.g. "http://sitemap.zillow.com/uncompressed/ForSale_Hood_MedPri_1.xml"), it takes 3 minutes to create the HTML from the DOM.
Is there a way to access the raw HTML/XML?
When you do a 'view page source' in IE, it pops up almost immediately, so internally IE must be using some API that can do what I want.
Thanks,
Shane.
It seems that in old versions of MSHTML, outerHTML had O(n^2) performance. However, in newer versions (IE8) this problem is gone. If you have a choice, use IE8 or later.
Otherwise, using IPersistStream::Save is an option. But CreateStreamOnHGlobal won't help you since its implementation is also O(n^2). You'll have to use a custom IStream for that.
Included is an IStream implementation which was made for this purpose and supports quick writes:
#include <atlbase.h>
#include <atlcom.h>
#include <vector>
// Write-only, in-memory IStream.
// It exists because the stream returned by CreateStreamOnHGlobal copes badly
// with repeated growth (N writes seem to take O(N^2) time).
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStreamImpl
{
public:
    BEGIN_COM_MAP(MyStream)
        COM_INTERFACE_ENTRY(IStream)
    END_COM_MAP()

    // The only IStream method overridden here; defined out of line.
    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);

    // Byte storage for the stream. Public so the code that created the
    // stream can read the result back directly.
    std::vector<char> buf;
};
/*
Usage:
CComPtr<IStream> stream;
hr = MyStream::CreateInstance(&stream);
// streamObj will be valid as long as IStream smart pointer lives
MyStream *streamObj = (MyStream*)stream.p;
*/
// Appends cb bytes starting at pv to the end of buf.
//
// pv          source bytes; may be NULL only when cb is 0.
// cb          number of bytes to append.
// pcbWritten  optional; when non-NULL it receives the number of bytes actually
//             appended (cb on success, 0 on failure).
//
// Returns S_OK on success, STG_E_INVALIDPOINTER for a NULL pv with cb > 0, or
// E_OUTOFMEMORY if the buffer could not grow.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten)
{
    if (pcbWritten != NULL)
        *pcbWritten = 0;
    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *first = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), first, first + cb);
    }
    catch (...)
    {
        // std::vector reports growth failure by throwing; C++ exceptions must
        // not propagate out of a COM method.
        return E_OUTOFMEMORY;
    }

    // Callers are entitled to check the count, so it has to be reported.
    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}
Yes, you can QI for IPersistStream and save to a memory stream created by CreateStreamOnHGlobal. Note that the document must have finished downloading (its ready state needs to be complete).
Thanks Amnon, the following code is mostly working for me.
// An implementation of a write-only, append-only IStream.
// Needed because the CreateStreamOnHGlobal implementation doesn't handle
// resizes well (N writes seem to take O(N^2) time).
//
// Only Write stores data. The remaining IStream methods are stubs, written so
// that no method reports success while leaving an out-parameter uninitialised:
//  - Read, CopyTo and Clone cannot be honoured, so they clear their
//    out-parameters and return E_NOTIMPL.
//  - Seek and Stat succeed and report the current end of the buffer.
//  - SetSize, Commit, Revert, LockRegion and UnlockRegion are successful no-ops.
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStream
{
public:
    // Byte storage for the stream. Public so the code that created the
    // stream can read the result back directly.
    std::vector<char> buf;

    BEGIN_COM_MAP(MyStream)
        COM_INTERFACE_ENTRY(IStream)
    END_COM_MAP()

    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);

    // Implement IStream abstract functions

    // Reading is not supported; reports zero bytes read.
    STDMETHOD(Read) (void *pv, ULONG cb, ULONG *pcbRead)
    {
        if (pcbRead != NULL)
            pcbRead->operator=(0), *pcbRead = 0;
        return E_NOTIMPL;
    }

    // The stream only appends, so repositioning requests are ignored and the
    // reported position is always the end of the buffer.
    STDMETHOD(Seek) (LARGE_INTEGER dlibMove,DWORD dwOrigin,ULARGE_INTEGER *plibNewPosition)
    {
        if (plibNewPosition != NULL)
            plibNewPosition->QuadPart = buf.size();
        return S_OK;
    }

    // The buffer grows on demand in Write; an explicit size is not needed.
    STDMETHOD(SetSize) (ULARGE_INTEGER libNewSize) { return S_OK; }

    // Copying out requires reading, which is not supported.
    STDMETHOD(CopyTo) (IStream *pstm,ULARGE_INTEGER cb,ULARGE_INTEGER *pcbRead,ULARGE_INTEGER *pcbWritten)
    {
        if (pcbRead != NULL)
            pcbRead->QuadPart = 0;
        if (pcbWritten != NULL)
            pcbWritten->QuadPart = 0;
        return E_NOTIMPL;
    }

    // Writes land in buf immediately, so there is nothing to commit or revert.
    STDMETHOD(Commit) (DWORD grfCommitFlags) { return S_OK; }
    STDMETHOD(Revert) () { return S_OK; }

    // Region locking is accepted but has no effect.
    STDMETHOD(LockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return S_OK; }
    STDMETHOD(UnlockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return S_OK; }

    // Reports a zeroed STATSTG carrying only the stream type and current size.
    STDMETHOD(Stat) (__RPC__out STATSTG *pstatstg,DWORD grfStatFlag)
    {
        if (pstatstg == NULL)
            return STG_E_INVALIDPOINTER;
        STATSTG blank = {};
        *pstatstg = blank; // pwcsName stays NULL so a caller never frees garbage
        pstatstg->type = STGTY_STREAM;
        pstatstg->cbSize.QuadPart = buf.size();
        return S_OK;
    }

    // Cloning is not supported; never hand back an uninitialised pointer.
    STDMETHOD(Clone) (__RPC__deref_out_opt IStream **ppstm)
    {
        if (ppstm != NULL)
            *ppstm = NULL;
        return E_NOTIMPL;
    }
};
// Appends cb bytes starting at pv to the end of buf.
//
// pv          source bytes; may be NULL only when cb is 0.
// cb          number of bytes to append.
// pcbWritten  optional; when non-NULL it receives the number of bytes actually
//             appended (cb on success, 0 on failure).
//
// Returns S_OK on success, STG_E_INVALIDPOINTER for a NULL pv with cb > 0, or
// E_OUTOFMEMORY if the buffer could not grow.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten)
{
    if (pcbWritten != NULL)
        *pcbWritten = 0;
    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *first = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), first, first + cb);
    }
    catch (...)
    {
        // std::vector reports growth failure by throwing; C++ exceptions must
        // not propagate out of a COM method.
        return E_OUTOFMEMORY;
    }

    // Callers are entitled to check the count, so it has to be reported.
    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}
// Retrieves the HTML of the current page
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
HRESULT hr = S_OK;
try
{
CComPtr<IDispatch> pDispatch;
MSHTML::IHTMLDocumentPtr pDocument = NULL;
CComPtr<IStream> mystream;
hr = MyStream::CreateInstance(&mystream);
// streamObj will be valid as long as IStream smart pointer lives
MyStream *streamObj = (MyStream*)mystream.p;
hr = m_spWebBrowser->get_Document(&pDispatch);
hr = pDispatch->QueryInterface(IID_IHTMLDocument, (void**)&pDocument);
IPersistStreamInitPtr persistStream = pDocument;
hr = CreateStreamOnHGlobal(NULL, TRUE, &stream);
hr = persistStream->Save(mystream, FALSE);
}
catch(...)
{
TRACE_FN("Got exception somewhere");
}
return hr;
}
Now the only problem left is figuring out why it returns single-byte chars most of the time and double-byte chars at other times. Any ideas?
Thanks for the help.
精彩评论