Lightweight HTML Parsing Using MSHTML

Environment: Windows 2000 / Windows ME / IE 5.0+

I have a lot of experience in programming low-level MSHTML and I always see questions on how one can use MSHTML to parse HTML and then access elements via the DOM.

Well, here it is. I use IMarkupServices provided by MSHTML. There is no need for an IOleClientSite or any sort of embedding. I think this is just about as light as anyone can get.

In future articles, I will be concentrating on the reuse of MSHTML in other aspects of programming, such as using MSHTML as an editor.

This code makes use of simple COM calls and nothing more. It can be easily adapted for ATL, MFC and VB, among other languages. Please don't ask me to provide samples in other languages. In order to build this, you need the IE SDK.

/******************************************************************
 * ParseHTML.cpp
 *
 *  ParseHTML: Lightweight UI-less HTML parser using MSHTML
 *
 *  Note: This is for accessing the DOM only. No image download, 
 *        script execution, etc...
 *
 *  8 June 2001 - Asher Kobin (asherk@pobox.com)
 *  
 *  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY 
 *  OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT 
 *  LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR 
 *  FITNESS FOR A PARTICULAR PURPOSE.
 *
 *******************************************************************/

#include <windows.h>
#include <mshtml.h>

// Sample HTML markup (wide-character string) that WinMain passes to
// IMarkupServices::ParseString.
OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

/*
 * WinMain: program entry point.
 *
 * Creates an MSHTML document object with no host or embedding,
 * initializes it through IPersistStreamInit::InitNew, parses szHTML with
 * IMarkupServices::ParseString and reads the innerText of the resulting
 * document's body.
 *
 * None of the parameters are used. The function always returns TRUE,
 * whether or not the individual COM steps succeeded.
 */
int __stdcall WinMain(HINSTANCE hInst,
                      HINSTANCE hPrev,
                      LPSTR lpCmdLine,
                      int nShowCmd)
{
  IHTMLDocument2 *pDoc = NULL;

  // CoUninitialize may only balance a successful CoInitialize, so bail
  // out before touching COM if initialization failed.
  if (FAILED(CoInitialize(NULL)))
    return TRUE;

  CoCreateInstance(CLSID_HTMLDocument,
                   NULL,
                   CLSCTX_INPROC_SERVER,
                   IID_IHTMLDocument2,
                   (LPVOID *) &pDoc);

  if (pDoc)
  {
    IPersistStreamInit *pPersist = NULL;

    pDoc->QueryInterface(IID_IPersistStreamInit,
                         (LPVOID *) &pPersist);

    if (pPersist)
    {
      IMarkupServices *pMS = NULL;

      // The document is put into its initial state before it is used
      // for parsing; skip the parse if that step fails.
      HRESULT hrInitNew = pPersist->InitNew();
      pPersist->Release();

      if (SUCCEEDED(hrInitNew))
      {
        pDoc->QueryInterface(IID_IMarkupServices,
                             (LPVOID *) &pMS);
      }

      if (pMS)
      {
        IMarkupContainer *pMC = NULL;
        IMarkupPointer *pMkStart = NULL;
        IMarkupPointer *pMkFinish = NULL;

        pMS->CreateMarkupPointer(&pMkStart);
        pMS->CreateMarkupPointer(&pMkFinish);

        // Parse the sample markup; on success pMC receives the markup
        // container holding the parsed content.
        pMS->ParseString(szHTML,
                         0,
                         &pMC,
                         pMkStart,
                         pMkFinish);

        if (pMC)
        {
          IHTMLDocument2 *pNewDoc = NULL;

          // The requested IID must match the pointer type: get_body is
          // an IHTMLDocument2 method, so ask for IID_IHTMLDocument2.
          pMC->QueryInterface(IID_IHTMLDocument2,
                              (LPVOID *) &pNewDoc);

          if (pNewDoc)
          {
            // do anything with pNewDoc, in this case
            // get the body innerText.

            // Out-parameters start as NULL so that a failed call cannot
            // leave an indeterminate pointer to be tested or freed.
            IHTMLElement *pBody = NULL;
            pNewDoc->get_body(&pBody);

            if (pBody)
            {
              BSTR strText = NULL;

              pBody->get_innerText(&strText);
              pBody->Release();

              // strText (if non-NULL) holds the body text at this point.
              SysFreeString(strText);
            }

            pNewDoc->Release();
          }

          pMC->Release();
        }

        if (pMkStart)
          pMkStart->Release();

        if (pMkFinish)
          pMkFinish->Release();

        pMS->Release();
      }
    }

    pDoc->Release();
  }

  CoUninitialize();

  return TRUE;
}

Downloads

None. Source code provided above.