Development/Tutorials/Programming Tutorial KDE 4/How to write an XML parser: Difference between revisions

    From KDE TechBase
    (This is not a tutorial and is nothing more than an overly specific code snippet)
    m (Text replace - "</code>" to "</syntaxhighlight>")
    (3 intermediate revisions by 2 users not shown)
    Line 5: Line 5:


    <b>parser.h:</b>
    <b>parser.h:</b>
    <code cppqt n>
    <syntaxhighlight lang="cpp-qt" line>
    /*
    /*
      parser.h - demonstration of a parser in C++
      parser.h - demonstration of a parser in C++
    Line 33: Line 33:


    #endif
    #endif
    </code>
    </syntaxhighlight>


    <b>parser.cpp:</b>
    <b>parser.cpp:</b>
    <code cppqt n>
    <syntaxhighlight lang="cpp-qt" line>
    /*
    /*
      parser.cpp - demonstration of a parser in C++
      parser.cpp - demonstration of a parser in C++
    Line 50: Line 50:
       bool Parser::startDocument()
       bool Parser::startDocument()
       {
       {
         kDebug() << "Searching document for tags" << endl;
         kDebug() << "Searching document for tags";
         return true;
         return true;
       }
       }
    Line 56: Line 56:
       bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
       bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
       {
       {
         kDebug() << "Found Element " << qName << endl;
         kDebug() << "Found Element" << qName;
         return true;
         return true;
      }
      }
    </code>
    </syntaxhighlight>


    <b>hello.cpp:</b>
    <b>hello.cpp:</b>
    <code cppqt n>
    <syntaxhighlight lang="cpp-qt" line>
    /*
    /*
    hello.cpp
    hello.cpp
    Line 83: Line 83:
       reader.parse( source );
       reader.parse( source );
    }
    }
    </code>
    </syntaxhighlight>


    =The DOM approach=
    =The DOM approach=
    <code cppqt n>
    <syntaxhighlight lang="cpp-qt" line>
    /*
    /*
       dom.cpp
       dom.cpp
    Line 110: Line 110:
       kdDebug() << node.nodeName() << endl;
       kdDebug() << node.nodeName() << endl;
    }
    }
    </code>
    </syntaxhighlight>


    =Drawbacks=
    =Drawbacks=
    HTML parsing only works for "legal" html documents.  
    HTML parsing only works for "legal" html documents.  
    For example, look at this code:
    For example, look at this code:
    <code xml>
    <syntaxhighlight lang="xml">
    <html>
    <html>
       <body>
       <body>
    Line 123: Line 123:
       </body>
       </body>
    </html>
    </html>
    </code>
    </syntaxhighlight>
    This code contains a &amp; and will bring your parser to an error.
    This code contains a &amp; and will bring your parser to an error.


    See here:
    See here:
    <code xml>
    <syntaxhighlight lang="xml">
    <html>
    <html>
       <body>
       <body>
    Line 135: Line 135:
       </body>
       </body>
    </html>
    </html>
    </code>
    </syntaxhighlight>
    This code will throw an error because of the '''nowrap''' that is not xml-conform.
    This code will throw an error because of the '''nowrap''' that is not xml-conform.

    Revision as of 20:54, 29 June 2011

    Warning
    This page has been nominated for deletion.

    Reason: {{{1}}}

    If you disagree with its deletion, remove the template and discuss it on its talk page.

    A parser analyses input according to a given grammar and separates the markup (the formal language) from the data it carries. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to write a parser: load the whole content of a file into a tree of objects, as known from object-oriented programming (the DOM approach), or have the reader call a handler function every time it encounters a given syntax element such as an opening tag (the QXML approach).

    The QXML approach

    parser.h:

    /*
     parser.h - demonstration of a parser in C++
    */
    
    #ifndef PARSER_H
    #define PARSER_H
    
    #include <qstring.h>
    #include <QtXml/QXmlDefaultHandler>
    #include <QtXml/QXmlAttributes>
    
    /**
     * Content handler for the QXML (SAX-style) approach: it overrides two of the
     * QXmlDefaultHandler callbacks and declares no data members of its own.
     */
    class Parser : public QXmlDefaultHandler
    {
    public:
      Parser();

      /** Callback from the qxml framework; called when parsing of the XML document starts. */
      bool startDocument();

      /** Callback from the qxml framework; called when the reader encounters an opening tag (e.g. \<b\> ). */
      bool startElement( const QString& namespaceURI, const QString& localName, const QString& qName, const QXmlAttributes& att );
    };
    
    
    #endif
    

    parser.cpp:

    /*
     parser.cpp - demonstration of a parser in C++
    */
    
    #include "parser.h"
    #include <kdebug.h>
    
      // Nothing to set up: Parser declares no members, so only the base
      // handler is default-constructed.
      Parser::Parser()
        : QXmlDefaultHandler()
      {
      }
      
      // Start-of-document callback: writes one debug line and always
      // reports success.
      bool Parser::startDocument()
      {
        const bool handled = true;
        kDebug() << "Searching document for tags";
        return handled;
      }
      
      // Opening-tag callback: prints the tag's qualified name to the debug
      // output and always reports success. Only qName is looked at.
      bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
      {
        static_cast<void>( att );  // attributes are deliberately not inspected

        kDebug() << "Found Element" << qName;
        return true;
      }
    

    hello.cpp:

    /*
    hello.cpp
    compile it with
    g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include -I/home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui
    */
    
    
    #include <qstring.h>
    #include <QXmlInputSource>
    #include <qfile.h>
    #include <parser.h>
    
    int main()
    {  
      Parser* handler=new Parser();
      QXmlInputSource* source=new QXmlInputSource(new QFile("hello.htm"));
      QXmlSimpleReader reader;
      reader.setContentHandler( handler );
      reader.parse( source );
    }
    

    The DOM approach

    /*
       dom.cpp
       A demonstration how to use the dom parsing framework.
       Prints the first subnode of an HTML file, i.e. typically 
       "head" or "body".
       compile it like this:
       g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \
       -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui   
    */
    #include <qdom.h>
    #include <qfile.h>
    #include <kdebug.h>
    
    int main()
    {
      QDomDocument doc( "myDocument" );
      QFile qf("hello.htm");
      doc.setContent( &qf );
      QDomElement docElement = doc.documentElement(); 
      QDomNode node;
      node = docElement.firstChild();
      kdDebug() << node.nodeName() << endl;
    }
    

    Drawbacks

    Parsing HTML with these XML classes only works for "legal" HTML documents, i.e. documents that are also well-formed XML. For example, look at this code:

    <html>
      <body>
          <a href="http://www.kde.org/"></a>
          <a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a>
          <a href="http://www.gmx.de"></a>
      </body>
    </html>
    

    This code contains a bare & inside an attribute value (in XML it must be written as &amp;), so the parser will stop with an error.

    See here:

    <html>
      <body>
          <a href="http://www.kde.org/"></a>
          <a href="/index.php" nowrap>Log in</a>
          <a href="http://www.gmx.de"></a>
      </body>
    </html>
    

    This code will cause a parse error because the nowrap attribute has no value, which is not well-formed XML.