Revision as of 06:51, 17 December 2006

A parser analyzes input according to a given formal grammar, separating the structural markup from the data it carries. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to write a parser: load the whole content of a file into a tree of objects, as known from object-oriented programming (the DOM approach), or have the reader call a function every time it encounters a given syntax tag (the QXML approach).

The QXML approach

parser.h: /*

parser.h - demonstration of a parser in C++

*/

#ifndef PARSER_H
#define PARSER_H

#include <qstring.h>
#include <QtXml/QXmlDefaultHandler>
#include <QtXml/QXmlAttributes>

class Parser : public QXmlDefaultHandler
{
public:
 /** Constructs the handler. */
 Parser();

 /**
  * Callback supplied by the QXml framework; invoked when parsing of the
  * XML document starts.
  */
 bool startDocument();

 /**
  * Callback supplied by the QXml framework; invoked whenever the reader
  * encounters an opening tag (e.g. <b>).
  *
  * Only the qualified name (qName) and the attributes (att) are named
  * here; the first two arguments are left unnamed.
  */
 bool startElement( const QString&,
                    const QString&,
                    const QString& qName,
                    const QXmlAttributes& att );
};

#endif

parser.cpp: /*

parser.cpp - demonstration of a parser in C++

*/

#include "parser.h"
#include <kdebug.h>

 /** Default constructor: only the base handler is initialised; the body is empty. */
 Parser::Parser()
   : QXmlDefaultHandler()
 {
 }
 
 /**
  * Invoked when the reader starts on the document. Writes a notice to the
  * kDebug() stream and always returns true.
  */
 bool Parser::startDocument()
 {
   const bool ok = true;
   kDebug() << "Searching document for tags" << endl;
   return ok;
 }
 
 /**
  * Invoked for each opening tag the reader finds. Writes the tag's
  * qualified name (qName) to the kDebug() stream and always returns true.
  * The attributes (att) and the two unnamed arguments are not used.
  */
 bool Parser::startElement( const QString&, const QString&,
                            const QString& qName, const QXmlAttributes& att )
 {
   const bool ok = true;
   kDebug() << "Found Element " << qName << endl;
   return ok;
 }

hello.cpp: /* hello.cpp compile it with g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include -I/home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui

*/

#include <qstring.h>
#include <QXmlInputSource>
#include <qfile.h>
#include <parser.h>

int main()
{

 Parser* handler=new Parser();
 QXmlInputSource* source=new QXmlInputSource(new QFile("hello.htm"));
 QXmlSimpleReader reader;
 reader.setContentHandler( handler );
 reader.parse( source );

}

The DOM approach

/*

  dom.cpp
  A demonstration of how to use the DOM parsing framework.
  Prints the first subnode of an HTML file, i.e. typically 
  "head" or "body".
  compile it like this:
  g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \
  -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui

*/

#include <qdom.h>
#include <qfile.h>
#include <kdebug.h>

int main()
{

 QDomDocument doc( "myDocument" );
 QFile qf("hello.htm");
 doc.setContent( &qf );
 QDomElement docElement = doc.documentElement(); 
 QDomNode node;
 node = docElement.firstChild();
 kdDebug() << node.nodeName() << endl;

}

Drawbacks

HTML parsing with these XML parsers only works for "legal" HTML documents, i.e. documents that are also well-formed XML. For example, look at this code:

<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>

This code contains an unescaped & (in XML it would have to be written as &amp;), which causes the parser to report an error.

See here:

<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php" nowrap>Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>

This code will cause a parse error because the attribute nowrap has no value, which does not conform to XML.

@@ Line 4: / Line 4: @@
 <b>parser.h:</b>
-<highlightSyntax language="cpp">
+<code cppqt n>
 /*
   parser.h - demonstration of a parser in C++
@@ Line 32: / Line 32: @@
 #endif
-</highlightSyntax>
+</code>
 <b>parser.cpp:</b>
-<highlightSyntax language="cpp">
+<code cppqt n>
 /*
   parser.cpp - demonstration of a parser in C++
@@ Line 58: / Line 58: @@
      return true;
   }
-</highlightSyntax>
+</code>
 <b>hello.cpp:</b>
-<highlightSyntax language="cpp">
+<code cppqt n>
 /*
 hello.cpp
@@ Line 82: / Line 82: @@
    reader.parse( source );
 }
-</highlightSyntax>
+</code>
 =The DOM approach=
-<highlightSyntax language="cpp">
+<code cppqt n>
 /*
     dom.cpp
@@ Line 109: / Line 109: @@
    kdDebug() << node.nodeName() << endl;
 }
-</highlightSyntax>
+</code>
 =Drawbacks=
 HTML parsing only works for "legal" html documents.
 For example, look at this code:
+<code>
 <pre>
 <html>
@@ Line 123: / Line 124: @@
 </html>
 </pre>
+</code>
 This code contains a &amp; and will bring your parser to an error.
 See here:
+<code>
 <pre>
 <html>
@@ Line 135: / Line 138: @@
 </html>
 </pre>
+</code>
 This code will throw an error because of the '''nowrap''' that is not xml-conform.