Development/Tutorials/Programming Tutorial KDE 4/How to write an XML parser: Difference between revisions

From KDE TechBase
(Use Template:Proposed_deletion)
 
(18 intermediate revisions by 10 users not shown)
Line 1: Line 1:
{{Proposed_deletion}}
A parser is used to distinguish between formal language and bulk data of a given grammar. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to write a parser: to split up the content of a file into an object as known from object-oriented programming (the <b>DOM approach</b>) or to trigger a function everytime a reader occurs a given syntax tag (the <b>QXML approach</b>).
A parser is used to distinguish between formal language and bulk data of a given grammar. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to write a parser: to split up the content of a file into an object as known from object-oriented programming (the <b>DOM approach</b>) or to trigger a function everytime a reader occurs a given syntax tag (the <b>QXML approach</b>).


Line 4: Line 5:


<b>parser.h:</b>
<b>parser.h:</b>
<highlightSyntax language="cpp">
<syntaxhighlight lang="cpp-qt" line>
/*
/*
  parser.h - demonstration of a parser in C++
  parser.h - demonstration of a parser in C++
Line 32: Line 33:


#endif
#endif
</highlightSyntax>
</syntaxhighlight>


<b>parser.cpp:</b>
<b>parser.cpp:</b>
<highlightSyntax language="cpp">
<syntaxhighlight lang="cpp-qt" line>
/*
/*
  parser.cpp - demonstration of a parser in C++
  parser.cpp - demonstration of a parser in C++
Line 49: Line 50:
   bool Parser::startDocument()
   bool Parser::startDocument()
   {
   {
     kDebug() << "Searching document for tags" << endl;
     kDebug() << "Searching document for tags";
     return true;
     return true;
   }
   }
Line 55: Line 56:
   bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
   bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
   {
   {
     kDebug() << "Found Element " << qName << endl;
     kDebug() << "Found Element" << qName;
     return true;
     return true;
  }
  }
</highlightSyntax>
</syntaxhighlight>


<b>hello.cpp:</b>
<b>hello.cpp:</b>
<highlightSyntax language="cpp">
<syntaxhighlight lang="cpp-qt" line>
/*
/*
hello.cpp
hello.cpp
compile it with
compile it with
g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt  
g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include /home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui
-I/home/kde-devel/qt-unstable/include I/home/kde-devel/qt-unstable/include/QtXml  
parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib  
-L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui
*/
*/


Line 85: Line 83:
   reader.parse( source );
   reader.parse( source );
}
}
</highlightSyntax>
</syntaxhighlight>


=The DOM approach=
=The DOM approach=
<highlightSyntax language="cpp">
<syntaxhighlight lang="cpp-qt" line>
/*
/*
   dom.cpp
   dom.cpp
   A demonstration how to use the dom parsing framework.
   A demonstration how to use the dom parsing framework.
   Prints the first subnode of an html file, i.e. typically "head" or "body".
   Prints the first subnode of an HTML file, i.e. typically  
  "head" or "body".
   compile it like this:
   compile it like this:
   g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp -L/opt/kde3/lib \
   g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \
-L/usr/lib/qt3/lib -lqt-mt -lkdeui   
  -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui   
*/
*/
#include <qdom.h>
#include <qdom.h>
Line 111: Line 110:
   kdDebug() << node.nodeName() << endl;
   kdDebug() << node.nodeName() << endl;
}
}
</highlightSyntax>
</syntaxhighlight>
 
=Drawbacks=
HTML parsing only works for "legal" html documents.
For example, look at this code:
<syntaxhighlight lang="xml">
<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>
</syntaxhighlight>
This code contains a &amp; and will bring your parser to an error.
 
See here:
<syntaxhighlight lang="xml">
<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php" nowrap>Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>
</syntaxhighlight>
This code will throw an error because of the '''nowrap''' that is not xml-conform.
 
[[Category:Proposed_deletion]]

Latest revision as of 12:18, 15 May 2019

 
Proposed for Deletion
This page has been proposed for deletion.

A parser reads input written in a formal grammar and separates the markup (the formal language) from the bulk data it carries. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to write an XML parser: load the whole content of a file into a tree of objects, as known from object-oriented programming (the DOM approach), or have a function called every time the reader encounters a given syntax tag (the QXML approach).

The QXML approach

parser.h:

/*
 parser.h - demonstration of a parser in C++
*/

#ifndef PARSER_H
#define PARSER_H

#include <qstring.h>
#include <QtXml/QXmlDefaultHandler>
#include <QtXml/QXmlAttributes>

/**
 * Content handler used by the QXML example.
 *
 * Inherits QXmlDefaultHandler and declares the two callbacks this tutorial
 * implements; the class declares no data members of its own.
 */
class Parser : public QXmlDefaultHandler
{
public:
  Parser();

  /**
   * Callback given by the qxml framework.
   * Called when parsing of the XML document starts.
   */
  bool startDocument();

  /**
   * Callback given by the qxml framework.
   * Called when the reader encounters an opening tag (e.g. \<b\>).
   *
   * @param qName name of the element that was found
   * @param att   attributes attached to the opening tag
   */
  bool startElement( const QString&,
                     const QString&,
                     const QString& qName,
                     const QXmlAttributes& att );
};


#endif

parser.cpp:

/*
 parser.cpp - demonstration of a parser in C++
*/

#include "parser.h"
#include <kdebug.h>

  /** Default constructor; there is nothing to initialise. */
  Parser::Parser() {}
  
  /**
   * Writes a debug message announcing that the search for tags has begun.
   *
   * @return always true
   */
  bool Parser::startDocument() {
    kDebug() << "Searching document for tags";

    return true;
  }
  
  /**
   * Writes a debug message containing the name of the element that was found.
   *
   * Only qName is used. The other parameters are left unnamed so the
   * compiler does not warn about unused parameters.
   *
   * @param qName name of the element, printed to the debug output
   * @return always true
   */
  bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& /*att*/ )
  {
    kDebug() << "Found Element" << qName;
    return true;
  }

hello.cpp:

/*
hello.cpp
compile it with
g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include -I/home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui
*/


#include <qstring.h>
#include <QXmlInputSource>
#include <qfile.h>
#include <parser.h>

int main()
{  
  Parser* handler=new Parser();
  QXmlInputSource* source=new QXmlInputSource(new QFile("hello.htm"));
  QXmlSimpleReader reader;
  reader.setContentHandler( handler );
  reader.parse( source );
}

The DOM approach

/*
   dom.cpp
   A demonstration of how to use the DOM parsing framework.
   Prints the first subnode of an HTML file, i.e. typically 
   "head" or "body".
   compile it like this:
   g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \
   -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui   
*/
#include <qdom.h>
#include <qfile.h>
#include <kdebug.h>

int main()
{
  QDomDocument doc( "myDocument" );
  QFile qf("hello.htm");
  doc.setContent( &qf );
  QDomElement docElement = doc.documentElement(); 
  QDomNode node;
  node = docElement.firstChild();
  kdDebug() << node.nodeName() << endl;
}

Drawbacks

HTML parsing only works for "legal" HTML documents, i.e. documents that are also well-formed XML. For example, look at this code:

<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>

This code contains an unescaped & inside an attribute value (in XML it would have to be written as &amp;), so the parser will report an error.

See here:

<html>
  <body>
      <a href="http://www.kde.org/"></a>
      <a href="/index.php" nowrap>Log in</a>
      <a href="http://www.gmx.de"></a>
  </body>
</html>

This code will cause a parse error because the nowrap attribute has no value, which is not well-formed XML.