Difference between revisions of "Development/Tutorials/Programming Tutorial KDE 4/How to write an XML parser"
(Use Template:Proposed_deletion) |
|||
(23 intermediate revisions by 10 users not shown) | |||
Line 1: | Line 1: | ||
− | A parser is used to distinguish between formal language and bulk data of a given grammar. See en.wikipedia.org/wiki/Parser | + | {{Proposed_deletion}} |
− | < | + | A parser is used to distinguish between formal language and bulk data of a given grammar. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to parse a file: either load its whole content into a tree of objects, as known from object-oriented programming (the <b>DOM approach</b>), or have a function called every time the reader encounters a given syntax tag (the <b>QXML approach</b>). |
+ | |||
+ | =The QXML approach= | ||
+ | |||
+ | <b>parser.h:</b> | ||
+ | <syntaxhighlight lang="cpp-qt" line> | ||
+ | /* | ||
+ | parser.h - demonstration of a parser in C++ | ||
+ | */ | ||
+ | |||
+ | #ifndef PARSER_H | ||
+ | #define PARSER_H | ||
+ | |||
+ | #include <qstring.h> | ||
+ | #include <QtXml/QXmlDefaultHandler> | ||
+ | #include <QtXml/QXmlAttributes> | ||
+ | |||
+ | class Parser : public QXmlDefaultHandler | ||
+ | { | ||
+ | public: | ||
+ | |||
+ | Parser(); | ||
+ | |||
+ | /** given by the framework from qxml. Called when parsing the xml-document starts. */ | ||
+ | bool startDocument(); | ||
+ | |||
+ | /** given by the framework from qxml. Called when the reader occurs an open tag (e.g. \<b\> ) */ | ||
+ | bool startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att ); | ||
+ | |||
+ | }; | ||
+ | |||
+ | |||
+ | #endif | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | <b>parser.cpp:</b> | ||
+ | <syntaxhighlight lang="cpp-qt" line> | ||
+ | /* | ||
+ | parser.cpp - demonstration of a parser in C++ | ||
+ | */ | ||
+ | |||
+ | #include "parser.h" | ||
+ | #include <kdebug.h> | ||
+ | |||
+ | Parser::Parser() | ||
+ | { | ||
+ | } | ||
+ | |||
+ | bool Parser::startDocument() | ||
+ | { | ||
+ | kDebug() << "Searching document for tags"; | ||
+ | return true; | ||
+ | } | ||
+ | |||
+ | bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att ) | ||
+ | { | ||
+ | kDebug() << "Found Element" << qName; | ||
+ | return true; | ||
+ | } | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | <b>hello.cpp:</b> | ||
+ | <syntaxhighlight lang="cpp-qt" line> | ||
/* | /* | ||
hello.cpp | hello.cpp | ||
compile it with | compile it with | ||
− | g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include | + | g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include /home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui |
*/ | */ | ||
Line 21: | Line 83: | ||
reader.parse( source ); | reader.parse( source ); | ||
} | } | ||
− | </ | + | </syntaxhighlight> |
+ | |||
+ | =The DOM approach= | ||
+ | <syntaxhighlight lang="cpp-qt" line> | ||
+ | /* | ||
+ | dom.cpp | ||
+ | A demonstration how to use the dom parsing framework. | ||
+ | Prints the first subnode of an HTML file, i.e. typically | ||
+ | "head" or "body". | ||
+ | compile it like this: | ||
+ | g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \ | ||
+ | -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui | ||
+ | */ | ||
+ | #include <qdom.h> | ||
+ | #include <qfile.h> | ||
+ | #include <kdebug.h> | ||
+ | |||
+ | int main() | ||
+ | { | ||
+ | QDomDocument doc( "myDocument" ); | ||
+ | QFile qf("hello.htm"); | ||
+ | doc.setContent( &qf ); | ||
+ | QDomElement docElement = doc.documentElement(); | ||
+ | QDomNode node; | ||
+ | node = docElement.firstChild(); | ||
+ | kdDebug() << node.nodeName() << endl; | ||
+ | } | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | =Drawbacks= | ||
+ | HTML parsing with these XML parsers only works for HTML documents that are also well-formed XML. | ||
+ | For example, look at this code: | ||
+ | <syntaxhighlight lang="xml"> | ||
+ | <html> | ||
+ | <body> | ||
+ | <a href="http://www.kde.org/"></a> | ||
+ | <a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a> | ||
+ | <a href="http://www.gmx.de"></a> | ||
+ | </body> | ||
+ | </html> | ||
+ | </syntaxhighlight> | ||
+ | This code contains a bare & character, which XML requires to be escaped as an entity, so the parser will report an error. | ||
+ | |||
+ | See here: | ||
+ | <syntaxhighlight lang="xml"> | ||
+ | <html> | ||
+ | <body> | ||
+ | <a href="http://www.kde.org/"></a> | ||
+ | <a href="/index.php" nowrap>Log in</a> | ||
+ | <a href="http://www.gmx.de"></a> | ||
+ | </body> | ||
+ | </html> | ||
+ | </syntaxhighlight> | ||
+ | This code will cause a parse error because the attribute '''nowrap''' has no value, which is not allowed in XML. | ||
+ | |||
+ | [[Category:Proposed_deletion]] |
Latest revision as of 12:18, 15 May 2019

Proposed for Deletion |
---|
This page has been proposed for deletion. |
A parser is used to distinguish between formal language and bulk data of a given grammar. See http://en.wikipedia.org/wiki/Parser for more information. There are two ways to parse a file: either load its whole content into a tree of objects, as known from object-oriented programming (the DOM approach), or have a function called every time the reader encounters a given syntax tag (the QXML approach).
The QXML approach
parser.h:
1 /*
2 parser.h - demonstration of a parser in C++
3 */
4
5 #ifndef PARSER_H
6 #define PARSER_H
7
8 #include <qstring.h>
9 #include <QtXml/QXmlDefaultHandler>
10 #include <QtXml/QXmlAttributes>
11
12 class Parser : public QXmlDefaultHandler
13 {
14 public:
15
16 Parser();
17
18 /** given by the framework from qxml. Called when parsing the xml-document starts. */
19 bool startDocument();
20
21 /** given by the framework from qxml. Called when the reader occurs an open tag (e.g. \<b\> ) */
22 bool startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att );
23
24 };
25
26
27 #endif
parser.cpp:
1 /*
2 parser.cpp - demonstration of a parser in C++
3 */
4
5 #include "parser.h"
6 #include <kdebug.h>
7
8 Parser::Parser()
9 {
10 }
11
12 bool Parser::startDocument()
13 {
14 kDebug() << "Searching document for tags";
15 return true;
16 }
17
18 bool Parser::startElement( const QString&, const QString&, const QString& qName, const QXmlAttributes& att )
19 {
20 kDebug() << "Found Element" << qName;
21 return true;
22 }
hello.cpp:
1 /*
2 hello.cpp
3 compile it with
4 g++ -I. -I/home/kde-devel/kde/include -I/home/kde-devel/qt-unstable/include/Qt -I/home/kde-devel/qt-unstable/include /home/kde-devel/qt-unstable/include/QtXml parser.h parser.cpp hello.cpp -L/home/kde-devel/kde/lib -L/home/kde-devel/qt-unstable/lib -lQtCore_debug -lQtXml_debug -lkdeui
5 */
6
7
8 #include <qstring.h>
9 #include <QXmlInputSource>
10 #include <qfile.h>
11 #include <parser.h>
12
13 int main()
14 {
15 Parser* handler=new Parser();
16 QXmlInputSource* source=new QXmlInputSource(new QFile("hello.htm"));
17 QXmlSimpleReader reader;
18 reader.setContentHandler( handler );
19 reader.parse( source );
20 }
The DOM approach
1 /*
2 dom.cpp
3 A demonstration how to use the dom parsing framework.
4 Prints the first subnode of an HTML file, i.e. typically
5 "head" or "body".
6 compile it like this:
7 g++ -I. -I/opt/kde3/include -I/usr/lib/qt3/include dom.cpp \
8 -L/opt/kde3/lib -L/usr/lib/qt3/lib -lqt-mt -lkdeui
9 */
10 #include <qdom.h>
11 #include <qfile.h>
12 #include <kdebug.h>
13
14 int main()
15 {
16 QDomDocument doc( "myDocument" );
17 QFile qf("hello.htm");
18 doc.setContent( &qf );
19 QDomElement docElement = doc.documentElement();
20 QDomNode node;
21 node = docElement.firstChild();
22 kdDebug() << node.nodeName() << endl;
23 }
Drawbacks
HTML parsing with these XML parsers only works for HTML documents that are also well-formed XML. For example, look at this code:
<html>
<body>
<a href="http://www.kde.org/"></a>
<a href="/index.php?title=Special:User&returnto=Main_Page">Log in</a>
<a href="http://www.gmx.de"></a>
</body>
</html>
This code contains a bare & character, which XML requires to be escaped as an entity, so the parser will report an error.
See here:
<html>
<body>
<a href="http://www.kde.org/"></a>
<a href="/index.php" nowrap>Log in</a>
<a href="http://www.gmx.de"></a>
</body>
</html>
This code will cause a parse error because the attribute nowrap has no value, which is not allowed in XML.
This page was last edited on 15 May 2019, at 12:18. Content is available under Creative Commons License SA 4.0 unless otherwise noted.