(→Showing tags) |
(→Showing tags) |
||
Line 56: | Line 56: | ||
kdDebug() << "Size of your doc " << sizeof(doc) << endl; | kdDebug() << "Size of your doc " << sizeof(doc) << endl; | ||
kdDebug() << doc->toString().string() << endl; | kdDebug() << doc->toString().string() << endl; | ||
+ | } | ||
+ | </highlightSyntax> | ||
+ | |||
+ | |||
+ | =Non-conform syntax= | ||
+ | As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, biggest thing is to be able to parse non-XML-conform syntax. It works here: | ||
+ | <highlightSyntax language="cpp"> | ||
+ | #include <kapplication.h> | ||
+ | #include <kaboutdata.h> | ||
+ | #include <kcmdlineargs.h> | ||
+ | #include <dom/html_document.h> | ||
+ | |||
+ | int main (int argc, char *argv[]) | ||
+ | { | ||
+ | KAboutData aboutData( "test", "test", | ||
+ | "1.0", "test", KAboutData::License_GPL, | ||
+ | "(c) 2006" ); | ||
+ | KCmdLineArgs::init( argc, argv, &aboutData ); | ||
+ | KApplication khello; | ||
+ | |||
+ | DOM::HTMLDocument doc; | ||
+ | DOM::DOMString tag("*"); | ||
+ | DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>"); | ||
+ | |||
+ | doc.loadXML(uri); | ||
+ | kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl; | ||
+ | for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl; | ||
+ | kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl; | ||
+ | kdDebug() << doc.isHTMLDocument() << endl; | ||
+ | kdDebug() << doc.toString().string() << endl; | ||
} | } | ||
</highlightSyntax> | </highlightSyntax> |
For HTML parsing, you have the following possibilities:
Obviously, QXML and QDOM need XML-compliant HTML pages, and very few HTML pages are XML-compliant. Perl is outside the scope of this site. So, this tutorial chooses the khtml approach.
Our first khtml program does nothing at all: <highlightSyntax language="cpp">
int main (int argc, char *argv[]) {
KAboutData aboutData( "test", "test", "1.0", "test", KAboutData::License_GPL, "(c) 2006" ); KCmdLineArgs::init( argc, argv, &aboutData ); KApplication khello; DOM::HTMLDocument();
} </highlightSyntax> It can be compiled like:
g++ -I/usr/lib/qt3/include -I/opt/kde3/include -L/opt/kde3/lib -lkdeui -lkhtml -o khtml khtml.cpp
The next program is more advanced and shows you the first tags of an HTML document: <highlightSyntax language="cpp">
int main (int argc, char *argv[]) {
KAboutData aboutData( "test", "test", "1.0", "test", KAboutData::License_GPL, "(c) 2006" ); KCmdLineArgs::init( argc, argv, &aboutData ); KApplication khello;
DOM::Document* doc=new DOM::Document(); DOM::DOMString tag("*"); DOM::DOMString uri("<html>test</html>"); doc->loadXML(uri); kdDebug() << "Does this doc have child elements ? " << doc->hasChildNodes() << endl; kdDebug() << "First child node name: " << doc->firstChild().nodeName().string() << endl; kdDebug() << "First grandchild node name: " << doc->firstChild().firstChild().nodeName().string() << endl; kdDebug() << "Count of elements in your doc " << doc->getElementsByTagName(tag).length()<< endl; for (int i=0; i<doc->getElementsByTagName(tag).length(); i++) kdDebug() << doc->getElementsByTagName(tag).item(i).nodeName().string() << endl; kdDebug() << "Size of your doc " << sizeof(doc) << endl; kdDebug() << doc->toString().string() << endl;
} </highlightSyntax>
As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, the most important thing is to be able to parse syntax that does not conform to XML. It works here: <highlightSyntax language="cpp">
int main (int argc, char *argv[]) {
KAboutData aboutData( "test", "test", "1.0", "test", KAboutData::License_GPL, "(c) 2006" ); KCmdLineArgs::init( argc, argv, &aboutData ); KApplication khello;
DOM::HTMLDocument doc; DOM::DOMString tag("*"); DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
doc.loadXML(uri); kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl; for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl; kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl; kdDebug() << doc.isHTMLDocument() << endl; kdDebug() << doc.toString().string() << endl;
} </highlightSyntax>