Difference between revisions of "Development/Tutorials/Programming Tutorial KDE 3/KHTML"

Jump to: navigation, search
 
m (Text replace - "</code>" to "</syntaxhighlight>")
 
(22 intermediate revisions by 8 users not shown)
Line 1: Line 1:
For HTML parsing, you have 3 possibilities:
+
For HTML parsing, you have the following possibilities:
- QXML
+
* QXML
- QDOM
+
* QDOM
- khtml
+
* Perl
Obviously, QXML and QDOM need xml-compliant html pages, and the least html pages are xml-compliant. So, this tutorial choses the khtml approach.
+
* XHTML
 +
Obviously, QXML and QDOM need XML-compliant HTML pages, and very few HTML pages are XML-compliant. Perl is outside the scope of this site. This tutorial chooses the XHTML approach.  
 +
 
 +
=First step=
 +
As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, the biggest challenge is being able to parse syntax that is not XML-conformant. It works with the following program.
 +
 
 +
'''tags.cpp'''
 +
<syntaxhighlight lang="cpp-qt" line>
 +
#include <kapplication.h>
 +
#include <kaboutdata.h>
 +
#include <kcmdlineargs.h>
 +
#include <dom/html_document.h>
 +
 
 +
int main (int argc, char *argv[])
 +
{
 +
        KAboutData aboutData( "test", "test",
 +
        "1.0", "test", KAboutData::License_GPL,
 +
        "(c) 2006" );
 +
        KCmdLineArgs::init( argc, argv, &aboutData );
 +
        KApplication khello;
 +
 
 +
        DOM::HTMLDocument doc;
 +
        DOM::DOMString tag("*");
 +
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
 +
 
 +
        doc.loadXML(uri);
 +
        kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
 +
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
 +
        kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl;
 +
        kdDebug() << doc.isHTMLDocument() << endl;
 +
        kdDebug() << doc.toString().string() << endl;
 +
}
 +
</syntaxhighlight>
 +
 
 +
 
 +
Compile it like this:
 +
gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
 +
-L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp
 +
 
 +
=Second=
 +
 
 +
<syntaxhighlight lang="cpp-qt">
 +
#include <kapplication.h>
 +
#include <kaboutdata.h>
 +
#include <kcmdlineargs.h>
 +
#include <dom/html_document.h>
 +
#include <dom/html_element.h>
 +
#include <dom/dom_node.h>
 +
 
 +
int main (int argc, char *argv[])
 +
{
 +
        KAboutData aboutData( "test", "test",
 +
        "1.0", "test", KAboutData::License_GPL,
 +
        "(c) 2006" );
 +
        KCmdLineArgs::init( argc, argv, &aboutData );
 +
        KApplication khello;
 +
 
 +
        DOM::HTMLDocument doc;
 +
        DOM::DOMString tag("*");
 +
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
 +
 
 +
        doc.loadXML(uri);
 +
        kdDebug() << "Here's a list of the document elements" << endl;
 +
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
 +
     
 +
        DOM::HTMLDocument doc2;
 +
        DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
 +
        doc2.loadXML(uri2);
 +
        kdDebug() << "This is the in-memory html:" << endl;
 +
        kdDebug() << doc.toString().string() << endl;
 +
        doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
 +
        kdDebug() << "Moving around nodes" << endl;
 +
        kdDebug() << doc.toString().string() << endl;
 +
}
 +
</syntaxhighlight>

Latest revision as of 21:54, 29 June 2011

For HTML parsing, you have the following possibilities:

  • QXML
  • QDOM
  • Perl
  • XHTML

Obviously, QXML and QDOM need XML-compliant HTML pages, and very few HTML pages are XML-compliant. Perl is outside the scope of this site. This tutorial chooses the XHTML approach.

[edit] First step

As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, the biggest challenge is being able to parse syntax that is not XML-conformant. It works with the following program.

tags.cpp

  1. #include <kapplication.h>
  2. #include <kaboutdata.h>
  3. #include <kcmdlineargs.h>
  4. #include <dom/html_document.h>
  5.  
  6. int main (int argc, char *argv[]) // First example: load an HTML string into a DOM::HTMLDocument and print what it contains via kdDebug().
  7. {
  8.         KAboutData aboutData( "test", "test",
  9.         "1.0", "test", KAboutData::License_GPL,
  10.         "(c) 2006" );
  11.         KCmdLineArgs::init( argc, argv, &aboutData );
  12.         KApplication khello;
  13.  
  14.         DOM::HTMLDocument doc;
  15.         DOM::DOMString tag("*"); // "*" is used below as the tag name for the element query
  16.         DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
  17.  
  18.         doc.loadXML(uri); // despite its name, uri holds the markup itself, not an address
  19.         kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
  20.         for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl; // one output line per matched element, showing its node name
  21.         kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl; // NOTE(review): sizeof is a compile-time operator on the type returned by firstChild(); it does not measure the document's content
  22.         kdDebug() << doc.isHTMLDocument() << endl;
  23.         kdDebug() << doc.toString().string() << endl; // dump the document serialised back to a string
  24. }


Compile it like this:

gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
-L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp

[edit] Second

#include <kapplication.h>
#include <kaboutdata.h>
#include <kcmdlineargs.h>
#include <dom/html_document.h>
#include <dom/html_element.h>
#include <dom/dom_node.h>
 
int main (int argc, char *argv[])
{
        KAboutData aboutData( "test", "test",
        "1.0", "test", KAboutData::License_GPL,
        "(c) 2006" );
        KCmdLineArgs::init( argc, argv, &aboutData );
        KApplication khello;
 
        DOM::HTMLDocument doc;
        DOM::DOMString tag("*");
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
 
        doc.loadXML(uri);
        kdDebug() << "Here's a list of the document elements" << endl;
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
 
        DOM::HTMLDocument doc2;
        DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
        doc2.loadXML(uri2);
        kdDebug() << "This is the in-memory html:" << endl;
        kdDebug() << doc.toString().string() << endl;
        doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
        kdDebug() << "Moving around nodes" << endl;
        kdDebug() << doc.toString().string() << endl;
}

This page was last modified on 29 June 2011, at 21:54. This page has been accessed 4,954 times. Content is available under Creative Commons License SA 3.0 as well as the GNU Free Documentation License 1.2.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V.Legal