Development/Tutorials/Programming Tutorial KDE 3/KHTML: Difference between revisions

From KDE TechBase
m (Text replace - "</code>" to "</syntaxhighlight>")
 
(13 intermediate revisions by 8 users not shown)
Line 2: Line 2:
* QXML
* QXML
* QDOM
* QDOM
* perl
* Perl
* khtml
* XHTML
Obviously, QXML and QDOM need xml-compliant html pages, and the least html pages are xml-compliant. Perl is not the scope of this site. So, this tutorial choses the khtml approach.  
Obviously, QXML and QDOM need XML-compliant HTML pages, and the least HTML pages are XML-compliant. Perl is not the scope of this site. This tutorial chooses the XHTML approach.  
 
=First step=
=First step=
Our first khtml-program does plain nothing:
As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, biggest thing is to be able to parse non-XML-conform syntax. It works with the following program.
<highlightSyntax language="cpp">
 
#include <qstring.h>
'''tags.cpp'''
<syntaxhighlight lang="cpp-qt" line>
#include <kapplication.h>
#include <kapplication.h>
#include <kaboutdata.h>
#include <kaboutdata.h>
#include <kmessagebox.h>
#include <kcmdlineargs.h>
#include <kcmdlineargs.h>
#include <dom/html_document.h>
#include <dom/html_document.h>
Line 22: Line 23:
         KCmdLineArgs::init( argc, argv, &aboutData );
         KCmdLineArgs::init( argc, argv, &aboutData );
         KApplication khello;
         KApplication khello;
         DOM::HTMLDocument();
 
         DOM::HTMLDocument doc;
        DOM::DOMString tag("*");
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
 
        doc.loadXML(uri);
        kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
        kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl;
        kdDebug() << doc.isHTMLDocument() << endl;
        kdDebug() << doc.toString().string() << endl;
}
}
</highlightSyntax>
</syntaxhighlight>
It can be compiled like:
 
 
Compile it like this:
  gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
  gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
  -L/opt/kde3/lib -lkdeui -lkhtml -o khtml khtml.cpp
  -L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp
 
=Second=


=Showing tags=
<syntaxhighlight lang="cpp-qt">
The next program is more advanced, it shows you the first tags of an html file:
<highlightSyntax language="cpp">
#include <kapplication.h>
#include <kapplication.h>
#include <kaboutdata.h>
#include <kaboutdata.h>
#include <kcmdlineargs.h>
#include <kcmdlineargs.h>
#include <dom/html_document.h>
#include <dom/html_document.h>
#include <dom/html_element.h>
#include <dom/dom_node.h>


int main (int argc, char *argv[])
int main (int argc, char *argv[])
{
{
  KAboutData aboutData( "test", "test",
        KAboutData aboutData( "test", "test",
  "1.0", "test", KAboutData::License_GPL,
        "1.0", "test", KAboutData::License_GPL,
  "(c) 2006" );
        "(c) 2006" );
  KCmdLineArgs::init( argc, argv, &aboutData );
        KCmdLineArgs::init( argc, argv, &aboutData );
  KApplication khello;
        KApplication khello;
 
  DOM::Document doc=DOM::Document();
        DOM::HTMLDocument doc;
  DOM::HTMLDocument htmldoc=DOM::HTMLDocument();
        DOM::DOMString tag("*");
  DOM::DOMString tag("*");
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
  doc.loadXML("hello.htm");
 
  kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
        doc.loadXML(uri);
  kdDebug() << "First child node name: " << doc.firstChild().nodeName().string() << endl;
        kdDebug() << "Here's a list of the document elements" << endl;
  kdDebug() << "First grandchild node name: " << doc.firstChild().firstChild().nodeName().string() << endl;
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
  kdDebug() << "Count of elements in your doc " << doc.getElementsByTagName(tag).length()<< endl;
     
  kdDebug() << "Size of your doc " << sizeof(doc) << endl;
        DOM::HTMLDocument doc2;
  kdDebug() << doc.toString().string() << endl;
        DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
        doc2.loadXML(uri2);
        kdDebug() << "This is the in-memory html:" << endl;
        kdDebug() << doc.toString().string() << endl;
        doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
        kdDebug() << "Moving around nodes" << endl;
        kdDebug() << doc.toString().string() << endl;
}
}
</highlightSyntax>
</syntaxhighlight>
You can use this e.g. with the following
hello.htm:
<pre>
<html>
<head>
<title>blah</title>
</head>
<body>
<b>fat</b>
<a href="http://www.de">denic</a>
</body>
</html>
</pre>
 
You get an error because your file is not UTF-16 encoded. Here's how I proceed:
scorpio:~/html # hexdump hello.htm
0000000 3c00 000a
0000003

Latest revision as of 20:54, 29 June 2011

For HTML parsing, you have the following possibilities:

  • QXML
  • QDOM
  • Perl
  • KHTML

Obviously, QXML and QDOM need XML-compliant HTML pages, and very few HTML pages are XML-compliant. Perl is outside the scope of this site. This tutorial therefore chooses the KHTML approach.

First step

As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, the biggest challenge is being able to parse syntax that does not conform to XML. The following program shows that this works.

tags.cpp

#include <kapplication.h>
#include <kaboutdata.h>
#include <kcmdlineargs.h>
#include <dom/html_document.h>

int main (int argc, char *argv[])
{
        KAboutData aboutData( "test", "test",
        "1.0", "test", KAboutData::License_GPL,
        "(c) 2006" );
        KCmdLineArgs::init( argc, argv, &aboutData );
        KApplication khello;

        DOM::HTMLDocument doc;
        DOM::DOMString tag("*");
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");

        doc.loadXML(uri);
        kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
        kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl;
        kdDebug() << doc.isHTMLDocument() << endl;
        kdDebug() << doc.toString().string() << endl;
}


Compile it like this:

gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
-L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp

Second

#include <kapplication.h>
#include <kaboutdata.h>
#include <kcmdlineargs.h>
#include <dom/html_document.h>
#include <dom/html_element.h>
#include <dom/dom_node.h>

int main (int argc, char *argv[])
{
        KAboutData aboutData( "test", "test",
        "1.0", "test", KAboutData::License_GPL,
        "(c) 2006" );
        KCmdLineArgs::init( argc, argv, &aboutData );
        KApplication khello;

        DOM::HTMLDocument doc;
        DOM::DOMString tag("*");
        DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");

        doc.loadXML(uri);
        kdDebug() << "Here's a list of the document elements" << endl;
        for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
       
        DOM::HTMLDocument doc2;
        DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
        doc2.loadXML(uri2);
        kdDebug() << "This is the in-memory html:" << endl;
        kdDebug() << doc.toString().string() << endl;
        doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
        kdDebug() << "Moving around nodes" << endl;
        kdDebug() << doc.toString().string() << endl;
}