Development/Tutorials/Programming Tutorial KDE 3/KHTML: Difference between revisions

    From KDE TechBase
    m (Text replace - "</code>" to "</syntaxhighlight>")
     
    (13 intermediate revisions by 8 users not shown)
    Line 2: Line 2:
    * QXML
    * QXML
    * QDOM
    * QDOM
    * perl
    * Perl
    * khtml
    * XHTML
    Obviously, QXML and QDOM need xml-compliant html pages, and the least html pages are xml-compliant. Perl is not the scope of this site. So, this tutorial choses the khtml approach.  
    Obviously, QXML and QDOM need XML-compliant HTML pages, and the least HTML pages are XML-compliant. Perl is not the scope of this site. This tutorial chooses the XHTML approach.  
     
    =First step=
    =First step=
    Our first khtml-program does plain nothing:
    As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, biggest thing is to be able to parse non-XML-conform syntax. It works with the following program.
    <highlightSyntax language="cpp">
     
    #include <qstring.h>
    '''tags.cpp'''
    <syntaxhighlight lang="cpp-qt" line>
    #include <kapplication.h>
    #include <kapplication.h>
    #include <kaboutdata.h>
    #include <kaboutdata.h>
    #include <kmessagebox.h>
    #include <kcmdlineargs.h>
    #include <kcmdlineargs.h>
    #include <dom/html_document.h>
    #include <dom/html_document.h>
    Line 22: Line 23:
             KCmdLineArgs::init( argc, argv, &aboutData );
             KCmdLineArgs::init( argc, argv, &aboutData );
             KApplication khello;
             KApplication khello;
             DOM::HTMLDocument();
     
             DOM::HTMLDocument doc;
            DOM::DOMString tag("*");
            DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
     
            doc.loadXML(uri);
            kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
            for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
            kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl;
            kdDebug() << doc.isHTMLDocument() << endl;
            kdDebug() << doc.toString().string() << endl;
    }
    }
    </highlightSyntax>
    </syntaxhighlight>
    It can be compiled like:
     
     
    Compile it like this:
      gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
      gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
      -L/opt/kde3/lib -lkdeui -lkhtml -o khtml khtml.cpp
      -L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp
     
    =Second=


    =Showing tags=
    <syntaxhighlight lang="cpp-qt">
    The next program is more advanced, it shows you the first tags of an html file:
    <highlightSyntax language="cpp">
    #include <kapplication.h>
    #include <kapplication.h>
    #include <kaboutdata.h>
    #include <kaboutdata.h>
    #include <kcmdlineargs.h>
    #include <kcmdlineargs.h>
    #include <dom/html_document.h>
    #include <dom/html_document.h>
    #include <dom/html_element.h>
    #include <dom/dom_node.h>


    int main (int argc, char *argv[])
    int main (int argc, char *argv[])
    {
    {
      KAboutData aboutData( "test", "test",
            KAboutData aboutData( "test", "test",
      "1.0", "test", KAboutData::License_GPL,
            "1.0", "test", KAboutData::License_GPL,
      "(c) 2006" );
            "(c) 2006" );
      KCmdLineArgs::init( argc, argv, &aboutData );
            KCmdLineArgs::init( argc, argv, &aboutData );
      KApplication khello;
            KApplication khello;
     
      DOM::Document doc=DOM::Document();
            DOM::HTMLDocument doc;
      DOM::HTMLDocument htmldoc=DOM::HTMLDocument();
            DOM::DOMString tag("*");
      DOM::DOMString tag("*");
            DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
      doc.loadXML("hello.htm");
     
      kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
            doc.loadXML(uri);
      kdDebug() << "First child node name: " << doc.firstChild().nodeName().string() << endl;
            kdDebug() << "Here's a list of the document elements" << endl;
      kdDebug() << "First grandchild node name: " << doc.firstChild().firstChild().nodeName().string() << endl;
            for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
      kdDebug() << "Count of elements in your doc " << doc.getElementsByTagName(tag).length()<< endl;
         
      kdDebug() << "Size of your doc " << sizeof(doc) << endl;
            DOM::HTMLDocument doc2;
      kdDebug() << doc.toString().string() << endl;
            DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
            doc2.loadXML(uri2);
            kdDebug() << "This is the in-memory html:" << endl;
            kdDebug() << doc.toString().string() << endl;
            doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
            kdDebug() << "Moving around nodes" << endl;
            kdDebug() << doc.toString().string() << endl;
    }
    }
    </highlightSyntax>
    </syntaxhighlight>
    You can use this e.g. with the following
    hello.htm:
    <pre>
    <html>
    <head>
    <title>blah</title>
    </head>
    <body>
    <b>fat</b>
    <a href="http://www.de">denic</a>
    </body>
    </html>
    </pre>
     
    You get an error because your file is not UTF-16 encoded. Here's how I proceed:
    scorpio:~/html # hexdump hello.htm
    0000000 3c00 000a
    0000003

    Latest revision as of 20:54, 29 June 2011

    For HTML parsing, you have the following possibilities:

    • QXML
    • QDOM
    • Perl
    • XHTML

    Obviously, QXML and QDOM need XML-compliant HTML pages, and very few HTML pages are XML-compliant. Perl is outside the scope of this site. This tutorial chooses the XHTML approach.

    First step

    As we remember from http://developernew.kde.org/Development/Tutorials/Programming_Tutorial_KDE_4/How_to_write_an_HTML_parser, the most important requirement is being able to parse syntax that does not conform to XML. The following program does exactly that.

    tags.cpp

    #include <kapplication.h>
    #include <kaboutdata.h>
    #include <kcmdlineargs.h>
    #include <dom/html_document.h>
    
    int main (int argc, char *argv[])
    {
            KAboutData aboutData( "test", "test",
            "1.0", "test", KAboutData::License_GPL,
            "(c) 2006" );
            KCmdLineArgs::init( argc, argv, &aboutData );
            KApplication khello;
    
            DOM::HTMLDocument doc;
            DOM::DOMString tag("*");
            DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
    
            doc.loadXML(uri);
            kdDebug() << "Does this doc have child elements ? " << doc.hasChildNodes() << endl;
            for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
            kdDebug() << "Size of your doc " << sizeof(doc.firstChild()) << endl;
            kdDebug() << doc.isHTMLDocument() << endl;
            kdDebug() << doc.toString().string() << endl;
    }
    


    Compile it like this:

    gcc -I/usr/lib/qt3/include -I/opt/kde3/include \
    -L/opt/kde3/lib -lkdeui -lkhtml -o tags tags.cpp
    

    Second

    #include <kapplication.h>
    #include <kaboutdata.h>
    #include <kcmdlineargs.h>
    #include <dom/html_document.h>
    #include <dom/html_element.h>
    #include <dom/dom_node.h>
    
    int main (int argc, char *argv[])
    {
            KAboutData aboutData( "test", "test",
            "1.0", "test", KAboutData::License_GPL,
            "(c) 2006" );
            KCmdLineArgs::init( argc, argv, &aboutData );
            KApplication khello;
    
            DOM::HTMLDocument doc;
            DOM::DOMString tag("*");
            DOM::DOMString uri("<html><body><a href=\"http://www.kde.org/\"><b>fat</b></a><a href=\"/index.php\" nowrap>Log in</a><a href=\"http://www.gmx.de\"></a></body></html>");
    
            doc.loadXML(uri);
            kdDebug() << "Here's a list of the document elements" << endl;
            for (int i=0; i<doc.getElementsByTagName(tag).length(); i++) kdDebug() << doc.getElementsByTagName(tag).item(i).nodeName().string() << endl;
           
            DOM::HTMLDocument doc2;
            DOM::DOMString uri2("<html><body>this is html<b>fat</b></body></html>");
            doc2.loadXML(uri2);
            kdDebug() << "This is the in-memory html:" << endl;
            kdDebug() << doc.toString().string() << endl;
            doc.body().insertBefore(doc.body().firstChild().firstChild(),doc.body().firstChild());
            kdDebug() << "Moving around nodes" << endl;
            kdDebug() << doc.toString().string() << endl;
    }