Menu

cannot extract the text alone using htmlcxx

Help
Anonymous
2010-08-30
2013-05-15
  • Anonymous

    Anonymous - 2010-08-30

    #include <strings.h>

    #include <iostream>
    #include <sstream>
    #include <string>

    #include <curl/curl.h>
    #include <htmlcxx/html/ParserDom.h>

    using namespace std;
    using namespace htmlcxx;
    /**
     * libcurl write callback: appends the received chunk to an output stream.
     *
     * @param buf    Pointer to the received data (not NUL-terminated).
     * @param size   Size of one data item in bytes.
     * @param nmemb  Number of items in the chunk.
     * @param userp  Pointer to the std::ostringstream that collects the data.
     * @return Number of bytes consumed (size * nmemb), or 0 when userp is
     *         null or the stream write failed.
     */
    static size_t http_write(void* buf, size_t size, size_t nmemb, void* userp)
    {
        if (!userp)
        {
            return 0;
        }

        std::ostringstream* oss = static_cast<std::ostringstream*>(userp);
        // The callback contract is expressed in bytes, not in items, so the
        // byte count (not nmemb alone) must be reported back to the caller.
        const size_t bytes = size * nmemb;
        oss->write(static_cast<const char*>(buf), static_cast<std::streamsize>(bytes));

        // Report a short write if the stream went bad so the transfer stops.
        return oss->good() ? bytes : 0;
    }
    string get_html_page(const string& url, long timeout = 0)
    {
        CURL* curl = curl_easy_init();
        ostringstream oss;
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &http_write);
        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_FILE, &oss);
        curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_perform(curl);
        curl_easy_cleanup(curl);
        return oss.str();
    }
    int main()
    {
        string html = get_html_page("http://www.google.co.in");
        //cout << html << endl;
         HTML::ParserDom parser;
          tree<HTML::Node> dom = parser.parseTree(html);
          //Print whole DOM tree
          //cout <<dom <<endl;
          //Dump all links in the tree
          tree<HTML::Node>::iterator it = dom.begin();
          tree<HTML::Node>::iterator end = dom.end();
          for (; it !=end; ++it)
          {
             if (strcasecmp(it->tagName().c_str(), "A") == 0)
             {
               it->parseAttributes();
               //cout << it->attribute("href").second << endl;
             }
          }
          //Dump all text of the document
          it = dom.begin();
          end = dom.end();
          for (; it != end; ++it)
          {
            if ((!it->isTag()) && (!it->isComment()))
            {
              cout << it->text();
            }
          }
        //  cout << endl;
        return 0;
    }
    

    I am getting this as output:

    Googlewindow.google={kEI:"UsF7TK35Ns-3rAehkoTbBg",kEXPI:"25651,25901,26119,26325",kCSI:{e:"25651,25901,26119,26325",ei:"UsF7TK35Ns-3rAehkoTbBg",expi:"25651,25901,26119,26325"},ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(b,d,c){var a=new Image,e=google,g=e.lc,f=e.li;a.onerror=(a.onload=(a.onabort=function(){delete g[f]}));g[f]=a;c=c||"/gen_204?atyp=i&ct="+b+"&cad="+d+"&zx="+google.time();a.src=c;e.li=f+1},lc:[],li:0,Toolbelt:{}};
    window.google.sn="webhp";window.google.timers={load:{t:{start:(new Date).getTime()}}};try{}catch(u){}window.google.jsrt_kill=1;
    var _gjwl=location;function _gjuc(){var e=_gjwl.href.indexOf("#");if(e>=0){var a=_gjwl.href.substring(e);if(a.indexOf("&q=")>0||a.indexOf("#q=")>=0){a=a.substring(1);if(a.indexOf("#")==-1){for(var c=0;c<a.length;){var d=c;if(a.charAt(d)=="&")++d;var b=a.indexOf("&",d);if(b==-1)b=a.length;var f=a.substring(d,b);if(f.indexOf("fp=")==0){a=a.substring(0,c)+a.substring(b,a.length);b=c}else if(f=="cad=h")return 0;c=b}_gjwl.href="/search?"+a+"&cad=h";return 1}}}return 0}function _gjp(){!(window._gjwl.hash&&
    window._gjuc())&&setTimeout(_gjp,500)};
    window._gjp && _gjp()body{margin:0}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}body,td,a,p,.h{font-family:arial,sans-serif}.h{color:#36c;font-size:20px}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{width:496px}.tiah{width:458px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}#gog{background:#fff}#gbar,#guser{font-size:13px;padding-top:1px !important}#gbar{float:left;height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}#gbs,.gbm{background:#fff;left:0;position:absolute;text-align:left;visibility:hidden;z-index:1000}.gbm{border:1px solid;border-color:#c9d7f1 #36c #36c #a2bae7;z-index:1001}.gb1{margin-right:.5em}.gb1,.gb3{zoom:1}.gb2{display:block;padding:.2em .5em}.gb2,.gb3{text-decoration:none;border-bottom:none}a.gb1,a.gb2,a.gb3,a.gb4{color:#00c !important}a.gb2:hover{background:#36c;color:#fff !important}body{background:#fff;color:black}input{-moz-box-sizing:content-box}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#4272db}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff!important}.ds{display:-moz-inline-box}.ds{border-bottom:solid 1px #e7e7e7;border-right:solid 1px #e7e7e7;display:inline-block;margin:3px 0 4px;margin-left:4px}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px;}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px;display:block}.lsb{background:url(/images/srpr/nav_logo14.png) bottom;font:15px arial,sans-serif;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}.ftl,#fll a{margin:0 12px}#addlang a{padding:0 3px}.gac_v 
div{display:none}.gac_v .gac_v2,.gac_bt{display:block!important}google.y={};google.x=function(e,g){google.y[e.id]=[e,g];return false};window.gbar={qs:function(){},tg:function(e){var o={id:'gbar'};for(i in e)o[i]=e[i];google.x(o,function(){gbar.tg(o)})}};Web Images Maps News Orkut Books Gmail more &#9660;Translate Scholar Blogs YouTube Calendar Photos Documents Reader Sites Groups even more &raquo; iGoogle | Search settings | Sign in India&nbsp;Advanced SearchLanguage ToolsGoogle.co.in offered in: Hindi Bengali Telugu Marathi Tamil Gujarati Kannada Malayalam PunjabiAdvertising&nbsp;ProgramsAbout GoogleGo to Google.com&copy; 2010 - Privacy if(google.y)google.y.first=[];if(google.y)google.y.first=[];google.dstr=[];google.rein=[];window.setTimeout(function(){var a=document.createElement("script");a.src="/extern_js/f/CgJlbhICaW4gACswRTgBLCswWjgDLCswDjgALCswFzgHLCswJzgELCswPDgDLCswUTgDLCswCjhzQB0sKzAWOB0sKzAZOCAsKzAlOMqIASwrMDU4BCwrMEA4EiwrMEE4BSwrME44BiwrMFQ4ASwrMBg4BSwrMCY4DSyAAheQAhg/x2R96GGjycQ.js";(document.getElementById("xjsd")||document.body).appendChild(a);if(google.timers&&google.timers.load.t)google.timers.load.t.xjsls=(new Date).getTime();},0);
    ;google.neegg=1;google.y.first.push(function(){var form=document.f||document.f||document.gs;google.ac.i(form,form.q,'','','',{o:1,sw:1});google.History&&google.History.initialize('/')});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);google.fade=null;}(function(){
    var b,d,e,f;function g(a,c){if(a.removeEventListener){a.removeEventListener("load",c,false);a.removeEventListener("error",c,false)}else{a.detachEvent("onload",c);a.detachEvent("onerror",c)}}function h(a){f=(new Date).getTime();++d;a=a||window.event;var c=a.target||a.srcElement;g(c,h)}var i=document.getElementsByTagName("img");b=i.length;d=0;for(var j=0,k;j<b;++j){k=i[j];if(k.complete||typeof k.src!="string"||!k.src)++d;else if(k.addEventListener){k.addEventListener("load",h,false);k.addEventListener("error",
    h,false)}else{k.attachEvent("onload",h);k.attachEvent("onerror",h)}}e=b-d;function l(){if(!google.timers.load.t)return;google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=f;google.kCSI.imc=d;google.kCSI.imn=b;google.kCSI.imp=e;google.timers.load.t.xjs&&google.report&&google.report(google.timers.load,google.kCSI)}if(window.addEventListener)window.addEventListener("load",l,false);else if(window.attachEvent)window.attachEvent("onload",l);google.timers.load.t.prt=(f=(new Date).getTime());
    })();
    

    Please tell me why I am also getting the JavaScript code…

     
  • Nikita Vostretsov

    To skip JavaScript and style information, skip the contents of the "script" and "style" HTML tags.

     

Log in to post a comment.

Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.