Hello, I am trying to parse http://vk.com but am having some trouble. I'm using bash:
For example (the variables and files are declared and exist):
curl -A "$USERAGENT" -b /tmp/vklogin.cookie -L "http://m.vk.com/search" | xmlstarlet sel -t -v /html/body/
The trouble:
-:12.312: xmlParseEntityRef: no name
"display:none" />');var f=document.getElementById("up_helper"),l="file"==f.type&
^
-:12.313: xmlParseEntityRef: no name
display:none" />');var f=document.getElementById("up_helper"),l="file"==f.type&&
^
-:12.459: xmlParseEntityRef: no name
window?c:b,h;try{h=typeof navigator.geolocation!==a?c:typeof window.google!==a&
^
-:12.466: EntityRef: expecting ';'
?c:b,h;try{h=typeof navigator.geolocation!==a?c:typeof window.google!==a&&typeof
^
-:12.504: xmlParseEntityRef: no name
ion!==a?c:typeof window.google!==a&&typeof google.gears!==a?c:typeof device!==a&
^
-:12.511: EntityRef: expecting ';'
?c:typeof window.google!==a&&typeof google.gears!==a?c:typeof device!==a&&typeof
^
-:12.558: xmlParseEntityRef: no name
s!==a?c:typeof device!==a&&typeof device.getServiceObject!==a?c:typeof Mojo!==a&
^
-:12.559: xmlParseEntityRef: no name
!==a?c:typeof device!==a&&typeof device.getServiceObject!==a?c:typeof Mojo!==a&&
^
-:12.708: xmlParseEntityRef: no name
}var i;a:{try{var j=e.Audio?new Audio:g.createElement("audio");if(j.canPlayType&
^
-:12.722: EntityRef: expecting ';'
var j=e.Audio?new Audio:g.createElement("audio");if(j.canPlayType&&j.canPlayType
^
-:12.737: xmlParseEntityRef: no name
ew Audio:g.createElement("audio");if(j.canPlayType&&j.canPlayType("audio/mpeg")&
^
-:12.738: xmlParseEntityRef: no name
w Audio:g.createElement("audio");if(j.canPlayType&&j.canPlayType("audio/mpeg")&&
^
-:15.8: Opening and ending tag mismatch: base line 10 and head
</head>
^
-:27.55: EntityRef: expecting ';'
<form method="post" action="https://login.vk.com/?act=login&to=c2VhcmNo&_origin=
^
-:27.72: EntityRef: expecting ';'
<form method="post" action="https://login.vk.com/?act=login&to=c2VhcmNo&_origin=
^
-:27.93: EntityRef: expecting ';'
action="https://login.vk.com/?act=login&to=c2VhcmNo&_origin=http://m.vk.com&ip_h
^
-:27.117: EntityRef: expecting ';'
.com/?act=login&to=c2VhcmNo&_origin=http://m.vk.com&ip_h=1526eff9b3ffcd2220&role
^
-:27.126: EntityRef: expecting ';'
=login&to=c2VhcmNo&_origin=http://m.vk.com&ip_h=1526eff9b3ffcd2220&role=pda&utf8
^
-:40.41: EntityRef: expecting ';'
<div class="near_btn"><a href="/login?act=restore&to=c2VhcmNo">Забули па
^
-:51.47: EntityRef: expecting ';'
<li><a href="/settings?act=change_regional&hash=d269719b205db90ebb&to=c2VhcmNo&l
^
-:51.69: EntityRef: expecting ';'
<li><a href="/settings?act=change_regional&hash=d269719b205db90ebb&to=c2VhcmNo&l
^
-:51.86: EntityRef: expecting ';'
href="/settings?act=change_regional&hash=d269719b205db90ebb&to=c2VhcmNo&lang_id
^
-:51.147: EntityRef: expecting ';'
ang_id=0">Русский</a></li><li><a href="/settings?act=change_regional&hash
^
-:51.169: EntityRef: expecting ';'
й</a></li><li><a href="/settings?act=change_regional&hash=d269719b205db90ebb&to
^
-:51.186: EntityRef: expecting ';'
href="/settings?act=change_regional&hash=d269719b205db90ebb&to=c2VhcmNo&lang_id
^
-:58.36: xmlParseEntityRef: no name
<script id="page_script" type="text/javascript">!window.__jsInited&&(function(){
^
-:58.37: xmlParseEntityRef: no name
<script id="page_script" type="text/javascript">!window.__jsInited&&(function(){
^
-:58.96: AttValue: " or ' expected
!window.__jsInited&&(function(){window.langConfig={"id":1,"numDel":"<span class=
^
-:58.96: attributes construct error
!window.__jsInited&&(function(){window.langConfig={"id":1,"numDel":"<span class=
^
-:58.96: Couldn't find end of Start Tag span line 58
!window.__jsInited&&(function(){window.langConfig={"id":1,"numDel":"<span class=
^
-:58.112: StartTag: invalid element name
ed&&(function(){window.langConfig={"id":1,"numDel":"<span class=\"num_delim\"> <
^
-:58.700: Entity 'larr' not defined
],[10,[2,3,4],2],["*",0,3]],"float":2},"sexRules":[[1,2],["*",1]],"larr":"←
^
-:58.716: Entity 'rarr' not defined
,["*",0,3]],"float":2},"sexRules":[[1,2],["*",1]],"larr":"←","rarr":"→
^
-:58.1221: StartTag: invalid element name
ть назву..","mobile_audio_search_not_found":"За запитом «<b>%s<
^
-:58.1725: Opening and ending tag mismatch: b line 58 and script
ів"]});window.vk={"id":0,"__debug":false};window.__jsInited=true;})();</script
^
-:60.58: EntityRef: expecting ';'
<img width="1" height="1" src="//b.scorecardresearch.com/p?c1=2&c2=13765216&c3=&
^
-:60.70: EntityRef: expecting ';'
<img width="1" height="1" src="//b.scorecardresearch.com/p?c1=2&c2=13765216&c3=&
^
-:60.74: EntityRef: expecting ';'
mg width="1" height="1" src="//b.scorecardresearch.com/p?c1=2&c2=13765216&c3=&c4
^
-:60.145: EntityRef: expecting ';'
16&c3=&c4=http%3A%2F%2Fm.vk.com%2Flogin%3Frole%3Dfast%26to%3Dc2VhcmNo%26s%3D0&c5
^
-:60.149: EntityRef: expecting ';'
3=&c4=http%3A%2F%2Fm.vk.com%2Flogin%3Frole%3Dfast%26to%3Dc2VhcmNo%26s%3D0&c5=&c9
^
-:60.157: EntityRef: expecting ';'
tp%3A%2F%2Fm.vk.com%2Flogin%3Frole%3Dfast%26to%3Dc2VhcmNo%26s%3D0&c5=&c9=c15=&cv
^
-:60.164: EntityRef: expecting ';'
F%2Fm.vk.com%2Flogin%3Frole%3Dfast%26to%3Dc2VhcmNo%26s%3D0&c5=&c9=c15=&cv=2.0&cj
^
-:60.169: EntityRef: expecting ';'
.vk.com%2Flogin%3Frole%3Dfast%26to%3Dc2VhcmNo%26s%3D0&c5=&c9=c15=&cv=2.0&cj=1&rn
^
-:60.196: Opening and ending tag mismatch: script line 58 and div
c2VhcmNo%26s%3D0&c5=&c9=c15=&cv=2.0&cj=1&rn=2980093" alt="" align="left" /></div
^
-:62.21: xmlParseEntityRef: no name
<script type="text/javascript">parent&&parent!==window&&(document.getElementsByT
^
-:62.28: EntityRef: expecting ';'
<script type="text/javascript">parent&&parent!==window&&(document.getElementsByT
^
-:62.38: xmlParseEntityRef: no name
<script type="text/javascript">parent&&parent!==window&&(document.getElementsByT
^
-:62.39: xmlParseEntityRef: no name
<script type="text/javascript">parent&&parent!==window&&(document.getElementsByT
^
-:63.8: Opening and ending tag mismatch: div line 17 and body
</body>
^
-:64.8: Opening and ending tag mismatch: body line 16 and html
</html>
^
-:64.8: Premature end of data in tag head line 4
</html>
^
-:64.8: Premature end of data in tag html line 3
</html>
^
I tried to replace the special XML characters with sed, but it did not help.
It looks like that webpage is not valid XML. You can use xmlstarlet's fo command with the --html option to convert HTML to XML. However, it seems to have some trouble with this page: the errors can be ignored (hence --quiet), but it also puts an <?xml...?> declaration in the wrong place even with the --omit-decl option, which is why the sed step below deletes that line. Finally, the output uses http://www.w3.org/1999/xhtml as the default namespace, so you have to bind a prefix to it with -N and use that prefix in the XPath expression:
curl -A "$USERAGENT" -b /tmp/vklogin.cookie -L "http://m.vk.com/search" | xmlstarlet fo --quiet --html --omit-decl | sed '/^<?xml/d' | xmlstarlet sel -N xhtml=http://www.w3.org/1999/xhtml -t -v /xhtml:html/xhtml:body