#!/usr/bin/python3

# Code goes here ...
"""
 This script reads Stanford's Event Calendar page and generates a CSV file with this format: title,date,location
"""
from htmldom import htmldom
import re

# Fetch the Stanford events RSS feed and select every <item> under <channel>.
page = htmldom.HtmlDom("http://events.stanford.edu/xml/rss.xml").createDom()
items = page.find("channel item")

# Group 1 of dateReg is the full text following "Date:" (word, word digits,
# digits. then a run of word/colon/space characters); presumably
# "weekday, month day, year. time" -- confirm against the live feed.
dateReg = re.compile(r'Date:\s+((\w+),\s+(\w+)\s+(\d+),\s+(\d+)\.\s+([\w:\d\s]+))')
# Group 1 is the run of word/whitespace characters following "Location:".
locationReg = re.compile(r'Location:\s*([\w\s]+)')
# Commas are stripped from every field because the output is written as a
# plain comma-separated line without quoting.
comma = re.compile(r',')

# The context manager guarantees cal.csv is flushed and closed even if
# parsing one of the items raises.
with open("cal.csv", mode="w", encoding="utf-8") as csv_file:
    for item in items:
        title = item.find("title").text().strip()
        # Read the description once; both patterns search the same text.
        description = item.find("description").text().strip()

        date = dateReg.search(description)
        date = date.group(1) if date else ""

        location = locationReg.search(description)
        location = location.group(1) if location else ""

        fields = (title, date, location)
        csv_file.write(",".join(comma.sub('', field) for field in fields) + "\n")

Last edit: Bhimsen.S.Kularni 2012-06-30

#!/usr/bin/python3

# Code goes here ...
"""
    Extracting Content from http://digg.com/ and dynamically generating a new page.
"""
from htmldom import htmldom

# Fetch the digg front page and select each story container.
page = htmldom.HtmlDom("http://digg.com/").createDom()
items = page.find("div.story-item-content")

# Skeleton document; one table row is appended per story below.
new_page = htmldom.HtmlDom().createDom("<html><head><title>Digged Data</title></head><body><table></table></body></html>")

for item in items:
    title = item.find(".story-item-title").text().strip()
    description = item.find(".story-item-description").text().strip()
    # NOTE(review): the scraped text is concatenated into markup without
    # HTML escaping; whether text() already returns escaped content is not
    # visible here -- confirm before reusing this on untrusted pages.
    new_page.find("table").append("<tr><td>" + title + "</td><td>" + description + "</td></tr>")

# The context manager guarantees the output file is closed even if
# serialising the generated page raises.
with open("new_page.html", mode="w", encoding="utf-8") as f:
    f.write(new_page.find("html").html())

#!/usr/bin/python3

"""
    Extracts the name, year, votes, rating and rank of the top 250 movies from http://www.imdb.com/chart/top
    and stores them in a "|"-separated file. [ format: name|year|votes|rating|rank ]
"""
from htmldom import htmldom
import re

# Matches a parenthesised run of digits such as "(1994)"; group 1 is the digits.
year_reg = re.compile(r'\((\d+)\)')
# Used to remove line breaks embedded in the scraped movie name.
new_line = re.compile(r'\n')

page = htmldom.HtmlDom("http://www.imdb.com/chart/top").createDom()

# Slicing is needed since the first row contains headers.
rows = page.find("div#main table tr")[1:]

# The context manager guarantees movie_list.csv is flushed and closed even
# if a row is malformed and raises part-way through.
with open("movie_list.csv", mode="w", encoding="utf-8") as mv_file:
    for row in rows:
        columns = row.find("td")
        # Cell order used by this script: 0 = rank, 1 = rating, 2 = name, 3 = votes.
        name = columns[2].text().strip()
        votes = columns[3].text().strip()
        rating = columns[1].text().strip()
        rank = columns[0].text().strip()

        year = year_reg.search(name)
        year = year.group(1) if year else "Not Given"

        # The written name has the "(year)" part and any line breaks removed.
        clean_name = new_line.sub('', year_reg.sub('', name))
        # rank[:-1] drops the last character of the rank cell (presumably a
        # trailing "." -- confirm against the page markup).
        mv_file.write("|".join((clean_name, year, votes, rating, rank[:-1])) + "\n")

HTML DOM Parser Examples

HTML parser which can be used for screen-scraping applications

Home

Project Admins:

Discussion

!/usr/bin/python3

!/usr/bin/python3

!/usr/bin/python3