Menu

Home

Bhimsen.S.Kularni

Welcome to your wiki!

This is the default page; edit it as you see fit. To add a new page, simply reference it within brackets, e.g. [SamplePage].

The wiki uses Markdown syntax.

Project Admins:


Discussion

  • Bhimsen.S.Kularni

    #!/usr/bin/python3

    # Code goes here ...
    """
    Read Stanford's Event Calendar RSS feed and generate a CSV file.

    Each feed item becomes one line of "cal.csv" in the format
    title,date,location.  Commas inside a field are removed (not quoted),
    so every line holds exactly three comma-separated fields.
    """
    from htmldom import htmldom
    import re

    # Matches "Date:" followed by "<word>, <word> <digits>, <digits>. <rest>";
    # group 1 is the whole date text (presumably e.g. "Saturday, June 30, 2012. 8:00 PM").
    dateReg = re.compile( r'Date:\s+((\w+),\s+(\w+)\s+(\d+),\s+(\d+)\.\s+([\w:\d\s]+))' )
    # Group 1 is the run of word/whitespace characters that follows "Location:".
    locationReg = re.compile( r'Location:\s*([\w\s]+)' )

    page = htmldom.HtmlDom( "http://events.stanford.edu/xml/rss.xml" ).createDom()
    items = page.find( "channel item" )

    # The context manager closes the file even if processing an item raises.
    with open( "cal.csv", mode = "w", encoding = "utf-8" ) as csv_file:
        for item in items:
            title = item.find( "title" ).text().strip()
            # Extract the description once; both patterns search the same text.
            description = item.find( "description" ).text().strip()

            date_match = dateReg.search( description )
            date = date_match.group( 1 ) if date_match else ""

            location_match = locationReg.search( description )
            location = location_match.group( 1 ) if location_match else ""

            # Strip commas from every field so they cannot break the column layout.
            fields = ( title, date, location )
            csv_file.write( ",".join( field.replace( ",", "" ) for field in fields ) + "\n" )
    
     

    Last edit: Bhimsen.S.Kularni 2012-06-30
  • Bhimsen.S.Kularni

    #!/usr/bin/python3

    # Code goes here ...
    """
    Extract story titles and descriptions from http://digg.com/ and
    dynamically generate a new page ("new_page.html") listing them in a table,
    one row per story.
    """
    from htmldom import htmldom

    page = htmldom.HtmlDom( "http://digg.com/" ).createDom()
    items = page.find( "div.story-item-content" )
    # Skeleton document; one <tr> per story is appended to its empty <table>.
    new_page = htmldom.HtmlDom().createDom( "<html><head><title>Digged Data</title></head><body><table></table></body></html>" )

    for item in items:
        title = item.find( ".story-item-title" ).text().strip()
        description = item.find( ".story-item-description" ).text().strip()
        # NOTE(review): the scraped text is inserted into the markup as-is;
        # confirm whether text() can return characters that need HTML escaping.
        new_page.find( "table" ).append( "<tr><td>" + title + "</td><td>" + description + "</td></tr>" )

    # The context manager closes the file even if serialising or writing raises.
    with open( "new_page.html", mode = "w", encoding = "utf-8" ) as f:
        f.write( new_page.find( "html" ).html() )
    
     
  • Bhimsen.S.Kularni

    #!/usr/bin/python3

    """
    Extract the top 250 movies from http://www.imdb.com/chart/top and store
    them in "movie_list.csv", one movie per line, as "|"-separated fields:
    name|year|votes|rating|rank
    """
    from htmldom import htmldom
    import re

    # Matches a parenthesised run of digits, e.g. the "(1994)" part of a title cell.
    year_reg = re.compile( r'\((\d+)\)' )

    page = htmldom.HtmlDom( "http://www.imdb.com/chart/top" ).createDom()

    # Slicing is needed since the first row contains headers.
    rows = page.find( "div#main table tr" )[1:]

    # The context manager closes the file even if a row fails to parse.
    with open( "movie_list.csv", mode = "w", encoding = "utf-8" ) as mv_file:
        for row in rows:
            columns = row.find( "td" )
            # Cell order as read from each row: rank, rating, name, votes.
            rank = columns[ 0 ].text().strip()
            rating = columns[ 1 ].text().strip()
            name = columns[ 2 ].text().strip()
            votes = columns[ 3 ].text().strip()

            year_match = year_reg.search( name )
            year = year_match.group( 1 ) if year_match else "Not Given"

            # Remove the "(digits)" part and any embedded newlines from the name.
            clean_name = year_reg.sub( '', name ).replace( "\n", "" )
            # rank[:-1] drops the last character of the rank cell,
            # presumably a trailing "." -- TODO confirm against the page markup.
            mv_file.write( "|".join( ( clean_name, year, votes, rating, rank[:-1] ) ) + "\n" )
    
     

Log in to post a comment.