From: <jbo...@li...> - 2005-11-27 16:00:26
|
Author: aron.gombas Date: 2005-11-27 11:00:20 -0500 (Sun, 27 Nov 2005) New Revision: 1653 Modified: trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfConstants.java trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfServiceImpl.java Log: SF service handles the new SF site Modified: trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfConstants.java =================================================================== --- trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfConstants.java 2005-11-27 15:47:02 UTC (rev 1652) +++ trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfConstants.java 2005-11-27 16:00:20 UTC (rev 1653) @@ -20,5 +20,5 @@ public final static String SF_BASEURL = "http://www.sourceforge.net"; /** Dateformat used in the SF site. */ - public final static DateFormat SF_FILERELEASE_DATEFORMAT = new SimpleDateFormat("MMM d, yyyy"); + public final static DateFormat SF_FILERELEASE_DATEFORMAT = new SimpleDateFormat("MMMMMM d, yyyy"); } Modified: trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfServiceImpl.java =================================================================== --- trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfServiceImpl.java 2005-11-27 15:47:02 UTC (rev 1652) +++ trunk/labs/kosmos/src/java/hu/midori/kosmos/server/sf/SfServiceImpl.java 2005-11-27 16:00:20 UTC (rev 1653) @@ -15,15 +15,13 @@ import java.net.URL; import java.util.ArrayList; import java.util.Date; -import java.util.Iterator; import java.util.List; -import java.util.StringTokenizer; -import net.sf.saxon.om.NodeInfo; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; /** * Implementation of the <i>SourceForge</i> service. @@ -59,45 +57,57 @@ try { Document dom = ScrapingUtils.downloadHtmlDom(new URL(url)); - - // run XQuery - String query = - "for " + - "$d at $i in //b[contains(text(), \"Latest File Releases\")]/parent::font/parent::td/parent::tr/following-sibling::tr/following-sibling::tr/child::td/child::table/child::tr " + - "where " + - "$i > 1 " + // skip header - "return " + - "<dummy>" + - "{ data($d/child::td[1]) }|" + - "{ data($d/child::td[1]/child::b/child::a/@href) }|" + - "{ data($d/child::td[2]) }|" + - "{ data($d/child::td[2]/child::a/@href) }|" + - "{ data($d/child::td[3]) }" + - "</dummy>"; - List result = ScrapingUtils.runXQuery(dom, query); - if (result.size() < 1) - throw new IllegalStateException(String.format("Couldn't find file releases (%d) at \"%s\"", result.size(), url)); + + // scrape "Files" link from the "Summary" page and download it + Node aNode = findNodeByAttribute(dom.getElementsByTagName("a"), "href", "/project/showfiles.php?group_id="); + String filesPageUrl = SfConstants.SF_BASEURL + aNode.getAttributes().getNamedItem("href").getNodeValue(); + log.debug(String.format("Going to project file releases page at \"%s\"", filesPageUrl)); + Document filesPageDom = ScrapingUtils.downloadHtmlDom(new URL(filesPageUrl)); + + // look for "frelease" table + Node releasesTableNode = findNodeByAttribute(filesPageDom.getElementsByTagName("table"), "id", "frelease"); + NodeList trNodes = releasesTableNode.getLastChild().getChildNodes(); + for(int j = 0; j < trNodes.getLength(); j++) { + Node trNode = trNodes.item(j); + Node a0Node = trNode.getFirstChild().getFirstChild(); + Node a1Node = trNode.getFirstChild().getNextSibling().getFirstChild(); + Node tdNode = trNode.getFirstChild().getNextSibling().getNextSibling(); + + String packageName = a0Node.getFirstChild().getNodeValue(); + String packageUrl = SfConstants.SF_BASEURL + a0Node.getAttributes().getNamedItem("href").getNodeValue(); + String version = a1Node.getFirstChild().getNodeValue();; + String versionUrl = SfConstants.SF_BASEURL + a1Node.getAttributes().getNamedItem("href").getNodeValue(); + Date date = SfConstants.SF_FILERELEASE_DATEFORMAT.parse(tdNode.getFirstChild().getNodeValue()); + + SfRelease release = new SfRelease(packageName, packageUrl, version, versionUrl, date); + log.debug(String.format("Adding file release \"%s\"...", release)); + releases.add(release); + } - // scrape - for(Iterator it = result.iterator(); it.hasNext();) { - String value = ((NodeInfo)it.next()).getStringValue(); - - StringTokenizer tokenizer = new StringTokenizer(value, "|"); - String packageName = tokenizer.nextToken(); - String packageUrl = SfConstants.SF_BASEURL + tokenizer.nextToken(); - String version = tokenizer.nextToken(); - String versionUrl = SfConstants.SF_BASEURL + tokenizer.nextToken(); - Date date = SfConstants.SF_FILERELEASE_DATEFORMAT.parse(tokenizer.nextToken()); - - SfRelease release = new SfRelease(packageName, packageUrl, version, versionUrl, date); - log.debug(String.format("Adding %s...", release)); - releases.add(release); - } + if (releases.isEmpty()) + throw new IllegalStateException(String.format("Couldn't find file releases at \"%s\"", url)); } catch(Exception ex) { log.error("Unable to scrape", ex); } return releases; } + + /** + * Returns the node with the given attribute value from the given list or + * <code>null</code> if not found. + */ + protected Node findNodeByAttribute(NodeList nodes, String attribName, String attribValue) { + for(int i = 0; i < nodes.getLength(); i++) { + Node node = nodes.item(i); + Node attrib = node.getAttributes().getNamedItem(attribName); + if(attrib == null) + continue; + if(attrib.getNodeValue().indexOf(attribValue) != -1) + return node; + } + + return null; + } } } |