From: Brad <bra...@us...> - 2005-11-17 02:49:18
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11862/src/java/org/archive/wayback/cdx/indexer Modified Files: ArcIndexer.java Log Message: BUGFIX: was not using CDXRecord to "serialize" SearchResult objects BUGFIX: was not escaping Location: HTTP Header URLs FEATURE: added skipping of dns: records in ARC files FEATURE: added fixing/resolving of relative Location: HTTP headers, altho this is against spec. Index: ArcIndexer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ArcIndexer.java 16 Nov 2005 03:11:29 -0000 1.1 --- ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 *************** *** 35,42 **** --- 35,45 ---- import org.archive.io.arc.ARCRecordMetaData; import org.archive.net.UURI; + import org.archive.net.UURIFactory; import org.archive.wayback.WaybackConstants; + import org.archive.wayback.cdx.CDXRecord; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.apache.commons.httpclient.Header; + import org.apache.commons.httpclient.URIException; /** *************** *** 50,54 **** private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! /** --- 53,57 ---- private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! private final static String DNS_URL_PREFIX = "dns:"; /** *************** *** 113,118 **** return null; } UURI uri = new UURI(uriStr, false); ! 
result.put(WaybackConstants.RESULT_ORIG_HOST,uri.getHost()); String redirectUrl = "-"; --- 116,132 ---- return null; } + if(uriStr.startsWith(DNS_URL_PREFIX)) { + // skip dns records... + return null; + } + UURI uri = new UURI(uriStr, false); ! String uriHost = uri.getHost(); ! if(uriHost == null) { ! System.out.println("No host in " + uriStr + " in " + ! arc.getAbsolutePath()); ! return null; ! } ! result.put(WaybackConstants.RESULT_ORIG_HOST,uriHost); String redirectUrl = "-"; *************** *** 121,125 **** for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! redirectUrl = headers[i].getValue(); break; } --- 135,155 ---- for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! String locationStr = headers[i].getValue(); ! // TODO: "Location" is supposed to be absolute: ! // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) ! // (section 14.30) but Content-Location can be relative. ! // is it correct to resolve a relative Location, as we are? ! // it's also possible to have both in the HTTP headers... ! // should we prefer one over the other? ! // right now, we're ignoring "Content-Location" ! try { ! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); ! redirectUrl = uriRedirect.getEscapedURI(); ! ! } catch (URIException e) { ! System.out.println("Bad Location: " + locationStr + ! " for " + uriStr + " in " + ! arc.getAbsolutePath() + " Skipped"); ! } break; } *************** *** 151,159 **** FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! output.write((result.toString() + "\n").getBytes()); } } --- 181,190 ---- FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! 
CDXRecord cdxRecord = new CDXRecord(); Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! cdxRecord.fromSearchResult(result); ! output.write((cdxRecord.toValue() + "\n").getBytes()); } } |
From: stack <st...@ar...> - 2005-11-17 17:35:58
|
Brad wrote: >Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer >In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11862/src/java/org/archive/wayback/cdx/indexer > >Modified Files: > ArcIndexer.java >Log Message: >BUGFIX: was not using CDXRecord to "serialize" SearchResult objects >BUGFIX: was not escaping Location: HTTP Header URLs >FEATURE: added skipping of dns: records in ARC files >FEATURE: added fixing/resolving of relative Location: HTTP headers, altho this is against spec. > >Index: ArcIndexer.java >=================================================================== >RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v >retrieving revision 1.1 >retrieving revision 1.2 >diff -C2 -d -r1.1 -r1.2 >*** ArcIndexer.java 16 Nov 2005 03:11:29 -0000 1.1 >--- ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 >*************** >*** 35,42 **** >--- 35,45 ---- > import org.archive.io.arc.ARCRecordMetaData; > import org.archive.net.UURI; >+ import org.archive.net.UURIFactory; > import org.archive.wayback.WaybackConstants; >+ import org.archive.wayback.cdx.CDXRecord; > import org.archive.wayback.core.SearchResult; > import org.archive.wayback.core.SearchResults; > import org.apache.commons.httpclient.Header; >+ import org.apache.commons.httpclient.URIException; > > /** >*************** >*** 50,54 **** > private final static String LOCATION_HTTP_HEADER = "Location"; > private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; >! > > /** >--- 53,57 ---- > private final static String LOCATION_HTTP_HEADER = "Location"; > private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; >! private final static String DNS_URL_PREFIX = "dns:"; > > /** >*************** >*** 113,118 **** > return null; > } > UURI uri = new UURI(uriStr, false); >! 
result.put(WaybackConstants.RESULT_ORIG_HOST,uri.getHost()); > > String redirectUrl = "-"; >--- 116,132 ---- > return null; > } >+ if(uriStr.startsWith(DNS_URL_PREFIX)) { >+ // skip dns records... >+ return null; >+ } >+ > UURI uri = new UURI(uriStr, false); > > You don't want to use UURIFactory to make your UURIs? It takes care of fixup and proper escaping (The UURI constructors used to be shut down so you had to go via UURIFactory to get your UURIs). You might check it out. >! String uriHost = uri.getHost(); >! if(uriHost == null) { >! System.out.println("No host in " + uriStr + " in " + >! arc.getAbsolutePath()); > > You don't want to use a logger here? (java.util.logging?). >! return null; >! } >! result.put(WaybackConstants.RESULT_ORIG_HOST,uriHost); > > String redirectUrl = "-"; >*************** >*** 121,125 **** > for (int i = 0; i < headers.length; i++) { > if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { >! redirectUrl = headers[i].getValue(); > break; > } >--- 135,155 ---- > for (int i = 0; i < headers.length; i++) { > if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { >! String locationStr = headers[i].getValue(); >! // TODO: "Location" is supposed to be absolute: >! // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) >! // (section 14.30) but Content-Location can be relative. >! // is it correct to resolve a relative Location, as we are? >! // it's also possible to have both in the HTTP headers... >! // should we prefer one over the other? >! // right now, we're ignoring "Content-Location" >! try { >! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); >! redirectUrl = uriRedirect.getEscapedURI(); >! >! } catch (URIException e) { >! System.out.println("Bad Location: " + locationStr + >! " for " + uriStr + " in " + >! arc.getAbsolutePath() + " Skipped"); >! } > break; > } >*************** >*** 151,159 **** > FileOutputStream output = new FileOutputStream(target); > output.write((CDX_HEADER_STRING + "\n").getBytes()); >! 
> Do you think encoding will ever be an issue? Does CDX only ever have ASCII? Any chance of multibyte chars? (Maybe you want to do getBytes("UTF-8")?) > > Iterator itr = results.iterator(); > while (itr.hasNext()) { > > You might do instead of above: for (final Iterator itr = results.iterator(); itr.hasNext();) { etc. > SearchResult result = (SearchResult) itr.next(); >! output.write((result.toString() + "\n").getBytes()); > > Any reason you don't want to use something more sophisticated than a byte writer? If you used a PrintWriter you could just pass the String and you could call println. You might also want to wrap your FileOutputStream in a BufferedOutputStream as in: OutputStream output = new BufferedOutputStream(new FileOutputStream(....)); St.Ack > } > } >--- 181,190 ---- > FileOutputStream output = new FileOutputStream(target); > output.write((CDX_HEADER_STRING + "\n").getBytes()); >! CDXRecord cdxRecord = new CDXRecord(); > Iterator itr = results.iterator(); > while (itr.hasNext()) { > SearchResult result = (SearchResult) itr.next(); >! cdxRecord.fromSearchResult(result); >! output.write((cdxRecord.toValue() + "\n").getBytes()); > } > } > > > >------------------------------------------------------- >This SF.Net email is sponsored by the JBoss Inc. Get Certified Today >Register for a JBoss Training Course. Free Certification Exam >for All Training Attendees Through End of 2005. For more info visit: >http://ads.osdn.com/?ad_id=7628&alloc_id=16845&op=click >_______________________________________________ >Archive-access-cvs mailing list >Arc...@li... >https://lists.sourceforge.net/lists/listinfo/archive-access-cvs > > |