You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <nl...@ar...> - 2012-10-15 14:34:46
|
wayback - Build # 34 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/34/ to view the results. |
From: <nl...@ar...> - 2012-10-12 01:26:49
|
wayback - Build # 33 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/33/ to view the results. |
From: <nl...@ar...> - 2012-10-09 19:55:38
|
wayback - Build # 32 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/32/ to view the results. |
From: <nl...@ar...> - 2012-10-09 19:49:57
|
wayback - Build # 31 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/31/ to view the results. |
From: <nl...@ar...> - 2012-10-05 22:07:15
|
wayback - Build # 30 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/30/ to view the results. |
From: <nl...@ar...> - 2012-10-02 22:27:03
|
wayback - Build # 29 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/29/ to view the results. |
From: <nl...@ar...> - 2012-09-28 16:07:11
|
wayback - Build # 28 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/28/ to view the results. |
From: <nl...@ar...> - 2012-09-26 09:22:03
|
wayback - Build # 27 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/27/ to view the results. |
From: <nl...@ar...> - 2012-09-26 02:12:11
|
wayback - Build # 26 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/26/ to view the results. |
From: <nl...@ar...> - 2012-09-13 15:57:06
|
wayback - Build # 25 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/25/ to view the results. |
From: <nl...@ar...> - 2012-09-06 19:16:39
|
wayback - Build # 24 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/24/ to view the results. |
From: <nl...@ar...> - 2012-09-05 21:46:48
|
wayback - Build # 23 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/23/ to view the results. |
From: <nl...@ar...> - 2012-08-31 03:56:28
|
wayback - Build # 22 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/22/ to view the results. |
From: <nl...@ar...> - 2012-08-30 08:46:20
|
wayback - Build # 21 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/21/ to view the results. |
From: <nl...@ar...> - 2012-08-30 04:31:49
|
wayback - Build # 20 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/20/ to view the results. |
From: <nl...@ar...> - 2012-08-29 17:51:43
|
wayback - Build # 19 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/19/ to view the results. |
From: <nl...@ar...> - 2012-08-15 21:21:25
|
wayback - Build # 18 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/18/ to view the results. |
From: <nl...@ar...> - 2012-08-15 19:26:37
|
wayback - Build # 17 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/17/ to view the results. |
From: <nl...@ar...> - 2012-08-04 00:51:33
|
wayback - Build # 16 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/16/ to view the results. |
From: <nl...@ar...> - 2012-08-02 23:46:36
|
wayback - Build # 15 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/15/ to view the results. |
From: <nl...@ar...> - 2012-07-20 01:01:35
|
wayback - Build # 14 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/14/ to view the results. |
From: <nl...@ar...> - 2012-07-13 02:21:28
|
wayback - Build # 13 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/13/ to view the results. |
From: <nl...@ar...> - 2012-07-04 21:44:37
|
wayback - Build # 12 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/12/ to view the results. |
Revision: 3628 http://archive-access.svn.sourceforge.net/archive-access/?rev=3628&view=rev Author: binzino Date: 2012-07-03 00:43:30 +0000 (Tue, 03 Jul 2012) Log Message: ----------- Prepend PDF-only metadata property keys with "pdf.". Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-03 00:43:30 UTC (rev 3628) @@ -123,7 +123,8 @@ } else { - content.getMetadata().set( kv[0].trim(), kv[1].trim() ); + // For all other properties, append "pdf." prexif. + content.getMetadata().set( "pdf." + kv[0].trim(), kv[1].trim() ); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3627 http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev Author: binzino Date: 2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012) Log Message: ----------- Use pdfinfo for extracting PDF file metadata. Also re-wrote sub-process handling to correctly handle stdout and stderr in separate threads. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-04-25 02:12:22 UTC (rev 3626) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627) @@ -51,11 +51,26 @@ { public static final Log LOG = LogFactory.getLog( PDFParser.class ); - private Configuration conf; + public Configuration conf; + public String pdfinfo; + public String pdftotext; + public String lastPage; + public boolean raw; + public void setConf( Configuration conf ) { this.conf = conf; + + this.pdfinfo = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" ); + this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" ); + this.lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) ); + this.raw = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true ); + + LOG.info( "nutchwax.parse.pdf2.pdfinfo.path = " + this.pdfinfo ); + LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext ); + LOG.info( "nutchwax.parse.pdf2.lastPage = " + this.lastPage ); + LOG.info( "nutchwax.parse.pdf2.raw = " + this.raw ); } public Configuration getConf( ) @@ -69,8 +84,6 @@ String title = ""; String text = ""; - byte[] raw = content.getContent( ); - File tmpfile = null; try { @@ -78,51 +91,58 @@ // Write the PDF document to the tmp file. FileOutputStream fos = new FileOutputStream( tmpfile ); - fos.write( raw ); + fos.write( content.getContent( ) ); fos.close(); - String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" ); - String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) ); + // Create a Process which calls 'pdfinfo' to extract the metadata. + ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() ); - // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page. - ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); - - Process p = pb.start(); - - p.getOutputStream( ).close(); - String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) ); - byte[] err = suck( p.getErrorStream( ) ); + if ( pinfo.stderr.length() > 0 ) + { + LOG.warn( "Error from pdfinfo: " + pinfo.stderr ); + } - if ( err.length > 0 ) + // Extract the title and other metadata properties from the above call to pdfinfo. + String[] properties = pinfo.stdout.split("\n"); + for ( String property : properties ) { - LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + String[] kv = property.split( "[:]", 2 ); + if ( kv.length != 2 ) continue ; + + if ( kv[0].trim().equals( "Title" ) ) + { + title = kv[1].trim(); + } + else if ( kv[0].trim().equals( "Keywords" ) ) + { + content.getMetadata().set( "keywords", kv[1].trim() ); + } + else if ( kv[0].trim().equals( "Subject" ) ) + { + content.getMetadata().set( "subject", kv[1].trim() ); + } + else + { + content.getMetadata().set( kv[0].trim(), kv[1].trim() ); + } } - p.destroy( ); - - // Extract the title from the HTML-formatted metadata output of the above call to pdftotext. - Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head ); - if ( m.find( ) ) + // Create a Process which calls 'pdftotext' to extract the content. + if ( raw ) { - title = m.group(1); + pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); } - - // Create a Process which calls 'pdftotext' to extract the content. - pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); - p = pb.start( ); + else + { + pinfo = execute( pdftotext, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); + } - p.getOutputStream( ).close( ); - text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) ); - err = suck( p.getErrorStream( ) ); - - if ( err.length > 0 ) + if ( pinfo.stderr.length() > 0 ) { - LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + LOG.warn( "Error from pdftotext: " + pinfo.stderr ); } - p.destroy( ); - - // No outlinks, sorry :( + // No outlinks with pdftotext, sorry :( Outlink[] outlinks = new Outlink[0]; ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, @@ -131,7 +151,7 @@ content.getMetadata(), metadata ); - return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) ); } catch ( Exception e ) { @@ -145,37 +165,83 @@ } } - // TODO! + // FIXME: Is there anything better/smarter to do here? return null; } - - private byte[] suck( InputStream is ) - throws IOException + + ProcessInfo execute( String... args ) throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); - byte[] buf = new byte[1024*4]; - int c = -1; - while ( (c = is.read( buf )) != -1 ) - { - baos.write( buf, 0, c ); - } + // Create a Process which calls 'pdfinfo' to extract the metadata. + ProcessBuilder pb = new ProcessBuilder( args ); - return baos.toByteArray(); + Process p = pb.start(); + + // Close the stdin of the child process. + p.getOutputStream( ).close(); + + Sucker stdoutSucker = new Sucker( p.getInputStream() ); + Sucker stderrSucker = new Sucker( p.getErrorStream() ); + + stdoutSucker.start(); + stderrSucker.start(); + + p.waitFor(); + + ProcessInfo result = new ProcessInfo(); + + result.returnCode = p.exitValue(); + result.stdout = new String( stdoutSucker.getBytes(), "utf-8" ); + result.stderr = new String( stderrSucker.getBytes(), "utf-8" ); + + return result; } - private String suck( InputStreamReader reader ) - throws IOException + /** + * Simple struct to hold sub-process return code, stdout and stderr. + */ + static class ProcessInfo { - StringBuilder sb = new StringBuilder( 1024 * 4 ); - char[] buf = new char[1024*4]; - int c = -1; + public int returnCode; + public String stdout; + public String stderr; + } + + /** + * Thread that sucks up the output of a sub-process stdout or stderr. + */ + static class Sucker extends Thread + { + InputStream is; + ByteArrayOutputStream baos; - while ( (c = reader.read( buf )) != -1 ) - { - sb.append( buf, 0, c ); - } + public Sucker( InputStream is ) + { + this.is = is; + this.baos = new ByteArrayOutputStream( 4*1024 ); + } - return sb.toString(); - } + public byte[] getBytes() + { + return this.baos.toByteArray(); + } + + public void run( ) + { + try + { + byte[] buf = new byte[4*1024]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + } + catch ( IOException ioe ) + { + PDFParser.LOG.warn( "Error reading from sub-process: " + ioe ); + } + } + + }; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |