You can subscribe to this list here.
| 2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
| 2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
| 2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
| 2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
| 2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
| 2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
| 2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
|
From: <nl...@ar...> - 2012-10-15 14:34:46
|
wayback - Build # 34 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/34/ to view the results. |
|
From: <nl...@ar...> - 2012-10-12 01:26:49
|
wayback - Build # 33 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/33/ to view the results. |
|
From: <nl...@ar...> - 2012-10-09 19:55:38
|
wayback - Build # 32 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/32/ to view the results. |
|
From: <nl...@ar...> - 2012-10-09 19:49:57
|
wayback - Build # 31 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/31/ to view the results. |
|
From: <nl...@ar...> - 2012-10-05 22:07:15
|
wayback - Build # 30 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/30/ to view the results. |
|
From: <nl...@ar...> - 2012-10-02 22:27:03
|
wayback - Build # 29 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/29/ to view the results. |
|
From: <nl...@ar...> - 2012-09-28 16:07:11
|
wayback - Build # 28 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/28/ to view the results. |
|
From: <nl...@ar...> - 2012-09-26 09:22:03
|
wayback - Build # 27 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/27/ to view the results. |
|
From: <nl...@ar...> - 2012-09-26 02:12:11
|
wayback - Build # 26 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/26/ to view the results. |
|
From: <nl...@ar...> - 2012-09-13 15:57:06
|
wayback - Build # 25 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/25/ to view the results. |
|
From: <nl...@ar...> - 2012-09-06 19:16:39
|
wayback - Build # 24 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/24/ to view the results. |
|
From: <nl...@ar...> - 2012-09-05 21:46:48
|
wayback - Build # 23 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/23/ to view the results. |
|
From: <nl...@ar...> - 2012-08-31 03:56:28
|
wayback - Build # 22 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/22/ to view the results. |
|
From: <nl...@ar...> - 2012-08-30 08:46:20
|
wayback - Build # 21 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/21/ to view the results. |
|
From: <nl...@ar...> - 2012-08-30 04:31:49
|
wayback - Build # 20 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/20/ to view the results. |
|
From: <nl...@ar...> - 2012-08-29 17:51:43
|
wayback - Build # 19 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/19/ to view the results. |
|
From: <nl...@ar...> - 2012-08-15 21:21:25
|
wayback - Build # 18 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/18/ to view the results. |
|
From: <nl...@ar...> - 2012-08-15 19:26:37
|
wayback - Build # 17 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/17/ to view the results. |
|
From: <nl...@ar...> - 2012-08-04 00:51:33
|
wayback - Build # 16 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/16/ to view the results. |
|
From: <nl...@ar...> - 2012-08-02 23:46:36
|
wayback - Build # 15 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/15/ to view the results. |
|
From: <nl...@ar...> - 2012-07-20 01:01:35
|
wayback - Build # 14 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/14/ to view the results. |
|
From: <nl...@ar...> - 2012-07-13 02:21:28
|
wayback - Build # 13 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/13/ to view the results. |
|
From: <nl...@ar...> - 2012-07-04 21:44:37
|
wayback - Build # 12 - Successful: Check console output at https://builds.archive.org:1443/job/wayback/12/ to view the results. |
Revision: 3628
http://archive-access.svn.sourceforge.net/archive-access/?rev=3628&view=rev
Author: binzino
Date: 2012-07-03 00:43:30 +0000 (Tue, 03 Jul 2012)
Log Message:
-----------
Prepend PDF-only metadata property keys with "pdf.".
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-03 00:43:30 UTC (rev 3628)
@@ -123,7 +123,8 @@
}
else
{
- content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+ // For all other properties, prepend the "pdf." prefix.
+ content.getMetadata().set( "pdf." + kv[0].trim(), kv[1].trim() );
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 3627
http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev
Author: binzino
Date: 2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012)
Log Message:
-----------
Use pdfinfo for extracting PDF file metadata. Also re-wrote sub-process handling to correctly handle stdout and stderr in separate threads.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-04-25 02:12:22 UTC (rev 3626)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627)
@@ -51,11 +51,26 @@
{
public static final Log LOG = LogFactory.getLog( PDFParser.class );
- private Configuration conf;
+ public Configuration conf;
+ public String pdfinfo;
+ public String pdftotext;
+ public String lastPage;
+ public boolean raw;
+
public void setConf( Configuration conf )
{
this.conf = conf;
+
+ this.pdfinfo = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" );
+ this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+ this.lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+ this.raw = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true );
+
+ LOG.info( "nutchwax.parse.pdf2.pdfinfo.path = " + this.pdfinfo );
+ LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext );
+ LOG.info( "nutchwax.parse.pdf2.lastPage = " + this.lastPage );
+ LOG.info( "nutchwax.parse.pdf2.raw = " + this.raw );
}
public Configuration getConf( )
@@ -69,8 +84,6 @@
String title = "";
String text = "";
- byte[] raw = content.getContent( );
-
File tmpfile = null;
try
{
@@ -78,51 +91,58 @@
// Write the PDF document to the tmp file.
FileOutputStream fos = new FileOutputStream( tmpfile );
- fos.write( raw );
+ fos.write( content.getContent( ) );
fos.close();
- String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
- String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+ // Create a Process which calls 'pdfinfo' to extract the metadata.
+ ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() );
- // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page.
- ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
-
- Process p = pb.start();
-
- p.getOutputStream( ).close();
- String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
- byte[] err = suck( p.getErrorStream( ) );
+ if ( pinfo.stderr.length() > 0 )
+ {
+ LOG.warn( "Error from pdfinfo: " + pinfo.stderr );
+ }
- if ( err.length > 0 )
+ // Extract the title and other metadata properties from the above call to pdfinfo.
+ String[] properties = pinfo.stdout.split("\n");
+ for ( String property : properties )
{
- LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+ String[] kv = property.split( "[:]", 2 );
+ if ( kv.length != 2 ) continue ;
+
+ if ( kv[0].trim().equals( "Title" ) )
+ {
+ title = kv[1].trim();
+ }
+ else if ( kv[0].trim().equals( "Keywords" ) )
+ {
+ content.getMetadata().set( "keywords", kv[1].trim() );
+ }
+ else if ( kv[0].trim().equals( "Subject" ) )
+ {
+ content.getMetadata().set( "subject", kv[1].trim() );
+ }
+ else
+ {
+ content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+ }
}
- p.destroy( );
-
- // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
- Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
- if ( m.find( ) )
+ // Create a Process which calls 'pdftotext' to extract the content.
+ if ( raw )
{
- title = m.group(1);
+ pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
}
-
- // Create a Process which calls 'pdftotext' to extract the content.
- pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
- p = pb.start( );
+ else
+ {
+ pinfo = execute( pdftotext, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
+ }
- p.getOutputStream( ).close( );
- text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
- err = suck( p.getErrorStream( ) );
-
- if ( err.length > 0 )
+ if ( pinfo.stderr.length() > 0 )
{
- LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+ LOG.warn( "Error from pdftotext: " + pinfo.stderr );
}
- p.destroy( );
-
- // No outlinks, sorry :(
+ // No outlinks with pdftotext, sorry :(
Outlink[] outlinks = new Outlink[0];
ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS,
@@ -131,7 +151,7 @@
content.getMetadata(),
metadata );
- return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) );
+ return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) );
}
catch ( Exception e )
{
@@ -145,37 +165,83 @@
}
}
- // TODO!
+ // FIXME: Is there anything better/smarter to do here?
return null;
}
-
- private byte[] suck( InputStream is )
- throws IOException
+
+ ProcessInfo execute( String... args ) throws Exception
{
- ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 );
- byte[] buf = new byte[1024*4];
- int c = -1;
- while ( (c = is.read( buf )) != -1 )
- {
- baos.write( buf, 0, c );
- }
+ // Create a Process which calls 'pdfinfo' to extract the metadata.
+ ProcessBuilder pb = new ProcessBuilder( args );
- return baos.toByteArray();
+ Process p = pb.start();
+
+ // Close the stdin of the child process.
+ p.getOutputStream( ).close();
+
+ Sucker stdoutSucker = new Sucker( p.getInputStream() );
+ Sucker stderrSucker = new Sucker( p.getErrorStream() );
+
+ stdoutSucker.start();
+ stderrSucker.start();
+
+ p.waitFor();
+
+ ProcessInfo result = new ProcessInfo();
+
+ result.returnCode = p.exitValue();
+ result.stdout = new String( stdoutSucker.getBytes(), "utf-8" );
+ result.stderr = new String( stderrSucker.getBytes(), "utf-8" );
+
+ return result;
}
- private String suck( InputStreamReader reader )
- throws IOException
+ /**
+ * Simple struct to hold sub-process return code, stdout and stderr.
+ */
+ static class ProcessInfo
{
- StringBuilder sb = new StringBuilder( 1024 * 4 );
- char[] buf = new char[1024*4];
- int c = -1;
+ public int returnCode;
+ public String stdout;
+ public String stderr;
+ }
+
+ /**
+ * Thread that sucks up the output of a sub-process stdout or stderr.
+ */
+ static class Sucker extends Thread
+ {
+ InputStream is;
+ ByteArrayOutputStream baos;
- while ( (c = reader.read( buf )) != -1 )
- {
- sb.append( buf, 0, c );
- }
+ public Sucker( InputStream is )
+ {
+ this.is = is;
+ this.baos = new ByteArrayOutputStream( 4*1024 );
+ }
- return sb.toString();
- }
+ public byte[] getBytes()
+ {
+ return this.baos.toByteArray();
+ }
+
+ public void run( )
+ {
+ try
+ {
+ byte[] buf = new byte[4*1024];
+ int c = -1;
+ while ( (c = is.read( buf )) != -1 )
+ {
+ baos.write( buf, 0, c );
+ }
+ }
+ catch ( IOException ioe )
+ {
+ PDFParser.LOG.warn( "Error reading from sub-process: " + ioe );
+ }
+ }
+
+ };
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|