Revision: 1509
http://tcobrowser.svn.sourceforge.net/tcobrowser/?rev=1509&view=rev
Author: amaxwell
Date: 2009-10-16 18:34:30 +0000 (Fri, 16 Oct 2009)
Log Message:
-----------
add Mike's IEEE Xplore parser
Modified Paths:
--------------
trunk/bibdesk/BDSKWebParser.h
trunk/bibdesk/BDSKWebParser.m
trunk/bibdesk/Bibdesk.xcodeproj/project.pbxproj
Added Paths:
-----------
trunk/bibdesk/BDSKIEEEXploreParser.h
trunk/bibdesk/BDSKIEEEXploreParser.m
Added: trunk/bibdesk/BDSKIEEEXploreParser.h
===================================================================
--- trunk/bibdesk/BDSKIEEEXploreParser.h (rev 0)
+++ trunk/bibdesk/BDSKIEEEXploreParser.h 2009-10-16 18:34:30 UTC (rev 1509)
@@ -0,0 +1,51 @@
+//
+// BDSKIEEEXploreParser.h
+//
+// Created by Michael O. McCracken on 9/26/07.
+/*
+ This software is Copyright (c) 2007-2009
+ Michael O. McCracken. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of Michael O. McCracken nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#import <Cocoa/Cocoa.h>
+#import "BDSKWebParser.h"
+#import "BDSKBibTeXParser.h"
+#import "NSXMLNode_BDSKExtensions.h"
+
+
+@interface BDSKIEEEXploreParser : BDSKWebParser // Web parser for IEEE Xplore pages; every entry point declared here is a class method.
++ (NSString *)ARNumberFromURLSubstring:(NSString *)urlPath error:(NSError **)outError; // Returns the digits following "arnumber=" in urlPath, or nil (reporting through outError) when there is no match.
++ (NSString *)ISNumberFromURLSubstring:(NSString *)urlPath error:(NSError **)outError; // Returns the digits following "isnumber=" in urlPath, or nil (reporting through outError) when there is no match.
++ (BibItem *)itemFromURL:(NSURL *)url error:(NSError **)outError; // Same as itemFromURL:xmlDocument:error: with a nil xmlDocument, so the abstract page is downloaded to look for the PDF link.
++ (BibItem *)itemFromURL:(NSURL *)url xmlDocument:(NSXMLDocument *)xmlDocument error:(NSError **)outError; // Downloads the BibTeX citation for the arnumber found in url's query and returns the first parsed item; xmlDocument may be nil, otherwise it is searched for a "PDF" link.
+@end
+
+
Added: trunk/bibdesk/BDSKIEEEXploreParser.m
===================================================================
--- trunk/bibdesk/BDSKIEEEXploreParser.m (rev 0)
+++ trunk/bibdesk/BDSKIEEEXploreParser.m 2009-10-16 18:34:30 UTC (rev 1509)
@@ -0,0 +1,229 @@
+//
+// BDSKIEEEXploreParser.m
+//
+// Created by Michael O. McCracken on 9/26/07.
+/*
+ This software is Copyright (c) 2007-2009
+ Michael O. McCracken. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of Michael O. McCracken nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#import "BDSKIEEEXploreParser.h"
+#import <WebKit/WebKit.h>
+#import "BibItem.h"
+#import "BDSKBibTeXParser.h"
+#import "NSError_BDSKExtensions.h"
+#import <AGRegex/AGRegex.h>
+
+// sometimes the link says AbstractPlus, sometimes it only says Abstract. This should catch both:
+NSString *containsAbstractPlusLinkNode = @"//a[contains(lower-case(text()),'abstract')]";
+NSString *abstractPageURLPath = @"/xpls/abs_all.jsp";
+NSString *searchResultPageURLPath = @"/search/srchabstract.jsp";
+
+@implementation BDSKIEEEXploreParser
+
+/**
+ Reports whether this parser can handle the given page.
+
+ A page qualifies when it is served by ieeexplore.ieee.org and either
+ (a) its path is the abstract page or the search-result abstract page, or
+ (b) it contains at least one link whose text mentions "abstract"
+     (a table-of-contents page).
+
+ @param domDocument Unused here; part of the BDSKWebParser interface.
+ @param xmlDocument XML representation of the loaded page, searched for abstract links.
+ @param url         URL the page was loaded from.
+ @return YES if the page looks like an IEEE Xplore page this parser understands.
+ */
++ (BOOL)canParseDocument:(DOMDocument *)domDocument xmlDocument:(NSXMLDocument *)xmlDocument fromURL:(NSURL *)url{
+
+    // Host names are case-insensitive; a nil host lowercases to nil and fails the comparison.
+    if (NO == [[[url host] lowercaseString] isEqualToString:@"ieeexplore.ieee.org"]) {
+        return NO;
+    }
+
+    // The two single-article pages are recognised by path alone, so there is
+    // no need to pay for an XPath query over the whole document.
+    NSString *path = [url path];
+    if ([path isEqualToString:abstractPageURLPath] || [path isEqualToString:searchResultPageURLPath]) {
+        return YES;
+    }
+
+    // Anything else on the site is accepted only if it links to at least one abstract.
+    // The XPath error is deliberately not reported: a failed query simply means "cannot parse".
+    NSError *error = nil;
+    NSArray *abstractLinkNodes = [[xmlDocument rootElement] nodesForXPath:containsAbstractPlusLinkNode error:&error];
+
+    return [abstractLinkNodes count] > 0;
+}
+
+
+/**
+ Extracts the IEEE article number from a piece of a URL.
+
+ @param urlPath  String to search, typically the query of an IEEE Xplore URL
+                 (e.g. "isnumber=4723954&arnumber=4723958&count=9"). May be nil.
+ @param outError Optional. On failure, if non-NULL, receives an error saying
+                 that no article number could be found.
+ @return The run of digits following "arnumber=", or nil if urlPath is nil or
+         contains no such key.
+ */
++ (NSString *)ARNumberFromURLSubstring:(NSString *)urlPath error:(NSError **)outError{
+
+    AGRegexMatch *match = nil;
+
+    // A URL without a query hands us nil; skip the regex rather than rely on
+    // how AGRegex treats a nil subject string.
+    if (urlPath != nil) {
+        AGRegex *ARNumberRegex = [AGRegex regexWithPattern:@"arnumber=([0-9]+)" options:AGRegexMultiline];
+        match = [ARNumberRegex findInString:urlPath];
+    }
+
+    // Messaging a nil match returns 0, so this covers both "no match" and "no input".
+    if ([match count] == 0) {
+        // outError is optional by Cocoa convention; never write through a NULL pointer.
+        if (outError != NULL) {
+            // The user-visible text is the key (first argument), matching the other
+            // NSLocalizedString use in this file; the second argument is only a translator note.
+            *outError = [NSError mutableLocalErrorWithCode:0 localizedDescription:NSLocalizedString(@"Can't get an ARNumber from the URL", @"Error description")];
+        }
+        return nil;
+    }
+
+    // Group 1 is the parenthesised digit run.
+    return [match groupAtIndex:1];
+}
+
+/**
+ Extracts the IEEE issue number from a piece of a URL.
+
+ @param urlPath  String to search, typically the query of an IEEE Xplore URL
+                 (e.g. "isnumber=4723954&arnumber=4723958&count=9"). May be nil.
+ @param outError Optional. On failure, if non-NULL, receives an error saying
+                 that no issue number could be found.
+ @return The run of digits following "isnumber=", or nil if urlPath is nil or
+         contains no such key.
+ */
++ (NSString *)ISNumberFromURLSubstring:(NSString *)urlPath error:(NSError **)outError{
+
+    AGRegexMatch *match = nil;
+
+    // A URL without a query hands us nil; skip the regex rather than rely on
+    // how AGRegex treats a nil subject string.
+    if (urlPath != nil) {
+        AGRegex *ISNumberRegex = [AGRegex regexWithPattern:@"isnumber=([0-9]+)" options:AGRegexMultiline];
+        match = [ISNumberRegex findInString:urlPath];
+    }
+
+    // Messaging a nil match returns 0, so this covers both "no match" and "no input".
+    if ([match count] == 0) {
+        // outError is optional by Cocoa convention; never write through a NULL pointer.
+        if (outError != NULL) {
+            // The user-visible text is the key (first argument), matching the other
+            // NSLocalizedString use in this file; the second argument is only a translator note.
+            *outError = [NSError mutableLocalErrorWithCode:0 localizedDescription:NSLocalizedString(@"Can't get an ISNumber from the URL", @"Error description")];
+        }
+        return nil;
+    }
+
+    // Group 1 is the parenthesised digit run.
+    return [match groupAtIndex:1];
+}
+
+
+
+/**
+ Produces the publications for an IEEE Xplore page.
+
+ For an abstract page or a search-result abstract page the single article is
+ fetched via +itemFromURL:xmlDocument:error:. For any other page (a table of
+ contents) a single placeholder item is returned whose title tells the user to
+ open the article's abstract page instead.
+
+ @param domDocument Unused here; part of the BDSKWebParser interface.
+ @param xmlDocument XML representation of the loaded page.
+ @param url         URL the page was loaded from.
+ @param outError    Optional. Set when nil is returned.
+ @return An array with one BibItem, or nil if the article could not be retrieved.
+ */
++ (NSArray *)itemsFromDocument:(DOMDocument *)domDocument xmlDocument:(NSXMLDocument *)xmlDocument fromURL:(NSURL *)url error:(NSError **)outError{
+
+    // http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4723961&isnumber=4723954&punumber=4711036&k2dockey=4723961@...>
+    // http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4723954&arnumber=4723958&count=9&index=3
+    // http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=928956&isnumber=20064&punumber=7385&k2dockey=928956@...>
+    NSString *path = [url path];
+    if ([path isEqualToString:abstractPageURLPath] || [path isEqualToString:searchResultPageURLPath]) {
+
+        // Collect the error locally so a NULL outError is never passed further down.
+        NSError *itemError = nil;
+        BibItem *item = [self itemFromURL:url xmlDocument:xmlDocument error:&itemError];
+
+        // +arrayWithObject: raises NSInvalidArgumentException for nil, so a failed
+        // download or parse must be reported as an error instead of being wrapped.
+        if (nil == item) {
+            if (outError != NULL) {
+                if (nil == itemError) {
+                    itemError = [NSError mutableLocalErrorWithCode:0 localizedDescription:NSLocalizedString(@"Unable to retrieve the citation from IEEE Xplore", @"Error description")];
+                }
+                *outError = itemError;
+            }
+            return nil;
+        }
+
+        return [NSArray arrayWithObject:item];
+    }
+
+    // The following code parses all the links on a TOC page and is unusably slow.
+    // Included for posterity in case we ever add async parsing.
+    /*
+    NSError *error = nil;
+
+    NSArray *AbstractPlusLinkNodes = [[xmlDocument rootElement] nodesForXPath:containsAbstractPlusLinkNode
+    error:&error];
+
+    if ([AbstractPlusLinkNodes count] < 1) {
+    if (outError) *outError = error;
+    return nil;
+    }
+
+    NSUInteger i, count = [AbstractPlusLinkNodes count];
+    for (i = 0; i < count; i++) {
+    NSXMLNode *aplinknode = [AbstractPlusLinkNodes objectAtIndex:i];
+    NSString *hrefValue = [aplinknode stringValueOfAttribute:@"href"];
+    NSURL *abstractPageURL = [NSURL URLWithString:[NSString stringWithFormat:@"http://%@%@", [url host], hrefValue]];
+
+    [items addObject:[self itemFromURL:abstractPageURL error:&error]];
+    }
+    */
+
+    // TOC page: individual articles are not parsed (see above), so show a single
+    // placeholder item whose title tells the user to open the article's abstract page.
+    NSString *msg = NSLocalizedString(@"Click the \"AbstractPlus\" link for the item you want to import.",
+                                      @"IEEE TOC page fake marker item title");
+    NSDictionary *pubFields = [NSDictionary dictionaryWithObjectsAndKeys:msg, BDSKTitleString, nil];
+    BibItem *tocMarkerItem = [[BibItem alloc] initWithType:BDSKMiscString fileType:BDSKBibtexString citeKey:nil pubFields:pubFields isNew:YES];
+
+    NSMutableArray *items = [NSMutableArray arrayWithCapacity:1];
+    [items addObject:tocMarkerItem];
+    [tocMarkerItem release];
+
+    return items;
+
+}
+
+
+// Convenience entry point for callers that have not loaded the page themselves:
+// forwards to +itemFromURL:xmlDocument:error: with no pre-loaded document.
++ (BibItem *)itemFromURL:(NSURL *)url error:(NSError **)outError{
+    NSXMLDocument *noPreloadedPage = nil;
+    BibItem *fetchedItem = [self itemFromURL:url xmlDocument:noPreloadedPage error:outError];
+    return fetchedItem;
+}
+
+/**
+ Downloads and parses the citation for one IEEE Xplore article.
+
+ The article number is taken from the "arnumber=" key in url's query and posted
+ to the site's citation service, which answers with BibTeX. The first parsed
+ entry becomes the result. If a link whose text contains "PDF" can be found on
+ the article's abstract page, its target is stored in the item's URL field;
+ failing to find one is not an error.
+
+ Both network requests are synchronous.
+
+ @param url         An IEEE Xplore URL whose query contains "arnumber=".
+ @param xmlDocument The already-loaded abstract page, or nil to have it downloaded.
+ @param outError    Optional. Set only when nil is returned.
+ @return The parsed item, or nil if the article number is missing, the citation
+         request fails, or the response contains no parsable BibTeX entry.
+ */
++ (BibItem *)itemFromURL:(NSURL *)url xmlDocument:(NSXMLDocument *)xmlDocument error:(NSError **)outError{
+
+    NSError *error = nil;
+
+    // A URL without a query yields nil; search an empty string instead.
+    NSString *query = [url query];
+    if (nil == query) {
+        query = @"";
+    }
+
+    // The article number is mandatory: without it the citation request is meaningless.
+    // A local error is passed so the helper always has a valid pointer to write to.
+    NSString *arnumberString = [self ARNumberFromURLSubstring:query error:&error];
+    if (nil == arnumberString) {
+        if (outError != NULL) { *outError = error; }
+        return nil;
+    }
+
+    // The issue number is only used to build the abstract page URL below, so its
+    // absence must not surface as an error to the caller.
+    NSError *ignoredError = nil;
+    NSString *isnumberString = [self ISNumberFromURLSubstring:query error:&ignoredError];
+    if (nil == isnumberString) {
+        isnumberString = @"";
+    }
+
+    // Query IEEEXplore with a POST request
+
+    NSString *serverName = [[url host] lowercaseString];
+
+    NSString *URLString = [NSString stringWithFormat:@"http://%@/xpls/citationAct", serverName];
+
+    NSMutableURLRequest *request = [NSMutableURLRequest requestWithURL:[NSURL URLWithString:URLString]];
+    [request setHTTPMethod:@"POST"];
+    [request setValue:@"application/x-www-form-urlencoded" forHTTPHeaderField:@"Content-type"];
+
+    // note, do not actually url-encode this. they are expecting their angle brackets raw.
+    NSString *queryString = [NSString stringWithFormat:@"dlSelect=cite_abs&fileFormate=BibTex&arnumber=<arnumber>%@</arnumber>", arnumberString];
+
+    [request setHTTPBody:[queryString dataUsingEncoding:NSUTF8StringEncoding]];
+
+    NSURLResponse *response = nil;
+    NSData *result = [NSURLConnection sendSynchronousRequest:request returningResponse:&response error:&error];
+
+    if (nil == result) {
+        if (outError != NULL) { *outError = error; }
+        return nil;
+    }
+
+    NSString *bibTeXString = [[[NSString alloc] initWithData:result encoding:NSUTF8StringEncoding] autorelease];
+    if (nil == bibTeXString) {
+        // Not valid UTF-8. ISO Latin 1 maps every byte to a character, so this cannot fail.
+        bibTeXString = [[[NSString alloc] initWithData:result encoding:NSISOLatin1StringEncoding] autorelease];
+    }
+    bibTeXString = [bibTeXString fastStringByCollapsingWhitespaceAndNewlinesAndRemovingSurroundingWhitespaceAndNewlines];
+
+    bibTeXString = [bibTeXString stringByRemovingString:@"<br>"]; // yes, that's all. whee!
+
+    // TODO: need to unescape XML entities.
+    // for example: http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4977283&arnumber=4977305&count=206&index=11
+    // has a (tm) as an entity.
+
+    BOOL isPartialData = NO;
+    NSError *parseError = nil;
+    NSArray *newPubs = [BDSKBibTeXParser itemsFromString:bibTeXString document:nil isPartialData:&isPartialData error:&parseError];
+
+    // No entry means failure: say why, and skip the pointless second download.
+    if ([newPubs count] == 0) {
+        if (outError != NULL) {
+            if (nil == parseError) {
+                parseError = [NSError mutableLocalErrorWithCode:0 localizedDescription:NSLocalizedString(@"Unable to parse the BibTeX citation returned by IEEE Xplore", @"Error description")];
+            }
+            *outError = parseError;
+        }
+        return nil;
+    }
+
+    BibItem *newPub = [newPubs objectAtIndex:0];
+
+    // Get the PDF URL, if possible. Everything from here on is best effort:
+    // errors are collected locally and never reported, because the item itself is valid.
+    NSError *pdfLookupError = nil;
+
+    // Relative links on the page are resolved against the page they came from.
+    NSURL *pageURL = url;
+
+    // Need to load the page if it isn't passed in:
+    if (nil == xmlDocument) {
+        NSString *ARNumberURLString = [NSString stringWithFormat:@"http://ieeexplore.ieee.org/xpls/abs_all.jsp?tp=&arnumber=%@&isnumber=%@", arnumberString, isnumberString];
+        pageURL = [NSURL URLWithString:ARNumberURLString];
+
+        request = [NSMutableURLRequest requestWithURL:pageURL];
+
+        result = [NSURLConnection sendSynchronousRequest:request returningResponse:&response error:&pdfLookupError];
+        if (result != nil) {
+            NSString *abs_allHTMLString = [[[NSString alloc] initWithData:result encoding:NSUTF8StringEncoding] autorelease];
+            if (abs_allHTMLString != nil) {
+                // autoreleased: this document is owned here, unlike one passed in by the caller
+                xmlDocument = [[[NSXMLDocument alloc] initWithXMLString:abs_allHTMLString
+                                                                options:NSXMLDocumentTidyHTML
+                                                                  error:&pdfLookupError] autorelease];
+            }
+        }
+    }
+
+    // A nil document yields a nil node list, which falls through harmlessly.
+    NSArray *pdfLinkNodes = [[xmlDocument rootElement] nodesForXPath:@"//a[contains(text(), 'PDF')]"
+                                                               error:&pdfLookupError];
+    if ([pdfLinkNodes count] > 0) {
+        NSXMLNode *pdfLinkNode = [pdfLinkNodes objectAtIndex:0];
+        NSString *hrefValue = [pdfLinkNode stringValueOfAttribute:@"href"];
+
+        // An anchor without an href would otherwise produce a "...(null)" URL.
+        if ([hrefValue length] > 0) {
+            // Standard URL resolution copes with absolute and path-relative hrefs alike;
+            // fall back to plain concatenation if the href is not a well-formed URL string.
+            NSURL *pdfURL = [NSURL URLWithString:hrefValue relativeToURL:pageURL];
+            NSString *pdfURLString = nil;
+            if (pdfURL != nil) {
+                pdfURLString = [pdfURL absoluteString];
+            } else {
+                pdfURLString = [NSString stringWithFormat:@"http://%@%@", serverName, hrefValue];
+            }
+
+            [newPub setField:BDSKUrlString toValue:pdfURLString];
+        }
+    }
+
+    return newPub;
+}
+
+
+@end
+
Modified: trunk/bibdesk/BDSKWebParser.h
===================================================================
--- trunk/bibdesk/BDSKWebParser.h 2009-10-14 20:24:15 UTC (rev 1508)
+++ trunk/bibdesk/BDSKWebParser.h 2009-10-16 18:34:30 UTC (rev 1509)
@@ -41,12 +41,13 @@
enum {
BDSKUnknownWebType = -1,
- BDSKHCiteWebType,
BDSKCiteULikeWebType,
BDSKACMDLWebType,
BDSKHubmedWebType,
BDSKGoogleScholarWebType,
- BDSKCOinSType
+ BDSKIEEEXploreWebType,
+ BDSKCOinSType,
+ BDSKHCiteWebType,
};
@interface BDSKWebParser : NSObject
Modified: trunk/bibdesk/BDSKWebParser.m
===================================================================
--- trunk/bibdesk/BDSKWebParser.m 2009-10-14 20:24:15 UTC (rev 1508)
+++ trunk/bibdesk/BDSKWebParser.m 2009-10-16 18:34:30 UTC (rev 1509)
@@ -45,6 +45,7 @@
#import "BDSKGoogleScholarParser.h"
#import "NSError_BDSKExtensions.h"
#import "BDSKCOinSParser.h"
+#import "BDSKIEEEXploreParser.h"
@implementation BDSKWebParser
@@ -59,6 +60,8 @@
return [BDSKCiteULikeParser class];
case BDSKHubmedWebType:
return [BDSKHubmedParser class];
+ case BDSKIEEEXploreWebType:
+ return [BDSKIEEEXploreParser class];
case BDSKCOinSType:
return [BDSKCOinSParser class];
case BDSKHCiteWebType:
@@ -77,6 +80,9 @@
return BDSKCiteULikeWebType;
if([BDSKACMDLParser canParseDocument:domDocument xmlDocument:xmlDocument fromURL:url])
return BDSKACMDLWebType;
+ if([BDSKIEEEXploreParser canParseDocument:domDocument xmlDocument:xmlDocument fromURL:url])
+ return BDSKIEEEXploreWebType;
+
if([BDSKCOinSParser canParseDocument:domDocument xmlDocument:xmlDocument fromURL:url])
return BDSKCOinSType;
if([BDSKHCiteParser canParseDocument:domDocument xmlDocument:xmlDocument fromURL:url])
Modified: trunk/bibdesk/Bibdesk.xcodeproj/project.pbxproj
===================================================================
--- trunk/bibdesk/Bibdesk.xcodeproj/project.pbxproj 2009-10-14 20:24:15 UTC (rev 1508)
+++ trunk/bibdesk/Bibdesk.xcodeproj/project.pbxproj 2009-10-16 18:34:30 UTC (rev 1509)
@@ -285,6 +285,8 @@
F948BF980FCCA5290058AFB5 /* BDSKStackView.m in Sources */ = {isa = PBXBuildFile; fileRef = F948BF960FCCA5290058AFB5 /* BDSKStackView.m */; };
F94DB0F90B3E2FA1006F37A2 /* BDSKSearchGroup.m in Sources */ = {isa = PBXBuildFile; fileRef = F94DB0F70B3E2FA1006F37A2 /* BDSKSearchGroup.m */; };
F94DE74C09CB46FF00B5FD51 /* BDSKPersistentSearch.m in Sources */ = {isa = PBXBuildFile; fileRef = F94DE74A09CB46FF00B5FD51 /* BDSKPersistentSearch.m */; };
+ F955039D1088ED560014BD99 /* BDSKIEEEXploreParser.h in Headers */ = {isa = PBXBuildFile; fileRef = F955039B1088ED560014BD99 /* BDSKIEEEXploreParser.h */; };
+ F955039E1088ED560014BD99 /* BDSKIEEEXploreParser.m in Sources */ = {isa = PBXBuildFile; fileRef = F955039C1088ED560014BD99 /* BDSKIEEEXploreParser.m */; };
F95B874E0D567E8E005DC4F3 /* BibDesk.qlgenerator in CopyFiles */ = {isa = PBXBuildFile; fileRef = F98C6FD00CDD30A2002FCAD8 /* BibDesk.qlgenerator */; };
F95CC0C3087F5378002C5694 /* NSFileManager_BDSKExtensions.m in Sources */ = {isa = PBXBuildFile; fileRef = F95CC0C1087F5378002C5694 /* NSFileManager_BDSKExtensions.m */; };
F95D3FAF09DDF6A900D793A6 /* BibPref_Sharing.m in Sources */ = {isa = PBXBuildFile; fileRef = F95D3FAD09DDF6A900D793A6 /* BibPref_Sharing.m */; };
@@ -1200,6 +1202,8 @@
F94DB0F70B3E2FA1006F37A2 /* BDSKSearchGroup.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = BDSKSearchGroup.m; sourceTree = "<group>"; };
F94DE74909CB46FF00B5FD51 /* BDSKPersistentSearch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BDSKPersistentSearch.h; sourceTree = "<group>"; };
F94DE74A09CB46FF00B5FD51 /* BDSKPersistentSearch.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = BDSKPersistentSearch.m; sourceTree = "<group>"; };
+ F955039B1088ED560014BD99 /* BDSKIEEEXploreParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BDSKIEEEXploreParser.h; sourceTree = "<group>"; };
+ F955039C1088ED560014BD99 /* BDSKIEEEXploreParser.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = BDSKIEEEXploreParser.m; sourceTree = "<group>"; };
F95C5E950D10E85400C81970 /* BibDesk-Release.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = "BibDesk-Release.xcconfig"; sourceTree = "<group>"; };
F95C5E970D10E87C00C81970 /* BibDesk-Debug.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = "BibDesk-Debug.xcconfig"; sourceTree = "<group>"; };
F95C5E990D10E89500C81970 /* BibDesk-Common.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = "BibDesk-Common.xcconfig"; sourceTree = "<group>"; };
@@ -1846,6 +1850,8 @@
CE38FA8D091D267A00BCB69D /* Parsers */ = {
isa = PBXGroup;
children = (
+ F955039B1088ED560014BD99 /* BDSKIEEEXploreParser.h */,
+ F955039C1088ED560014BD99 /* BDSKIEEEXploreParser.m */,
F9BA452D100E334400FE4435 /* BDSKCOinSParser.h */,
F9BA452E100E334400FE4435 /* BDSKCOinSParser.m */,
F9B39F2F0FACCBBE00FF8853 /* BDSKPubMedXMLParser.m */,
@@ -2358,6 +2364,7 @@
CE424A450D0F123500F824E7 /* BDSKCompletionServerProtocol.h in Headers */,
F948BF970FCCA5290058AFB5 /* BDSKStackView.h in Headers */,
F9BA452F100E334400FE4435 /* BDSKCOinSParser.h in Headers */,
+ F955039D1088ED560014BD99 /* BDSKIEEEXploreParser.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -2839,7 +2846,7 @@
);
runOnlyForDeploymentPostprocessing = 0;
shellPath = /bin/sh;
- shellScript = "/usr/bin/codesign -f -s \"BibDesk Signing Certificate\" \"${BUILT_PRODUCTS_DIR}/${FULL_PRODUCT_NAME}\"";
+ shellScript = "/usr/bin/codesign -f -s \"TeX Live Utility Signing Certificate\" \"${BUILT_PRODUCTS_DIR}/${FULL_PRODUCT_NAME}\"";
};
F9E0B071083E450C000109C6 /* ShellScript */ = {
isa = PBXShellScriptBuildPhase;
@@ -3123,6 +3130,7 @@
F948BF980FCCA5290058AFB5 /* BDSKStackView.m in Sources */,
F93F0A720FF3D41A00B08718 /* BDSKSkimReader.m in Sources */,
F9BA4530100E334400FE4435 /* BDSKCOinSParser.m in Sources */,
+ F955039E1088ED560014BD99 /* BDSKIEEEXploreParser.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|