#!/usr/bin/env python
"""
Decoder for bibliographic data, BibTeX
Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

v.9
(c)2002-08-11 Vidar Bronken Gundersen
http://bibtexml.sf.net/
Reuse approved as long as this notification is kept.
Licence: GPL.

Contributions/thanks to:
Egon Willighagen, http://sf.net/projects/jreferences/
Richard Mahoney (for providing a test case)
"""

import re
import string  # unused here; kept so the module's import surface is unchanged
import sys

# ---------------------------------------------------------------------------
# Output vocabulary.
#
# NOTE(review): in the copy of this script under review every markup literal
# was an empty string and the '&' / '<' escapes were no-ops (the text looks
# as if it went through a tool that strips tags and decodes entities).  The
# element names, namespace and escapes below are reconstructed from the
# BibTeXML project named in the header -- confirm against the upstream file.
# NOTE(review): input text is passed through unchanged, so the declared
# encoding is only correct for ASCII/UTF-8 input -- confirm.
# ---------------------------------------------------------------------------
_XML_DECLARATION = '<?xml version="1.0" encoding="utf-8"?>'
_ROOT_OPEN = '<bibtex:file xmlns:bibtex="http://bibtexml.sf.net/">'
_ROOT_CLOSE = '</bibtex:file>'

# --- patterns used by bibtexdecoder (one washed line at a time) ------------
_ENTRY_START = re.compile(r'@(\w*)\{(.*),')
_ENTRY_END = re.compile(r'^\s*\}\s*$')
_FIELD_BRACED = re.compile(r'\s*(\w+)\s*=\s*\{(.*)\},?')
_FIELD_QUOTED = re.compile(r'\s*(\w+)\s*=\s*"(.*)",?')

# --- patterns used by bibtexwasher (whole file as one string) --------------
# '%' starts a comment unless it is escaped as '\%'.
_COMMENT = re.compile(r'(?<!\\)%[^\n]*')
_WHITESPACE = re.compile(r'\s+')
# month = "1~" # jan,   /   title = 1966 # WGA,   /   month = jan # " ~14--15",
_CONCATENATION = re.compile(
    r'=\s*"?([\w\d~-]+)"?\s*#\s*"?([\w\d\s~-]+)"?,')
# value with no "" or {} delimiter, e.g.  year = 2002,
_BARE_VALUE = re.compile(r'=\s*([\w\d-]+)\s*,')
_BRACED_FIELD_END = re.compile(r'(=\s*\{[^=]*)\},')
_QUOTED_FIELD_END = re.compile(r'",')
_ENTRY_CLOSE = re.compile(r'\}\s*\}')
_ENTRY_OPEN = re.compile(r'(@\w*)\s*(\{[^,\s]*)\s*,')
_BRACED_AMPERSAND = re.compile(r'\{\\&\}')
_ESCAPED_AMPERSAND = re.compile(r'\\&')


def bibtexdecoder(filecontents_source):
    """Translate washed BibTeX lines into BibTeXML markup lines.

    filecontents_source -- sequence of lines as produced by bibtexwasher():
        one '@type{key,' line, one 'field = {data},' or 'field = "data",'
        line per field, and a lone '}' closing each entry.

    Returns a list with one output string per input line (an entry's opening
    and closing strings contain an embedded newline).  Lines that match none
    of the known shapes are passed through with only the character escaping
    applied.
    """
    filecontents = []
    endentry = ''

    for line in filecontents_source:
        # Drop only the line terminator; slicing off the last character
        # unconditionally would eat data on an unterminated final line.
        line = line.rstrip('\r\n')

        # encode character entities ('&' first, so the entities produced
        # for '<' and '>' are not escaped a second time)
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')
        # LaTeX non-breaking space becomes an ordinary space
        line = line.replace('~', ' ')

        start = _ENTRY_START.match(line)
        braced = _FIELD_BRACED.match(line)
        quoted = _FIELD_QUOTED.match(line)

        if start:
            # start item: remember the publication type so the matching
            # closing tags can be emitted when the lone '}' is reached
            arttype = start.group(1).lower()
            artid = start.group(2).replace('"', '&quot;')
            endentry = ('</bibtex:' + arttype + '>\n'
                        '</bibtex:entry>\n')
            line = ('<bibtex:entry id="' + artid + '">\n'
                    '<bibtex:' + arttype + '>')
        elif _ENTRY_END.match(line):
            # end item
            line = endentry
        elif braced or quoted:
            # field, publication info; {data} takes precedence over "data"
            found = braced or quoted
            field = found.group(1).lower()
            data = found.group(2)
            line = ('<bibtex:' + field + '>' + data +
                    '</bibtex:' + field + '>')

        filecontents.append(line)

    return filecontents


def bibtexwasher(filecontents_source):
    """Normalise raw BibTeX text to the one-item-per-line form the decoder reads.

    filecontents_source -- sequence of raw lines of a .bib file.

    Returns a list of newline-terminated lines: '@type{key,' on its own
    line, one field per line, and a lone '}' ending each entry.

    This is a set of regular-expression heuristics, not a BibTeX parser.
    Whatever precedes the first entry is discarded: the entry-start rewrite
    puts two newlines in front of every '@type{key,', and the first two
    resulting lines are dropped.
    """
    # remove comments plus trailing and excessive whitespace
    cleaned = []
    for line in filecontents_source:
        line = _COMMENT.sub('', line)
        line = _WHITESPACE.sub(' ', line).strip()
        if line:
            cleaned.append(line)

    # Join with a space: a line break inside a field value is whitespace,
    # so joining with '' would glue the neighbouring words together.
    text = ' '.join(cleaned)

    # cleanup atypical fields: '#' concatenation, then missing delimiters
    text = _CONCATENATION.sub(r'="\g<1>\g<2>",', text)
    text = _BARE_VALUE.sub(r'="\g<1>",', text)

    # split lines according to preferred syntax scheme
    text = _BRACED_FIELD_END.sub(r'\g<1>},' + '\n', text)
    text = _QUOTED_FIELD_END.sub('",\n', text)

    # end of entry, }\n}
    text = _ENTRY_CLOSE.sub('}\n}', text)

    # start of entry, @type{key},\n
    text = _ENTRY_OPEN.sub('\n\n' + r'\g<1>\g<2>,' + '\n', text)

    # character encoding, reserved latex characters
    text = _BRACED_AMPERSAND.sub('&', text)
    text = _ESCAPED_AMPERSAND.sub('&', text)

    # gather: newline-terminate every line, then drop the two leading
    # lines that sit in front of the first entry
    filecontents = [piece.strip() + '\n' for piece in text.split('\n')]
    return filecontents[2:]


def filehandler(filepath):
    """Convert the BibTeX file at filepath and write the XML to stdout.

    Returns 0 on success.  If the file cannot be read, a message is written
    to stderr (stdout carries the XML document) and 1 is returned; nothing
    is written to stdout in that case.
    """
    try:
        fd = open(filepath, 'r')
        try:
            filecontents_source = fd.readlines()
        finally:
            fd.close()
    except IOError:
        # sys.exc_info() keeps this working on interpreters that predate
        # the "except ... as err" syntax.
        sys.stderr.write('Could not open file: %s (%s)\n'
                         % (filepath, sys.exc_info()[1]))
        return 1

    washeddata = bibtexwasher(filecontents_source)
    outdata = bibtexdecoder(washeddata)

    write = sys.stdout.write
    write(_XML_DECLARATION + '\n')
    write(_ROOT_OPEN + '\n')
    write('\n')
    for line in outdata:
        write(line + '\n')
    write('\n')
    write(_ROOT_CLOSE + '\n')
    return 0


# main program

def main():
    """Command-line entry point: convert the file named by the first argument."""
    if not sys.argv[1:]:
        sys.stderr.write('No input file\n')
        sys.exit(2)

    status = filehandler(sys.argv[1])
    if status:
        sys.exit(status)


if __name__ == "__main__":
    main()

# end python script