Brian Fife - 2013-07-28

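If you have bad .cdf files whose rows have 17 columns instead of 18 (the p2p and total counters have run together into a single field), you can run the following script to repair the data. Specify the .cdf file as an argument, and redirect the output to a new file; rows that do not have exactly 17 columns are copied through as they are.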

./parse.py old/log.1.0.cdf > log.1.0.cdf

#!/usr/bin/python
import sys

# Rows with this many comma-separated fields are treated as damaged: their
# field at index 9 holds the p2p and total counters fused into one string.
_BROKEN_COLUMNS = 17


def _split_fused_counts(fused, icmp, udp, tcp):
    """Split a fused "p2p + total" digit string into ``(p2p, total)``.

    Both returned values are strings.

    * If ``fused`` starts with "0", that digit is taken as a zero p2p
      counter and the remainder is the total.
    * Otherwise the sum icmp + udp + tcp is used as a hint for where the
      total starts: the last ``len(str(sum))`` characters of ``fused`` are
      searched for the longest leading part of the sum, and the string is
      cut where that match begins.  (Prefixes rather than the full sum are
      tried presumably because the real total can differ from the sum of
      the three protocol counters -- confirm against the data format.)
    * If nothing matches, ``("0", fused)`` is returned, i.e. the field is
      left unsplit and p2p is reported as 0.

    Raises:
        ValueError: if icmp, udp or tcp is not an integer string.
    """
    if fused.startswith("0"):
        return "0", fused[1:]

    expected = str(int(icmp) + int(udp) + int(tcp))
    # NOTE: offset is negative when the fused field is shorter than the
    # sum; the slices below then count from the end of the string, which
    # is how this heuristic has always behaved.
    offset = len(fused) - len(expected)
    tail = fused[offset:]
    for size in range(len(expected), 0, -1):
        hit = tail.find(expected[:size])
        if hit != -1:
            cut = offset + hit
            return fused[:cut], fused[cut:]
    return "0", fused


def repair_line(line):
    """Return ``line`` with its fused p2p/total field split in two.

    ``line`` is one comma-separated record without its trailing newline.
    Records that do not have exactly 17 fields are returned unchanged.
    For 17-field records, field 9 is split into p2p and total (fields
    10, 11 and 12 are read as the icmp, udp and tcp counters), giving an
    18-field record.

    Raises:
        ValueError: if a 17-field record has non-integer icmp/udp/tcp
            fields (for example a header row).
    """
    cols = line.split(",")
    if len(cols) != _BROKEN_COLUMNS:
        return line
    p2p, total = _split_fused_counts(cols[9], cols[10], cols[11], cols[12])
    return ",".join(cols[:9] + [p2p, total] + cols[10:])


def main(argv):
    """Repair the .cdf file named by ``argv[1]`` and write it to stdout.

    Returns the process exit status: 0 on success, 2 when the file
    argument is missing.  Arguments after the first are ignored.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s FILE.cdf > REPAIRED.cdf\n" % argv[0])
        return 2

    with open(argv[1]) as lines:
        for lineno, raw in enumerate(lines, 1):
            line = raw.rstrip()
            try:
                fixed = repair_line(line)
            except ValueError:
                # Keep going so one odd row (e.g. a header) does not
                # leave a truncated output file; report it on stderr so
                # stdout stays pure data.
                sys.stderr.write(
                    "warning: line %d has non-numeric counters; "
                    "copied unchanged\n" % lineno
                )
                fixed = line
            # sys.stdout.write rather than print so the script runs on
            # both Python 2 and Python 3.
            sys.stdout.write(fixed + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))