Scott R. Turner - 2013-02-17

I've been moving my Web-Harvest config files from 2.0 to 2.1, and put together a little tool to help me. It's actually a Web-Harvest script that translates config files from 2.0 to 2.1.

Using it is pretty straightforward:

  1. Load up the "fixer" config file (below) into Web-Harvest 2.1.
  2. Set your output path to the directory where your Web-Harvest 2.0 config file lives.
  3. Edit the fixer config file so that the "script" variable is set to the name of your file.
  4. Run.
  5. The fixer config file will translate your config file to 2.1 and save it in the same place with "-2.1" added to the name.

Right now the tool takes care of fixing the config tag, replacing var-def with set (and its "name" attribute with "var"), removing overwrite="true" from var-def, replacing var with get (again changing the "name" attribute to "var"), and replacing ${_1} with ${group1} in regexps.

The fixes are pretty specific to how I write Web-Harvest. I didn't try too hard to make them foolproof, so you may find that you have to tweak them. Or you may need other fixes that I didn't need.

If you improve the script, please re-post it for others to benefit!

<?xml version="1.0" encoding="UTF-8"?>

<config xmlns="http://web-harvest.sourceforge.net/schema/2.1/core"
        xmlns:var="http://web-harvest.sourceforge.net/schema/2.1/var"
        xmlns:p="http://web-harvest.sourceforge.net/schema/2.1/param">

  <!-- Input: file name of the Web-Harvest 2.0 script to convert (edit this value) -->
  <set var="script">getscores-historical.xml</set>
  <!-- Output file name: the part of the input name before ".xml", followed by "-2.1.xml" -->
  <set var="scriptNew">
    <regexp>
      <regexp-pattern>^(.*)\.xml</regexp-pattern>
      <regexp-source><get var="script"/></regexp-source>
      <regexp-result><template>${group1}-2.1.xml</template></regexp-result>
    </regexp>
  </set>
  <!-- Load the text of the file named by "script" into the "content" variable -->
  <set var="content">
    <file action="read" path="${script}"/>
  </set>

  <!-- Swap the bare <config> start tag for one that declares the 2.1 core, var and param namespaces -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>&lt;config&gt;</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result>&lt;config xmlns="http://web-harvest.sourceforge.net/schema/2.1/core" xmlns:var="http://web-harvest.sourceforge.net/schema/2.1/var" xmlns:p="http://web-harvest.sourceforge.net/schema/2.1/param"&gt;</regexp-result>
    </regexp>
  </set>

  <!-- Translate "<var-def name=" into "<set var=".
       The pattern is anchored on "<" so only start tags are rewritten, and it
       tolerates any amount of whitespace before "name". The optional
       non-capturing group also consumes an overwrite="true" attribute that
       precedes "name"; without it such a start tag would stay <var-def> while
       its end tag is still rewritten to </set>, leaving the output malformed. -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>&lt;var-def(?:\s+overwrite="true")?\s+name=</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result>&lt;set var=</regexp-result>
    </regexp>
  </set>

  <!-- Rewrite every "/var-def" (the end tags) as "/set" -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>/var-def</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result>/set</regexp-result>
    </regexp>
  </set>

  <!-- Drop every overwrite="true" attribute by replacing it with nothing -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>overwrite="true"</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result/>
    </regexp>
  </set>

  <!-- Rewrite each "<var name=" start tag as "<get var=" -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>&lt;var name=</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result>&lt;get var=</regexp-result>
    </regexp>
  </set>

  <!-- Rewrite each "</var>" end tag as "</get>" -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>&lt;/var&gt;</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result>&lt;/get&gt;</regexp-result>
    </regexp>
  </set>

  <!-- Rename regexp group references: every "${_" prefix becomes "${group",
       so for example "${_2}" turns into "${group2}" (any group number) -->
  <set var="content">
    <regexp replace="true">
      <regexp-pattern>\$\{\_</regexp-pattern>
      <regexp-source><get var="content"/></regexp-source>
      <regexp-result><template>\$\{group</template></regexp-result>
    </regexp>
  </set>

  <!-- Write the translated text held in "content" to the file named by
       "scriptNew" (the original file name with "-2.1" added) -->
  <file action="write" path="${scriptNew}">
    <template>${content}</template>
  </file>

</config>