Meta - Script to build manual from wiki content

junkyardsparkle

This is a crude script to scrape the contents of the Manual and How To pages from the wiki and create a set of rudimentary HTML files from which other document formats can be created. If you make improvements, increment the version number.
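
To use it, save the script (the name build-manual.sh below is just an example), make it executable, and run it from the directory where the output should end up, since everything is written into the current working directory:

chmod +x build-manual.sh
./build-manual.sh

It creates two directories there: qtractor_sf_wiki_pages (the raw wget mirror) and html_out (the cleaned-up pages and images).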

#!/bin/bash

# script to download and extract the content of the qtractor manual
# located on the sourceforge.net wiki, directly from the wiki pages.
# version 0.1

OUT_DIR="$PWD"

WGET_MANUAL="http://sourceforge.net/p/qtractor/wiki/Manual - Table of Contents/"
WGET_HOWTO="http://sourceforge.net/p/qtractor/wiki/How To - Contents/"

WGET_DIRS="/p/qtractor/wiki/Manual - *,\
/p/qtractor/wiki/Manual - */attachment,\
/p/qtractor/wiki/How To - *,\
/p/qtractor/wiki/How To - */attachment"

# ugly page scrape hacks live in here:
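# - the first sed keeps everything from the markdown content div to the end,
# - the second drops the trailing "create wiki page" placeholder block,
# - the third rewrites links so that pages and images resolve in one flat
#   output directory (e.g. ../Some Page/index.html -> Some Page.html).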
extractBody ()
{
  # cut out the part that we want...
  sed '/<div class="markdown_content"/,/<\/html>/!d' "$INFILE" |
  sed '/<div id="create_wiki_page_holder"/,/<\/html>/d' |
  # adapt references to flat directory structure...
  sed '{
  s/\.\.\/\(.*\)\/index\.html/\1.html/
  s/"index.html#/"#/
  s/<img src="attachment\//<img src="/
  }'
}

cd "$OUT_DIR"

# first we pull down the wiki pages we want, in this case
# the ones whose titles are prefixed with "Manual" or "How To".
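# wget notes: --cut-dirs=3 strips the leading /p/qtractor/wiki/ components
# from the saved paths, --convert-links rewrites links for local browsing,
# --include-directories limits recursion to the manual/howto pages and their
# attachments, and --accept keeps only the html and image files we need.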
rm -rf qtractor_sf_wiki_pages

wget \
  --directory-prefix=qtractor_sf_wiki_pages \
  --no-host-directories \
  --cut-dirs=3 \
  --convert-links \
  --recursive \
  --include-directories "$WGET_DIRS" \
  --accept .html,.png,.jpg,.jpeg \
  "$WGET_MANUAL" "$WGET_HOWTO"

# make a nice clean place to build our shiny new manual...
rm -rf html_out; mkdir html_out

cd qtractor_sf_wiki_pages
for title in Manual* How\ To*
do
  INFILE="$title/index.html"
  OUTFILE="../html_out/$title.html"
  # build a minimal html page...
  echo "<HTML><HEAD><TITLE>$title</TITLE></HEAD><BODY>" > "$OUTFILE"
  extractBody >> "$OUTFILE"
  echo "</BODY></HTML>" >> "$OUTFILE"
  # copy over the image files, if any...
  if [ -d "$title/attachment" ]
  then cp -t ../html_out/ "$title"/attachment/*
  fi
done
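
The pages in html_out/ are plain HTML, so a converter such as pandoc can turn them into other formats. A rough sketch, assuming pandoc is installed (the title and output file names are arbitrary, and the page order follows the shell glob, so you may prefer to list the files explicitly):

# single EPUB built from all of the generated pages
pandoc html_out/*.html --metadata title="Qtractor Manual" -o qtractor-manual.epub

# or one standalone HTML file
pandoc html_out/*.html --metadata title="Qtractor Manual" -s -o qtractor-manual.html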