Thread: [Assorted-commits] SF.net SVN: assorted: [369] hash-join/trunk/tools/DbPrep.scala
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-02-11 05:18:42
|
Revision: 369 http://assorted.svn.sourceforge.net/assorted/?rev=369&view=rev Author: yangzhang Date: 2008-02-10 21:18:47 -0800 (Sun, 10 Feb 2008) Log Message: ----------- DbPrep can generate larger data sets by repeating n times and using rot encoding Modified Paths: -------------- hash-join/trunk/tools/DbPrep.scala Modified: hash-join/trunk/tools/DbPrep.scala =================================================================== --- hash-join/trunk/tools/DbPrep.scala 2008-02-11 05:17:48 UTC (rev 368) +++ hash-join/trunk/tools/DbPrep.scala 2008-02-11 05:18:47 UTC (rev 369) @@ -1,3 +1,4 @@ +import commons.Collections._ import commons.Control._ import commons.Io._ import java.util.regex._ @@ -15,59 +16,63 @@ val pMovie = Pattern compile """^([^\t]+)\t+(.*)$""" val pActress = Pattern compile """^([^\t]+)\t+([^\t]+)$""" val (doMovies, doActresses) = (true, true) - if (doMovies) { - using (TextReader("movies.list")) { r => - using (TextWriter("movies.dat")) { w => - var line = r.readLine - try { - var body = false - while (line != null) { - if (body && (line contains "----------------")) { - body = false + val nreps = args(0).toInt + using (TextWriter("movies.dat")) { wm => + using (TextWriter("actresses.dat")) { wa => + for (i <- 0 until nreps) { + def xform(s: String) = if (i == 0) s else rot(i, s) + if (doMovies) { + using (TextReader("movies.list")) { r => + var line = r.readLine + try { + var body = false + while (line != null) { + if (body && (line contains "----------------")) { + body = false + } + if (body && line != "") { + val (title, release) = extract(pMovie, line) + wm print (xform(title) + "\0" + release + "\0\0") + } + if (!body && (line contains "=======")) { + body = true + } + line = r.readLine + } + } catch { + case e: Exception => { Console.err.println(line); throw e } } - if (body && line != "") { - val (title, release) = extract(pMovie, line) - w print (title + "\0" + release + "\0\0") - } - if (!body && (line contains "=======")) { - body = true - } 
- line = r.readLine } - } catch { - case e: Exception => { Console.err.println(line); throw e } } - } - } - } - if (doActresses) { - using (TextReader("actresses.list")) { r => - using (TextWriter("actresses.dat")) { w => - var line = r.readLine - try { - var body = false - while (line != null) { - if (body && (line contains "----------------")) { - body = false - } - if (body && line != "") { - val (actress, title) = extract(pActress, line) - w print (actress + "\0" + cleanTitle(title) + "\0") - while (line != "") { - line = r.readLine.trim - if (line != "") { - w print (cleanTitle(title) + "\0") + if (doActresses) { + using (TextReader("actresses.list")) { r => + var line = r.readLine + try { + var body = false + while (line != null) { + if (body && (line contains "----------------")) { + body = false } + if (body && line != "") { + val (actress, title) = extract(pActress, line) + wa print (actress + "\0" + cleanTitle(xform(title)) + "\0") + while (line != "") { + line = r.readLine.trim + if (line != "") { + wa print (cleanTitle(xform(title)) + "\0") + } + } + wa print "\0" + } + if (!body && ((line contains "\t") && (line startsWith "----") && (line endsWith "----"))) { + body = true + } + line = r.readLine } - w print "\0" + } catch { + case e: Exception => { Console.err.println(line); throw e } } - if (!body && ((line contains "\t") && (line startsWith "----") && (line endsWith "----"))) { - body = true - } - line = r.readLine } - } catch { - case e: Exception => { Console.err.println(line); throw e } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-11 15:45:10
|
Revision: 372
http://assorted.svn.sourceforge.net/assorted/?rev=372&view=rev
Author: yangzhang
Date: 2008-02-11 07:45:13 -0800 (Mon, 11 Feb 2008)

Log Message:
-----------
fixed bug: not cleaning up titles on subsequent repetitions

Modified Paths:
--------------
hash-join/trunk/tools/DbPrep.scala

Modified: hash-join/trunk/tools/DbPrep.scala
===================================================================
--- hash-join/trunk/tools/DbPrep.scala 2008-02-11 05:46:03 UTC (rev 371)
+++ hash-join/trunk/tools/DbPrep.scala 2008-02-11 15:45:13 UTC (rev 372)
@@ -55,11 +55,11 @@
 }
 if (body && line != "") {
   val (actress, title) = extract(pActress, line)
-  wa print (actress + "\0" + cleanTitle(xform(title)) + "\0")
+  wa print (actress + "\0" + xform(cleanTitle(title)) + "\0")
   while (line != "") {
     line = r.readLine.trim
     if (line != "") {
-      wa print (cleanTitle(xform(title)) + "\0")
+      wa print (xform(cleanTitle(title)) + "\0")
     }
   }
   wa print "\0"

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-15 04:04:55
|
Revision: 446
http://assorted.svn.sourceforge.net/assorted/?rev=446&view=rev
Author: yangzhang
Date: 2008-02-14 20:05:00 -0800 (Thu, 14 Feb 2008)

Log Message:
-----------
added cleanup to avoid unintentional null terminator

Modified Paths:
--------------
hash-join/trunk/tools/DbPrep.scala

Modified: hash-join/trunk/tools/DbPrep.scala
===================================================================
--- hash-join/trunk/tools/DbPrep.scala 2008-02-15 04:04:44 UTC (rev 445)
+++ hash-join/trunk/tools/DbPrep.scala 2008-02-15 04:05:00 UTC (rev 446)
@@ -22,7 +22,8 @@
 using (TextWriter("movies.dat")) { wm =>
   using (TextWriter("actresses.dat")) { wa =>
     for (i <- 0 until nreps) {
-      def xform(s: String) = if (i == 0) s else rot(i, s)
+      def cleanup(s: String) = s filter (_ != 0) mkString
+      def xform(s: String) = if (i == 0) s else cleanup(rot(i, s))
       if (doMovies) {
         using (TextReader("movies.list")) { r =>
           var line = r.readLine

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |