[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexerapplications/tabby Tabby.java,NONE,1.1 package.h
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-10 03:54:16
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexerapplications/tabby In directory sc8-pr-cvs1:/tmp/cvs-serv24483/src/org/htmlparser/lexerapplications/tabby Added Files: Tabby.java package.html Log Message: Add style checking target to ant build script: ant checkstyle It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory. The rules are in the file htmlparser_checks.xml in the src directory. Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation on source files to follow the style rules. This reduced the number of style violations to roughly 14,000. There are a few issues with the style checker that need to be resolved before it should be taken too seriously. For example: It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on). It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment that's more than 80 characters long. It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box. It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } } But it points out some really interesting things, even if you don't agree with the style guidelines, so it's worth a look. 
--- NEW FILE: Tabby.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexerapplications/tabby/Tabby.java,v $ // $Author: derrickoswald $ // $Date: 2003/09/10 03:38:26 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.lexerapplications.tabby; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Page; /** * Replace tabs with spaces. * Convert tabs to the correct number of spaces according to a tabstop, * change DOS \r\n line endings to Unix \n form, and remove trailing whitespace */ public class Tabby { /** * The default tab stop spacing. */ private static final int DEFAULT_TABSTOP = 4; /** * The file filter to apply. */ protected Filter mFilter; /** * The replacement tab stop size. */ protected int mTabsize; /** * Creates a new instance of Tabby with no file filter and a tab stop of 4. 
*/ public Tabby () { mFilter = null; mTabsize = DEFAULT_TABSTOP; } /** * Creates a new instance of Tabby using the given regular expression and * a tab stop of 4. * @param filter The regular expression to apply to the files searched. */ public Tabby (final String filter) { this (); mFilter = new Filter (filter); } /** Creates a new instance of Tabby. * @param filter The regular expression to apply to the files searched. * @param tabsize The tab stop setting. * @exception IllegalArgumentException If tabsize is not a positive number. */ public Tabby (final String filter, final int tabsize) throws IllegalArgumentException { this (filter); if (0 >= tabsize) throw new IllegalArgumentException ("tab size cannot be negative"); mTabsize = tabsize; } /** * Process the file or directory. * @param file The file to process. */ protected void process (final File file) { File[] files; File f; if (file.isDirectory ()) { files = file.listFiles (mFilter); for (int i = 0; i < files.length; i++) process (files[i]); } else edit (file); } /** * Process the file or directory. * @param file The file to edit. 
*/ protected void edit (final File file) { FileInputStream in; Page page; Cursor cursor; int position; int expected; boolean modified; char ch; int last; StringBuffer buffer; FileOutputStream out; try { in = new FileInputStream (file); buffer = new StringBuffer (in.available ()); try { page = new Page (in, null); cursor = new Cursor (page, 0); position = 0; modified = false; expected = 0; last = -1; while (0 != (ch = page.getCharacter (cursor))) { if (++expected != cursor.getPosition ()) { modified = true; expected = cursor.getPosition (); } if ('\t' == ch) { do { buffer.append (' '); position++; } while (0 != (position % mTabsize)); modified = true; } else if ('\n' == ch) { // check for whitespace on the end of the line if (last + 1 != position) { // remove trailing whitespace last = buffer.length () - (position - last - 1); buffer.setLength (last); modified = true; } buffer.append (ch); position = 0; last = -1; } else { buffer.append (ch); if (!Character.isWhitespace (ch)) last = position; position++; } } } finally { in.close (); } if (modified) { System.out.println (file.getAbsolutePath ()); out = new FileOutputStream (file); out.write (buffer.toString ().getBytes (Page.DEFAULT_CHARSET)); out.close (); } } catch (Exception e) { System.out.println (e); } } /** * Implement a file filter. */ class Filter implements FileFilter { /** * The compiled expression. */ protected Pattern mExpression; /** * Create a file filter from the regular expression. * @param expression The <a href="http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html#sum">regular expression</a>. * A useful regular expression is ".*\.java" which accepts all * .java files. * @exception IllegalArgumentException If the expression is * <code>null</code>. * @exception PatternSyntaxException If the expression is not a valid * regular expression. 
*/ public Filter (final String expression) throws PatternSyntaxException { if (null == expression) throw new IllegalArgumentException ( "filter expression cannot be null"); mExpression = Pattern.compile (expression); } // // FileFilter interface // /** * Tests whether or not the file should be included in a pathname list. * @param pathname The abstract pathname to be tested. * @return <code>true</code> if and only if <code>pathname</code> * should be included. */ public boolean accept (final File pathname) { Matcher matcher; boolean ret; // match directories if (pathname.isDirectory ()) ret = true; else { matcher = mExpression.matcher (pathname.getAbsolutePath ()); ret = matcher.matches (); } return (ret); } } /** * Run Tabby on a file or directory. * @param args The command line arguments. * <PRE> * args[0] The file or directory to work on. * args[1] Optional, the regular expression to use as a file filter * args[2] Optional, the tab stop setting (integer). * </PRE> */ public static void main (final String[] args) { Tabby tabby; File file; if (0 == args.length) System.out.println ( "usage: Tabby (<directory>|<file>)" + " [file-match regexp] [tabsize]"); else { if (2 < args.length) tabby = new Tabby (args[1], Integer.parseInt (args[2])); else if (1 < args.length) tabby = new Tabby (args[1]); else tabby = new Tabby (); file = new File (args[0]); tabby.process (file); } } } /* * Revision Control Modification History * * $Log: Tabby.java,v $ * Revision 1.1 2003/09/10 03:38:26 derrickoswald * Add style checking target to ant build script: * ant checkstyle * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory. * The rules are in the file htmlparser_checks.xml in the src directory. * * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000. 
* * There are a few issues with the style checker that need to be resolved before it should be taken too seriously. * For example: * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on). * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment * that's more than 80 characters long. * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box. * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } } * * But it points out some really interesting things, even if you don't agree with the style guidelines, * so it's worth a look. * * */ --- NEW FILE: package.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <HTML> <HEAD> <TITLE>Tabby</TITLE> </HEAD> <BODY> The Tabby program is a demonstration of how to use the underlying Lexer classes to perform file I/O. The results could also be achieved with a normal Reader object but then there wouldn't be anything interesting about this. <p> The task is to replace tabs with spaces and preserve any indentation that was a combination of tabs and spaces. This means honouring the 'tab stops' that would be used by an editor and not just a global search and replace. <p> The mechanism used is to keep track of the character offset from the beginning of a line and when a tab is encountered add spaces till the next tab stop. <p> At the same time any "\r\n" combinations that are in the file are converted to simple "\n" characters as found on Unix/Linux systems, and trailing whitespace at the ends of lines is removed. <p> The trick is to only write the file if something in the file required changes, so a boolean variable is kept current with a <code>true</code> value indicating the file needs to be modified. </BODY> </HTML> |