[Pyxida-cvs-commits] Util-PRP/src/de/ceyco/text HtmlEntities2Latin1Charset.java, NONE, 1.1

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/pyxida/Util-PRP/src/de/ceyco/text
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17572/src/de/ceyco/text

Added Files:
	HtmlEntities2Latin1Charset.java 
Log Message:
Initial checkin

--- NEW FILE: HtmlEntities2Latin1Charset.java ---
/*
 * HtmlEntities2Latin1Charset.java
 * A description of this class is given in the JavaDoc comments
 * below.
 *
 * Created: 10.09.2005 00:57:33
 * Copyright (C) 2005 Christian Ey <ey...@in...>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package de.ceyco.text;

import java.util.HashMap;
import java.util.Map;

/**
 * Converts a <code>String</code> containing HTML entities to
 * a <code>String</code> containing only ISO8859-1 characters.
 *
 * Named entities of the ISO 8859-1 set (plus <code>quot</code>,
 * <code>amp</code>, <code>lt</code> and <code>gt</code>) are replaced, as are
 * decimal and hexadecimal numeric character references whose value is an
 * ISO 8859-1 text character. Anything else is copied to the result unchanged.
 *
 * Uses <a href="http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html">ISO
 * 8859-1 table by Martin Ramsch</a>.
 *
 * @author <a href="mailto:ey...@in...">Christian Ey</a>
 * @version 1.0
 */
public class HtmlEntities2Latin1Charset {

  private static final char ENTITY_START = '&';
  private static final char ENTITY_END = ';';

  /** Marks a numeric character reference when it directly follows the entity start. */
  private static final char NUMERIC_MARKER = '#';

  /** Code point of the first name in {@link #UPPER_BLOCK_NAMES} (no-break space). */
  private static final int UPPER_BLOCK_START = 160;

  /** Highest code point that ISO 8859-1 can represent. */
  private static final int MAX_LATIN1 = 255;

  /**
   * Entity names for the code points 160 to 255, in code point order: the
   * name at index <code>i</code> denotes the character
   * <code>UPPER_BLOCK_START + i</code>. Deriving the characters from the
   * position keeps this source file pure ASCII, so the table does not depend
   * on the encoding the compiler is run with.
   */
  private static final String[] UPPER_BLOCK_NAMES = {
    "nbsp",   "iexcl",  "cent",   "pound",  "curren", "yen",    "brvbar", "sect",   // 160-167
    "uml",    "copy",   "ordf",   "laquo",  "not",    "shy",    "reg",    "macr",   // 168-175
    "deg",    "plusmn", "sup2",   "sup3",   "acute",  "micro",  "para",   "middot", // 176-183
    "cedil",  "sup1",   "ordm",   "raquo",  "frac14", "frac12", "frac34", "iquest", // 184-191
    "Agrave", "Aacute", "Acirc",  "Atilde", "Auml",   "Aring",  "AElig",  "Ccedil", // 192-199
    "Egrave", "Eacute", "Ecirc",  "Euml",   "Igrave", "Iacute", "Icirc",  "Iuml",   // 200-207
    "ETH",    "Ntilde", "Ograve", "Oacute", "Ocirc",  "Otilde", "Ouml",   "times",  // 208-215
    "Oslash", "Ugrave", "Uacute", "Ucirc",  "Uuml",   "Yacute", "THORN",  "szlig",  // 216-223
    "agrave", "aacute", "acirc",  "atilde", "auml",   "aring",  "aelig",  "ccedil", // 224-231
    "egrave", "eacute", "ecirc",  "euml",   "igrave", "iacute", "icirc",  "iuml",   // 232-239
    "eth",    "ntilde", "ograve", "oacute", "ocirc",  "otilde", "ouml",   "divide", // 240-247
    "oslash", "ugrave", "uacute", "ucirc",  "uuml",   "yacute", "thorn",  "yuml"    // 248-255
  };

  /**
   * Maps an entity name (without the surrounding <code>&amp;</code> and
   * <code>;</code>) to its replacement text. Filled once by
   * {@link #buildNamedEntities()} and only read afterwards.
   */
  private static final Map<String, String> NAMED_ENTITIES = buildNamedEntities();

  /**
   * Builds the lookup table for named entities.
   *
   * @return a map from entity name to the single-character replacement
   */
  private static Map<String, String> buildNamedEntities() {
    Map<String, String> entities = new HashMap<String, String>();
    entities.put("quot", "\"");
    entities.put("amp", "&");
    entities.put("lt", "<");
    entities.put("gt", ">");
    for (int i = 0; i < UPPER_BLOCK_NAMES.length; i++) {
      entities.put(UPPER_BLOCK_NAMES[i], String.valueOf((char) (UPPER_BLOCK_START + i)));
    }
    return entities;
  }

  /**
   * Converts a <code>String</code> containing HTML entities to
   * a <code>String</code> containing only ISO8859-1 characters.
   *
   * Entity names are matched case-sensitively. Unknown entities, numeric
   * references outside the ISO 8859-1 text range and an entity start without
   * a terminating <code>;</code> are left in the result exactly as they were
   * in the input. Replacements are never scanned again, so an escaped
   * ampersand followed by <code>lt;</code> yields the five characters of the
   * entity, not a less-than sign.
   *
   * @param htmlString The <code>String</code> containing HTML
   * 	entities
   * @return A <code>String</code> containing only ISO8859-1
   * 	characters; <code>null</code> if <code>htmlString</code> is
   * 	<code>null</code>, and <code>htmlString</code> itself if it contains
   * 	no entity start character
   */
  public static String convert( String htmlString) {
    if (htmlString == null) {
      return null;
    }
    int entityStart = htmlString.indexOf(ENTITY_START);
    if (entityStart < 0) {
      // nothing to do
      return htmlString;
    }

    StringBuilder isoBuilder = new StringBuilder(htmlString.length());
    // everything in front of this index has already been written to isoBuilder
    int copiedUpTo = 0;
    while (entityStart >= 0) {
      int entityEnd = htmlString.indexOf(ENTITY_END, entityStart + 1);
      if (entityEnd < 0) {
        // entity start without matching entity end: the rest is plain text
        break;
      }
      // An entity cannot contain another entity start, so only the last
      // start in front of the terminator can open an entity. The character
      // at entityStart is itself a start, hence the result is never smaller.
      entityStart = htmlString.lastIndexOf(ENTITY_START, entityEnd - 1);

      String isoCharacter = decode(htmlString.substring(entityStart + 1, entityEnd));
      if (isoCharacter != null) {
        isoBuilder.append(htmlString, copiedUpTo, entityStart).append(isoCharacter);
        copiedUpTo = entityEnd + 1;
      }
      // an unknown entity simply stays part of the pending plain text
      entityStart = htmlString.indexOf(ENTITY_START, entityEnd + 1);
    }
    isoBuilder.append(htmlString, copiedUpTo, htmlString.length());
    return isoBuilder.toString();
  }

  /**
   * Resolves the text between an entity start and its terminator.
   *
   * @param entityBody the entity without the leading <code>&amp;</code> and
   * 	the trailing <code>;</code>; may be empty
   * @return the replacement text, or <code>null</code> if the entity is not
   * 	recognised
   */
  private static String decode(String entityBody) {
    if (entityBody.length() > 0 && entityBody.charAt(0) == NUMERIC_MARKER) {
      return decodeNumericReference(entityBody);
    }
    return NAMED_ENTITIES.get(entityBody);
  }

  /**
   * Resolves a decimal (<code>#228</code>) or hexadecimal (<code>#xE4</code>)
   * numeric character reference.
   *
   * @param entityBody the reference body, starting with <code>#</code>
   * @return the referenced character as a one-character string, or
   * 	<code>null</code> if the digits are malformed or the value is not an
   * 	ISO 8859-1 text character
   */
  private static String decodeNumericReference(String entityBody) {
    int radix = 10;
    int position = 1; // skip the '#'
    if (entityBody.length() > 1) {
      char radixMarker = entityBody.charAt(1);
      if (radixMarker == 'x' || radixMarker == 'X') {
        radix = 16;
        position = 2;
      }
    }
    if (position == entityBody.length()) {
      // no digits at all
      return null;
    }

    int codePoint = 0;
    for (; position < entityBody.length(); position++) {
      char c = entityBody.charAt(position);
      // Character.digit also accepts non-ASCII digits; references may not use them
      int digit = (c < 0x80) ? Character.digit(c, radix) : -1;
      if (digit < 0) {
        return null;
      }
      codePoint = codePoint * radix + digit;
      if (codePoint > MAX_LATIN1) {
        // not representable; bailing out here also rules out int overflow
        return null;
      }
    }
    return isLatin1Text(codePoint) ? String.valueOf((char) codePoint) : null;
  }

  /**
   * Tells whether a code point is a tab, line feed, carriage return or a
   * graphic ISO 8859-1 character. The remaining control codes (including
   * 127 to 159) are rejected so that they are never injected into the result.
   *
   * @param codePoint the value of a numeric character reference
   * @return <code>true</code> if the code point may be emitted
   */
  private static boolean isLatin1Text(int codePoint) {
    if (codePoint == '\t' || codePoint == '\n' || codePoint == '\r') {
      return true;
    }
    return (codePoint >= 32 && codePoint <= 126)
        || (codePoint >= UPPER_BLOCK_START && codePoint <= MAX_LATIN1);
  }
}