|
From: Peter P. <pr...@us...> - 2007-11-19 17:50:12
|
/*
 * CVS commit mail:
 *   Update of /cvsroot/pyxida/Util-PRP/src/de/ceyco/text
 *   In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17572/src/de/ceyco/text
 *   Added Files: HtmlEntities2Latin1Charset.java
 *   Log Message: Initial checkin
 *   --- NEW FILE: HtmlEntities2Latin1Charset.java ---
 *
 * Declared in the repository as: package de.ceyco.text;
 */

/*
 * HtmlEntities2Latin1Charset.java
 * A description of this class is given in the JavaDoc comments
 * below.
 *
 * Created: 10.09.2005 00:57:33
 * Copyright (C) 2005 Christian Ey <ey...@in...>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

import java.util.HashMap;
import java.util.Map;

/**
 * Converts a <code>String</code> containing HTML entities to
 * a <code>String</code> containing only ISO8859-1 characters.
 *
 * <p>Recognised are the four markup entities (<code>quot</code>,
 * <code>amp</code>, <code>lt</code>, <code>gt</code>) and the 96 entities for
 * the ISO 8859-1 code points 160 to 255, each in its named form and in its
 * decimal numeric form. Entity names are case-sensitive. Anything else that
 * looks like an entity is copied to the output unchanged.
 *
 * <p>Uses <a href="http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html">ISO
 * 8859-1 table by Martin Ramsch</a>.
 *
 * @author <a href="mailto:ey...@in...">Christian Ey</a>
 * @version 1.0
 */
public class HtmlEntities2Latin1Charset {

    private static final char ENTITY_START = '&';

    private static final char ENTITY_END = ';';

    /** Code point of the first entry in {@link #HIGH_ENTITY_NAMES}. */
    private static final int FIRST_HIGH_CODE = 160;

    /** Code point of the soft hyphen, which is removed instead of converted. */
    private static final int SOFT_HYPHEN_CODE = 173;

    /**
     * Entity names for the code points 160 to 255, in code point order, so
     * that the name at index <code>i</code> belongs to code point
     * <code>FIRST_HIGH_CODE + i</code>.
     */
    private static final String[] HIGH_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /**
     * Lookup table from a complete entity (including the leading ampersand
     * and the trailing semicolon) to its replacement text. Filled once by the
     * static initializer and only read afterwards.
     */
    private static final Map<String, String> ENTITY_TO_LATIN1 =
            new HashMap<String, String>();

    static {
        register("quot", 34);
        register("amp", 38);
        register("lt", 60);
        register("gt", 62);
        for (int i = 0; i < HIGH_ENTITY_NAMES.length; i++) {
            register(HIGH_ENTITY_NAMES[i], FIRST_HIGH_CODE + i);
        }
    }

    /**
     * Registers the named and the decimal numeric form of one entity.
     *
     * The replacement is derived from the code point instead of being written
     * as a literal, so this source file contains ASCII only and compiles the
     * same way regardless of the encoding it is read with.
     *
     * @param name entity name without ampersand and semicolon
     * @param code ISO 8859-1 code point the entity stands for
     */
    private static void register(String name, int code) {
        // The soft hyphen is dropped from the output rather than converted.
        // NOTE(review): the archived table shows a blank as replacement for the
        // non-breaking space; it is assumed to be U+00A0 here, not a plain
        // space -- confirm against the repository copy.
        String replacement = (code == SOFT_HYPHEN_CODE)
                ? ""
                : String.valueOf((char) code);
        ENTITY_TO_LATIN1.put(ENTITY_START + name + ENTITY_END, replacement);
        ENTITY_TO_LATIN1.put(ENTITY_START + "#" + code + ENTITY_END, replacement);
    }

    /**
     * Converts a <code>String</code> containing HTML entities to
     * a <code>String</code> containing only ISO8859-1 characters.
     *
     * <p>The input is scanned left to right. A candidate entity runs from an
     * ampersand to the next semicolon; if another ampersand occurs in
     * between, the text before that ampersand is copied literally and the
     * scan restarts there. Candidates that are not in the table, and an
     * ampersand with no semicolon after it, are copied unchanged.
     *
     * @param htmlString The <code>String</code> containing HTML
     *                   entities, may be <code>null</code>
     * @return A <code>String</code> containing only ISO8859-1
     *         characters; <code>null</code> if the input is
     *         <code>null</code>; the input instance itself if it contains
     *         no ampersand
     */
    public static String convert(String htmlString) {
        if (htmlString == null) {
            return null;
        }
        int start = htmlString.indexOf(ENTITY_START);
        if (start < 0) {
            // no ampersand at all, nothing to do
            return htmlString;
        }

        StringBuilder result = new StringBuilder(htmlString.length());
        result.append(htmlString, 0, start);

        while (start >= 0) {
            int end = htmlString.indexOf(ENTITY_END, start + 1);
            if (end < 0) {
                // ampersand without a closing semicolon: copy the rest as is
                result.append(htmlString, start, htmlString.length());
                break;
            }

            int nextStart = htmlString.indexOf(ENTITY_START, start + 1);
            if (nextStart >= 0 && nextStart < end) {
                // a second ampersand before the semicolon: the first one was
                // literal text, so flush it and retry from the second one
                result.append(htmlString, start, nextStart);
                start = nextStart;
                continue;
            }

            String entity = htmlString.substring(start, end + 1);
            String replacement = ENTITY_TO_LATIN1.get(entity);
            // unknown entities are passed through untouched
            result.append(replacement != null ? replacement : entity);

            // nextStart is now either -1 or located behind the semicolon, so
            // it already is the next candidate; copy the plain text before it
            int plainEnd = (nextStart >= 0) ? nextStart : htmlString.length();
            result.append(htmlString, end + 1, plainEnd);
            start = nextStart;
        }
        return result.toString();
    }
}