~/home of geeks

HTMLEscapeWriter

· 1507 Wörter · 8 Minute(n) Lesedauer

Sehr häufig kommt es vor, dass ich Text in HTML umwandeln und dabei Sonderzeichen in HTML-Entities escapen muss, beispielsweise wenn ich in einer JSP einen Stacktrace oder ein XML-Dokument anzeigen möchte. Ich habe mir die einmalige Mühe gemacht, einen eigenen Writer hierfür zu schreiben.

Dieser versteht es, alle HTML-Sonderzeichen, auch Zeilenumbrüche, korrekt zu escapen und darzustellen, und unterstützt dabei sogar Named Entities (wie z. B. &quot; statt &#34;). Und da er wie alle anderen Writer ebenfalls ein Decorator ist, kann er sehr schön, einfach und Java-konform in den Code eingebunden werden, wie z. B. bei Exceptions: e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));

Very often I need to convert plain text to HTML and escape all of its special characters as HTML entities, especially when working with JSPs or servlets. So I wrote a Writer that handles all escaping on the fly and can emit either numeric character references like &#34; or named entities (where available) like &quot;. Line breaks are also converted to appropriate <br/> tags. Because it is used like any other Java writer decorator, it is easy to use, e.g. to print a stack trace on an HTML page: e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * A {@link Writer} decorator that HTML-escapes everything written through it
 * before handing it to the wrapped writer.
 * <p>
 * Escaping rules:
 * <ul>
 * <li>The markup-significant characters {@code "}, {@code &}, {@code '},
 *     {@code <} and {@code >} are always escaped, in both modes.</li>
 * <li>Every character at or above code point 160 is escaped. In named-entity
 *     mode the HTML 4.0 entity name is used where one exists (for example
 *     {@code &quot;} or {@code &Auml;}); otherwise, and always in numeric mode,
 *     a decimal character reference such as {@code &#34;} is written.</li>
 * <li>Surrogate pairs are combined into a single numeric reference for the
 *     supplementary code point, even when the pair is split across two
 *     {@code write} calls. Unpaired surrogates are replaced by a reference to
 *     U+FFFD (the replacement character).</li>
 * <li>A carriage return, a line feed, or a CR+LF sequence produces exactly one
 *     {@code <br/>}; the original line-break characters are kept after it.</li>
 * <li>In a run of blanks every second blank becomes {@code &nbsp;}, so the
 *     run keeps its width while the text can still wrap.</li>
 * <li>All other characters below code point 160 are passed through unchanged.</li>
 * </ul>
 * <p>
 * Typical use:
 * {@code e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));}
 * <p>
 * Instances keep state between calls (previous blank, previous CR, pending high
 * surrogate) in unsynchronized fields, so a single instance must not be shared
 * between threads without external synchronization.
 * <p>
 * Based on code from S. Bayer found at <a href="http://www.rgagnon.com/javadetails/java-0306.html">rgagnon.com</a>
 * and a list of named HTML 4.0 entities at <a href="http://de.selfhtml.org/html/referenz/zeichen.htm">selfhtml.org</a>.
 */
public class HTMLEscapeWriter extends Writer {

    /** Characters at or above this code point are always written as an entity. */
    private static final int FIRST_ALWAYS_ESCAPED = 160;

    /** Code point written in place of an unpaired surrogate (U+FFFD). */
    private static final int REPLACEMENT_CHARACTER = 0xFFFD;

    /** Marker value of {@link #pendingHighSurrogate} when nothing is pending. */
    private static final int NO_PENDING_SURROGATE = -1;

    /** Read-only table mapping a code point to its HTML 4.0 entity name (without {@code &} and {@code ;}). */
    private static final Map<Integer, String> NAMED_ENTITIES = buildNamedEntities();

    /** The decorated writer that receives the escaped output. */
    private final Writer writer;

    /** If true, named entities are preferred over numeric references where one exists. */
    private final boolean useNamedEntities;

    /** True if the previous character was a blank that was written as a plain space. */
    private boolean lastWasBlankChar = false;

    /** True if the previous character was a carriage return (a following LF must not add a second break). */
    private boolean lastWasCR = false;

    /** A high surrogate waiting for its low surrogate, or {@link #NO_PENDING_SURROGATE}. */
    private int pendingHighSurrogate = NO_PENDING_SURROGATE;

    /**
     * Creates an instance which wraps the given writer and uses named entities.
     *
     * @param writer the writer that receives the escaped output; must not be null
     * @throws NullPointerException if {@code writer} is null
     */
    public HTMLEscapeWriter(Writer writer) {
        this(writer, true);
    }

    /**
     * Creates an instance which wraps the given writer.
     *
     * @param writer the writer that receives the escaped output; must not be null
     * @param useNamedEntities if true uses named entities where possible,
     *        if false always uses numeric character references
     * @throws NullPointerException if {@code writer} is null
     */
    public HTMLEscapeWriter(Writer writer, boolean useNamedEntities) {
        if (writer == null) {
            // Fail here rather than with an anonymous NPE on the first write.
            throw new NullPointerException("writer");
        }
        this.writer = writer;
        this.useNamedEntities = useNamedEntities;
    }

    /**
     * Closes the underlying writer. A high surrogate that is still waiting for
     * its partner is written as a reference to U+FFFD first.
     *
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#close()
     */
    @Override
    public void close() throws IOException {
        try {
            if (pendingHighSurrogate != NO_PENDING_SURROGATE) {
                pendingHighSurrogate = NO_PENDING_SURROGATE;
                writeNumericEntity(REPLACEMENT_CHARACTER);
            }
        }
        finally {
            writer.close();
        }
    }

    /**
     * Flushes the underlying writer. A pending high surrogate is deliberately
     * not written here, because its low surrogate may arrive with the next write.
     *
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#flush()
     */
    @Override
    public void flush() throws IOException {
        writer.flush();
    }

    /**
     * Escapes the given characters as described in the class comment and writes
     * the result to the underlying writer.
     *
     * @param cbuf the characters to escape
     * @param off index of the first character to process
     * @param len number of characters to process
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#write(char[], int, int)
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        final int end = off + len;
        for (int i = off; i < end; i++) {
            final char c = cbuf[i];

            if (pendingHighSurrogate != NO_PENDING_SURROGATE) {
                final char high = (char) pendingHighSurrogate;
                pendingHighSurrogate = NO_PENDING_SURROGATE;
                if (Character.isLowSurrogate(c)) {
                    // Complete pair: one reference for the supplementary code point.
                    writeNumericEntity(Character.toCodePoint(high, c));
                    continue;
                }
                // The high surrogate was never completed; c is handled normally below.
                writeNumericEntity(REPLACEMENT_CHARACTER);
            }

            if (Character.isHighSurrogate(c)) {
                lastWasBlankChar = false;
                lastWasCR = false;
                pendingHighSurrogate = c;
            }
            else if (Character.isLowSurrogate(c)) {
                // Low surrogate without a preceding high surrogate.
                lastWasBlankChar = false;
                lastWasCR = false;
                writeNumericEntity(REPLACEMENT_CHARACTER);
            }
            else {
                writeChar(c);
            }
        }
    }

    /** Handles one non-surrogate character: blank runs, line breaks, then entity escaping. */
    private void writeChar(char c) throws IOException {
        if (c == ' ') {
            lastWasCR = false;
            // Replacing every blank with &nbsp; would disable word wrapping,
            // so only every second blank of a run is made non-breaking.
            if (lastWasBlankChar) {
                lastWasBlankChar = false;
                writer.write("&nbsp;");
            }
            else {
                lastWasBlankChar = true;
                writer.write(' ');
            }
            return;
        }

        lastWasBlankChar = false;

        if (c == '\r') {
            writer.write("<br/>\r");
            lastWasCR = true;
            return;
        }
        if (c == '\n') {
            // The CR of a CR+LF pair has already produced the <br/>.
            if (!lastWasCR) {
                writer.write("<br/>");
            }
            writer.write('\n');
            lastWasCR = false;
            return;
        }

        lastWasCR = false;
        writeEscaped(c);
    }

    /** Writes one ordinary character either unchanged, as a named entity, or as a numeric reference. */
    private void writeEscaped(char c) throws IOException {
        if (c < FIRST_ALWAYS_ESCAPED && !isMarkupSignificant(c)) {
            // Harmless low character: pass through without a table lookup.
            writer.write(c);
            return;
        }
        if (useNamedEntities) {
            // Integer.valueOf((int) c): the table is keyed by Integer, not Character.
            final String name = NAMED_ENTITIES.get(Integer.valueOf((int) c));
            if (name != null) {
                writer.write('&');
                writer.write(name);
                writer.write(';');
                return;
            }
        }
        writeNumericEntity(c);
    }

    /** Returns true for the characters that can open or close markup or attribute values. */
    private static boolean isMarkupSignificant(char c) {
        return c == '"' || c == '&' || c == '\'' || c == '<' || c == '>';
    }

    /** Writes a decimal character reference for the given code point. */
    private void writeNumericEntity(int codePoint) throws IOException {
        writer.write("&#");
        writer.write(Integer.toString(codePoint));
        writer.write(';');
    }

    /**
     * Delegates to the underlying writer.
     * (Useful when wrapping a StringWriter.)
     *
     * @return the result of {@code toString()} on the underlying writer
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /** Builds the read-only table of HTML 4.0 named entities, keyed by code point. */
    private static Map<Integer, String> buildNamedEntities() {
        final Map<Integer, String> m = new HashMap<Integer, String>(512);

        // Markup-significant characters
        m.put(34, "quot");
        m.put(38, "amp");
        m.put(60, "lt");
        m.put(62, "gt");

        // ISO 8859-1 (Latin-1) characters
        m.put(160, "nbsp");
        m.put(161, "iexcl");
        m.put(162, "cent");
        m.put(163, "pound");
        m.put(164, "curren");
        m.put(165, "yen");
        m.put(166, "brvbar");
        m.put(167, "sect");
        m.put(168, "uml");
        m.put(169, "copy");
        m.put(170, "ordf");
        m.put(171, "laquo");
        m.put(172, "not");
        m.put(173, "shy");
        m.put(174, "reg");
        m.put(175, "macr");
        m.put(176, "deg");
        m.put(177, "plusmn");
        m.put(178, "sup2");
        m.put(179, "sup3");
        m.put(180, "acute");
        m.put(181, "micro");
        m.put(182, "para");
        m.put(183, "middot");
        m.put(184, "cedil");
        m.put(185, "sup1");
        m.put(186, "ordm");
        m.put(187, "raquo");
        m.put(188, "frac14");
        m.put(189, "frac12");
        m.put(190, "frac34");
        m.put(191, "iquest");
        m.put(192, "Agrave");
        m.put(193, "Aacute");
        m.put(194, "Acirc");
        m.put(195, "Atilde");
        m.put(196, "Auml");
        m.put(197, "Aring");
        m.put(198, "AElig");
        m.put(199, "Ccedil");
        m.put(200, "Egrave");
        m.put(201, "Eacute");
        m.put(202, "Ecirc");
        m.put(203, "Euml");
        m.put(204, "Igrave");
        m.put(205, "Iacute");
        m.put(206, "Icirc");
        m.put(207, "Iuml");
        m.put(208, "ETH");
        m.put(209, "Ntilde");
        m.put(210, "Ograve");
        m.put(211, "Oacute");
        m.put(212, "Ocirc");
        m.put(213, "Otilde");
        m.put(214, "Ouml");
        m.put(215, "times");
        m.put(216, "Oslash");
        m.put(217, "Ugrave");
        m.put(218, "Uacute");
        m.put(219, "Ucirc");
        m.put(220, "Uuml");
        m.put(221, "Yacute");
        m.put(222, "THORN");
        m.put(223, "szlig");
        m.put(224, "agrave");
        m.put(225, "aacute");
        m.put(226, "acirc");
        m.put(227, "atilde");
        m.put(228, "auml");
        m.put(229, "aring");
        m.put(230, "aelig");
        m.put(231, "ccedil");
        m.put(232, "egrave");
        m.put(233, "eacute");
        m.put(234, "ecirc");
        m.put(235, "euml");
        m.put(236, "igrave");
        m.put(237, "iacute");
        m.put(238, "icirc");
        m.put(239, "iuml");
        m.put(240, "eth");
        m.put(241, "ntilde");
        m.put(242, "ograve");
        m.put(243, "oacute");
        m.put(244, "ocirc");
        m.put(245, "otilde");
        m.put(246, "ouml");
        m.put(247, "divide");
        m.put(248, "oslash");
        m.put(249, "ugrave");
        m.put(250, "uacute");
        m.put(251, "ucirc");
        m.put(252, "uuml");
        m.put(253, "yacute");
        m.put(254, "thorn");
        m.put(255, "yuml");

        // Greek letters
        m.put(913, "Alpha");
        m.put(945, "alpha");
        m.put(914, "Beta");
        m.put(946, "beta");
        m.put(915, "Gamma");
        m.put(947, "gamma");
        m.put(916, "Delta");
        m.put(948, "delta");
        m.put(917, "Epsilon");
        m.put(949, "epsilon");
        m.put(918, "Zeta");
        m.put(950, "zeta");
        m.put(919, "Eta");
        m.put(951, "eta");
        m.put(920, "Theta");
        m.put(952, "theta");
        m.put(921, "Iota");
        m.put(953, "iota");
        m.put(922, "Kappa");
        m.put(954, "kappa");
        m.put(923, "Lambda");
        m.put(955, "lambda");
        m.put(924, "Mu");
        m.put(956, "mu");
        m.put(925, "Nu");
        m.put(957, "nu");
        m.put(926, "Xi");
        m.put(958, "xi");
        m.put(927, "Omicron");
        m.put(959, "omicron");
        m.put(928, "Pi");
        m.put(960, "pi");
        m.put(929, "Rho");
        m.put(961, "rho");
        m.put(931, "Sigma");
        m.put(962, "sigmaf");
        m.put(963, "sigma");
        m.put(932, "Tau");
        m.put(964, "tau");
        m.put(933, "Upsilon");
        m.put(965, "upsilon");
        m.put(934, "Phi");
        m.put(966, "phi");
        m.put(935, "Chi");
        m.put(967, "chi");
        m.put(936, "Psi");
        m.put(968, "psi");
        m.put(937, "Omega");
        m.put(969, "omega");
        m.put(977, "thetasym");
        m.put(978, "upsih");
        m.put(982, "piv");

        // Mathematical symbols
        m.put(8704, "forall");
        m.put(8706, "part");
        m.put(8707, "exist");
        m.put(8709, "empty");
        m.put(8711, "nabla");
        m.put(8712, "isin");
        m.put(8713, "notin");
        m.put(8715, "ni");
        m.put(8719, "prod");
        m.put(8721, "sum");
        m.put(8722, "minus");
        m.put(8727, "lowast");
        m.put(8730, "radic");
        m.put(8733, "prop");
        m.put(8734, "infin");
        m.put(8736, "ang");
        m.put(8743, "and");
        m.put(8744, "or");
        m.put(8745, "cap");
        m.put(8746, "cup");
        m.put(8747, "int");
        m.put(8756, "there4");
        m.put(8764, "sim");
        m.put(8773, "cong");
        m.put(8776, "asymp");
        m.put(8800, "ne");
        m.put(8801, "equiv");
        m.put(8804, "le");
        m.put(8805, "ge");
        m.put(8834, "sub");
        m.put(8835, "sup");
        m.put(8836, "nsub");
        m.put(8838, "sube");
        m.put(8839, "supe");
        m.put(8853, "oplus");
        m.put(8855, "otimes");
        m.put(8869, "perp");
        m.put(8901, "sdot");
        m.put(9674, "loz");

        // Technical symbols
        m.put(8968, "lceil");
        m.put(8969, "rceil");
        m.put(8970, "lfloor");
        m.put(8971, "rfloor");
        m.put(9001, "lang");
        m.put(9002, "rang");

        // Arrows
        m.put(8592, "larr");
        m.put(8593, "uarr");
        m.put(8594, "rarr");
        m.put(8595, "darr");
        m.put(8596, "harr");
        m.put(8629, "crarr");
        m.put(8656, "lArr");
        m.put(8657, "uArr");
        m.put(8658, "rArr");
        m.put(8659, "dArr");
        m.put(8660, "hArr");

        // Miscellaneous symbols
        m.put(8226, "bull");
        m.put(8242, "prime");
        m.put(8243, "Prime");
        m.put(8254, "oline");
        m.put(8260, "frasl");
        m.put(8472, "weierp");
        m.put(8465, "image");
        m.put(8476, "real");
        m.put(8482, "trade");
        m.put(8364, "euro");
        m.put(8501, "alefsym");
        m.put(9824, "spades");
        m.put(9827, "clubs");
        m.put(9829, "hearts");
        m.put(9830, "diams");

        // Latin extended
        m.put(338, "OElig");
        m.put(339, "oelig");
        m.put(352, "Scaron");
        m.put(353, "scaron");
        m.put(376, "Yuml");
        m.put(402, "fnof");

        // Spacing, direction marks and punctuation
        m.put(8194, "ensp");
        m.put(8195, "emsp");
        m.put(8201, "thinsp");
        m.put(8204, "zwnj");
        m.put(8205, "zwj");
        m.put(8206, "lrm");
        m.put(8207, "rlm");
        m.put(8211, "ndash");
        m.put(8212, "mdash");
        m.put(8216, "lsquo");
        m.put(8217, "rsquo");
        m.put(8218, "sbquo");
        m.put(8220, "ldquo");
        m.put(8221, "rdquo");
        m.put(8222, "bdquo");
        m.put(8224, "dagger");
        m.put(8225, "Dagger");
        m.put(8230, "hellip");
        m.put(8240, "permil");
        m.put(8249, "lsaquo");
        m.put(8250, "rsaquo");

        // Spacing modifier letters
        m.put(710, "circ");
        m.put(732, "tilde");

        return Collections.unmodifiableMap(m);
    }

    /**
     * Small demo: writes a few sample strings, escaped, to standard output.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        PrintWriter w = new PrintWriter(new HTMLEscapeWriter(new OutputStreamWriter(System.out)));
        try {
            w.write("test\n");
            // Unicode escapes keep this sample independent of the source file encoding.
            w.write("\u00C4 \u00F6 \u00FC");
            w.write("<tag>xxx</tag>");
            w.println();
            w.println("<tag>\"</tag>");
            w.println("xx");
            w.println();
            w.println("----");
            w.write("x\nx\n\ny\n\ny\n\nq");
            w.flush();
        }
        finally {
            w.close();
        }
    }
}