Sehr häufig kommt es vor, dass ich Text in HTML umwandeln und dabei Sonderzeichen in HTML-Entities escapen muss, beispielsweise wenn ich in einer JSP einen Stacktrace oder ein XML-Dokument anzeigen möchte. Ich habe mir die einmalige Mühe gemacht, einen eigenen Writer hierfür zu schreiben.
Dieser versteht es, alle HTML-Sonderzeichen, auch Zeilenumbrüche, korrekt zu escapen und darzustellen, und das sogar mit einer Unterstützung für Named Entities (wie z. B. &quot; statt &#34;).
Und da er wie alle anderen Writer ebenfalls ein Decorator ist, kann er sehr schön, einfach und Java-konform in den Code eingebunden werden, wie z. B. bei Exceptions (printStackTrace erwartet einen PrintWriter, daher die zusätzliche Hülle): e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));
Very often I need to convert plain text to HTML and therefore have to convert all special characters of the text to HTML escapes, especially when working with JSPs or servlets. So I wrote a Writer which handles all escaping on the fly and is able to convert either to numeric character references like &#34; or to named entities (if available) like &quot;. Line breaks are also converted to appropriate <br/> tags.
Because it is used like any other Java writer decorator, it is easy to use, e.g. to print a stack trace on an HTML page (printStackTrace expects a PrintWriter, hence the extra wrapper):
e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.HashMap;
/**
 * A {@link Writer} decorator that turns plain text into HTML-safe text while it is
 * being written to the wrapped writer.
 * <p>
 * Conversions performed:
 * <ul>
 * <li>The markup characters {@code & < > "} and {@code '} are always escaped.</li>
 * <li>Characters with a code of 160 or above are escaped, either as an HTML 4.0 named
 *     entity (like {@code &quot;}) or as a numeric character reference (like
 *     {@code &#34;}). Named entities are only used for characters HTML 4.0 defines a
 *     name for; everything else falls back to the numeric form.</li>
 * <li>Characters outside the Basic Multilingual Plane (surrogate pairs) are written as a
 *     single numeric reference to their code point; an unpaired surrogate is replaced by
 *     {@code &#65533;} (U+FFFD).</li>
 * <li>Every line break (CR, LF or CR+LF) is preceded by one {@code <br/>} tag; the
 *     original line-break characters are kept.</li>
 * <li>In a run of blanks every second blank becomes {@code &nbsp;}, so the run is
 *     neither collapsed by the browser nor prevents word wrapping.</li>
 * </ul>
 * All other characters below 160 (including the range 128-159) are passed through
 * unchanged.
 * <p>
 * Typical use: {@code e.printStackTrace(new PrintWriter(new HTMLEscapeWriter(out)));}
 * <p>
 * Instances keep state between {@code write} calls (blank, CR and surrogate tracking)
 * and do no synchronization of their own.
 * <p>
 * Based on code from S. Bayer found at <a href="http://www.rgagnon.com/javadetails/java-0306.html">rgagnon.com</a>
 * and a list of named HTML 4.0 entities at <a href="http://de.selfhtml.org/html/referenz/zeichen.htm">selfhtml.org</a>.
 */
public class HTMLEscapeWriter extends Writer {

    /** Maps a character code to its HTML 4.0 entity name (without {@code &} and {@code ;}). */
    private static final HashMap<Integer, String> NAMED_ENTITIES = buildNamedEntities();

    /** Tag emitted once in front of every line break. */
    private static final String LINE_BREAK_TAG = "<br/>";

    /** Numeric reference to U+FFFD, written in place of an unpaired surrogate. */
    private static final String REPLACEMENT_REFERENCE = "&#65533;";

    /** Characters with a code at or above this value are always escaped. */
    private static final int FIRST_ALWAYS_ESCAPED_CODE = 160;

    private final Writer writer;
    private final boolean useNamedEntities;

    /** True if the previous character was a blank that was written as a plain blank. */
    private boolean lastWasBlankChar = false;

    /** True if the previous character was a CR, so a directly following LF needs no second tag. */
    private boolean lastWasCR = false;

    /** High surrogate waiting for its low surrogate, or 0 if none is pending. */
    private char pendingHighSurrogate = 0;

    /**
     * Creates an instance which wraps the given writer and uses named entities
     * where HTML 4.0 defines one.
     *
     * @param writer the writer that receives the escaped output; must not be null
     * @throws NullPointerException if {@code writer} is null
     */
    public HTMLEscapeWriter(Writer writer) {
        this(writer, true);
    }

    /**
     * Creates an instance which wraps the given writer.
     *
     * @param writer the writer that receives the escaped output; must not be null
     * @param useNamedEntities if true, named entities are used where possible;
     *        if false, numeric character references are used for every escaped character
     * @throws NullPointerException if {@code writer} is null
     */
    public HTMLEscapeWriter(Writer writer, boolean useNamedEntities) {
        if (writer == null) {
            // Fail here rather than with an obscure NullPointerException on the first write.
            throw new NullPointerException("writer");
        }
        this.writer = writer;
        this.useNamedEntities = useNamedEntities;
    }

    /**
     * Writes a replacement for a still pending unpaired high surrogate, if any,
     * and closes the underlying writer.
     *
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#close()
     */
    @Override
    public void close() throws IOException {
        try {
            if (pendingHighSurrogate != 0) {
                pendingHighSurrogate = 0;
                writer.write(REPLACEMENT_REFERENCE);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Delegates to the underlying writer. A pending high surrogate is kept back,
     * because its low surrogate may still arrive with the next write.
     *
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#flush()
     */
    @Override
    public void flush() throws IOException {
        writer.flush();
    }

    /**
     * Escapes the given characters as described in the class comment and writes
     * the result to the underlying writer.
     *
     * @param cbuf the characters to write
     * @param off index of the first character to write
     * @param len number of characters to write
     * @throws IOException if the underlying writer fails
     * @see java.io.Writer#write(char[], int, int)
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        int end = off + len;
        for (int i = off; i < end; i++) {
            char c = cbuf[i];

            if (pendingHighSurrogate != 0) {
                char high = pendingHighSurrogate;
                pendingHighSurrogate = 0;
                if (Character.isLowSurrogate(c)) {
                    // Complete pair: one reference to the real code point.
                    writeNumericReference(Character.toCodePoint(high, c));
                    continue;
                }
                // The high surrogate was not followed by its partner; c is handled below.
                writer.write(REPLACEMENT_REFERENCE);
            }

            if (c == ' ') {
                writeBlank();
            } else if (c == '\r' || c == '\n') {
                writeLineBreak(c);
            } else {
                lastWasBlankChar = false;
                lastWasCR = false;
                if (Character.isHighSurrogate(c)) {
                    // Wait for the low surrogate, which may arrive in a later write call.
                    pendingHighSurrogate = c;
                } else if (Character.isLowSurrogate(c)) {
                    writer.write(REPLACEMENT_REFERENCE);
                } else {
                    writeEscaped(c);
                }
            }
        }
    }

    /**
     * Writes a blank. Replacing all blanks by {@code &nbsp;} would disable word
     * wrapping, writing them all as plain blanks would let the browser collapse them,
     * so plain and non-breaking blanks alternate within a run.
     */
    private void writeBlank() throws IOException {
        lastWasCR = false;
        if (lastWasBlankChar) {
            lastWasBlankChar = false;
            writer.write("&nbsp;");
        } else {
            lastWasBlankChar = true;
            writer.write(' ');
        }
    }

    /**
     * Writes a CR or LF, preceded by a line-break tag unless the character is the LF
     * of a CR+LF sequence whose CR already produced the tag.
     */
    private void writeLineBreak(char c) throws IOException {
        lastWasBlankChar = false;
        boolean lfOfCrLf = (c == '\n') && lastWasCR;
        if (!lfOfCrLf) {
            writer.write(LINE_BREAK_TAG);
        }
        writer.write(c);
        lastWasCR = (c == '\r');
    }

    /** Writes a single non-blank, non-line-break, non-surrogate character, escaped if necessary. */
    private void writeEscaped(char c) throws IOException {
        if (useNamedEntities) {
            String name = NAMED_ENTITIES.get(Integer.valueOf((int) c));
            if (name != null) {
                writer.write('&');
                writer.write(name);
                writer.write(';');
                return;
            }
        }
        writeAsUnicodeEscape(c);
    }

    /**
     * Writes the character as a numeric character reference if it is a markup character
     * or has a code of 160 or above; otherwise writes it unchanged.
     */
    private void writeAsUnicodeEscape(char c) throws IOException {
        if (c >= FIRST_ALWAYS_ESCAPED_CODE || isMarkupChar(c)) {
            writeNumericReference(c);
        } else {
            writer.write(c);
        }
    }

    /** Writes {@code &#<codePoint>;} using the decimal value of the code point. */
    private void writeNumericReference(int codePoint) throws IOException {
        writer.write("&#");
        writer.write(Integer.toString(codePoint));
        writer.write(';');
    }

    /** Returns true for the characters that must never reach an HTML page unescaped. */
    private static boolean isMarkupChar(char c) {
        return c == '&' || c == '<' || c == '>' || c == '"' || c == '\'';
    }

    /**
     * Delegates to the underlying writer.
     * (Useful when wrapping a StringWriter.)
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /**
     * Small demo that writes a few escaped sample lines to standard output.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        PrintWriter w = new PrintWriter(new HTMLEscapeWriter(new OutputStreamWriter(System.out)));
        w.write("test\n");
        w.write("\u00C4 \u00F6 \u00FC");
        w.write("<tag>xxx</tag>");
        w.println();
        w.println("<tag>\"</tag>");
        w.println("xx");
        w.println();
        w.println("----");
        w.write("x\nx\n\ny\n\ny\n\nq");
        w.flush();
        // PrintWriter never throws IOException; failures are only visible through checkError().
        if (w.checkError()) {
            System.err.println("Writing the demo output failed.");
        }
        w.close();
    }

    /** Builds the table of HTML 4.0 named entities, keyed by character code. */
    private static HashMap<Integer, String> buildNamedEntities() {
        HashMap<Integer, String> m = new HashMap<Integer, String>(512);

        // Markup characters
        m.put(34, "quot");
        m.put(38, "amp");
        m.put(60, "lt");
        m.put(62, "gt");

        // ISO 8859-1 symbols
        m.put(160, "nbsp");
        m.put(161, "iexcl");
        m.put(162, "cent");
        m.put(163, "pound");
        m.put(164, "curren");
        m.put(165, "yen");
        m.put(166, "brvbar");
        m.put(167, "sect");
        m.put(168, "uml");
        m.put(169, "copy");
        m.put(170, "ordf");
        m.put(171, "laquo");
        m.put(172, "not");
        m.put(173, "shy");
        m.put(174, "reg");
        m.put(175, "macr");
        m.put(176, "deg");
        m.put(177, "plusmn");
        m.put(178, "sup2");
        m.put(179, "sup3");
        m.put(180, "acute");
        m.put(181, "micro");
        m.put(182, "para");
        m.put(183, "middot");
        m.put(184, "cedil");
        m.put(185, "sup1");
        m.put(186, "ordm");
        m.put(187, "raquo");
        m.put(188, "frac14");
        m.put(189, "frac12");
        m.put(190, "frac34");
        m.put(191, "iquest");

        // ISO 8859-1 letters
        m.put(192, "Agrave");
        m.put(193, "Aacute");
        m.put(194, "Acirc");
        m.put(195, "Atilde");
        m.put(196, "Auml");
        m.put(197, "Aring");
        m.put(198, "AElig");
        m.put(199, "Ccedil");
        m.put(200, "Egrave");
        m.put(201, "Eacute");
        m.put(202, "Ecirc");
        m.put(203, "Euml");
        m.put(204, "Igrave");
        m.put(205, "Iacute");
        m.put(206, "Icirc");
        m.put(207, "Iuml");
        m.put(208, "ETH");
        m.put(209, "Ntilde");
        m.put(210, "Ograve");
        m.put(211, "Oacute");
        m.put(212, "Ocirc");
        m.put(213, "Otilde");
        m.put(214, "Ouml");
        m.put(215, "times");
        m.put(216, "Oslash");
        m.put(217, "Ugrave");
        m.put(218, "Uacute");
        m.put(219, "Ucirc");
        m.put(220, "Uuml");
        m.put(221, "Yacute");
        m.put(222, "THORN");
        m.put(223, "szlig");
        m.put(224, "agrave");
        m.put(225, "aacute");
        m.put(226, "acirc");
        m.put(227, "atilde");
        m.put(228, "auml");
        m.put(229, "aring");
        m.put(230, "aelig");
        m.put(231, "ccedil");
        m.put(232, "egrave");
        m.put(233, "eacute");
        m.put(234, "ecirc");
        m.put(235, "euml");
        m.put(236, "igrave");
        m.put(237, "iacute");
        m.put(238, "icirc");
        m.put(239, "iuml");
        m.put(240, "eth");
        m.put(241, "ntilde");
        m.put(242, "ograve");
        m.put(243, "oacute");
        m.put(244, "ocirc");
        m.put(245, "otilde");
        m.put(246, "ouml");
        m.put(247, "divide");
        m.put(248, "oslash");
        m.put(249, "ugrave");
        m.put(250, "uacute");
        m.put(251, "ucirc");
        m.put(252, "uuml");
        m.put(253, "yacute");
        m.put(254, "thorn");
        m.put(255, "yuml");

        // Greek letters
        m.put(913, "Alpha");
        m.put(945, "alpha");
        m.put(914, "Beta");
        m.put(946, "beta");
        m.put(915, "Gamma");
        m.put(947, "gamma");
        m.put(916, "Delta");
        m.put(948, "delta");
        m.put(917, "Epsilon");
        m.put(949, "epsilon");
        m.put(918, "Zeta");
        m.put(950, "zeta");
        m.put(919, "Eta");
        m.put(951, "eta");
        m.put(920, "Theta");
        m.put(952, "theta");
        m.put(921, "Iota");
        m.put(953, "iota");
        m.put(922, "Kappa");
        m.put(954, "kappa");
        m.put(923, "Lambda");
        m.put(955, "lambda");
        m.put(924, "Mu");
        m.put(956, "mu");
        m.put(925, "Nu");
        m.put(957, "nu");
        m.put(926, "Xi");
        m.put(958, "xi");
        m.put(927, "Omicron");
        m.put(959, "omicron");
        m.put(928, "Pi");
        m.put(960, "pi");
        m.put(929, "Rho");
        m.put(961, "rho");
        m.put(931, "Sigma");
        m.put(962, "sigmaf");
        m.put(963, "sigma");
        m.put(932, "Tau");
        m.put(964, "tau");
        m.put(933, "Upsilon");
        m.put(965, "upsilon");
        m.put(934, "Phi");
        m.put(966, "phi");
        m.put(935, "Chi");
        m.put(967, "chi");
        m.put(936, "Psi");
        m.put(968, "psi");
        m.put(937, "Omega");
        m.put(969, "omega");
        m.put(977, "thetasym");
        m.put(978, "upsih");
        m.put(982, "piv");

        // Mathematical symbols
        m.put(8704, "forall");
        m.put(8706, "part");
        m.put(8707, "exist");
        m.put(8709, "empty");
        m.put(8711, "nabla");
        m.put(8712, "isin");
        m.put(8713, "notin");
        m.put(8715, "ni");
        m.put(8719, "prod");
        m.put(8721, "sum");
        m.put(8722, "minus");
        m.put(8727, "lowast");
        m.put(8730, "radic");
        m.put(8733, "prop");
        m.put(8734, "infin");
        m.put(8736, "ang");
        m.put(8743, "and");
        m.put(8744, "or");
        m.put(8745, "cap");
        m.put(8746, "cup");
        m.put(8747, "int");
        m.put(8756, "there4");
        m.put(8764, "sim");
        m.put(8773, "cong");
        m.put(8776, "asymp");
        m.put(8800, "ne");
        m.put(8801, "equiv");
        m.put(8804, "le");
        m.put(8805, "ge");
        m.put(8834, "sub");
        m.put(8835, "sup");
        m.put(8836, "nsub");
        m.put(8838, "sube");
        m.put(8839, "supe");
        m.put(8853, "oplus");
        m.put(8855, "otimes");
        m.put(8869, "perp");
        m.put(8901, "sdot");
        m.put(9674, "loz");

        // Technical symbols
        m.put(8968, "lceil");
        m.put(8969, "rceil");
        m.put(8970, "lfloor");
        m.put(8971, "rfloor");
        m.put(9001, "lang");
        m.put(9002, "rang");

        // Arrows
        m.put(8592, "larr");
        m.put(8593, "uarr");
        m.put(8594, "rarr");
        m.put(8595, "darr");
        m.put(8596, "harr");
        m.put(8629, "crarr");
        m.put(8656, "lArr");
        m.put(8657, "uArr");
        m.put(8658, "rArr");
        m.put(8659, "dArr");
        m.put(8660, "hArr");

        // Miscellaneous symbols
        m.put(8226, "bull");
        m.put(8242, "prime");
        m.put(8243, "Prime");
        m.put(8254, "oline");
        m.put(8260, "frasl");
        m.put(8472, "weierp");
        m.put(8465, "image");
        m.put(8476, "real");
        m.put(8482, "trade");
        m.put(8364, "euro");
        m.put(8501, "alefsym");
        m.put(9824, "spades");
        m.put(9827, "clubs");
        m.put(9829, "hearts");
        m.put(9830, "diams");

        // Extended Latin letters
        m.put(338, "OElig");
        m.put(339, "oelig");
        m.put(352, "Scaron");
        m.put(353, "scaron");
        m.put(376, "Yuml");
        m.put(402, "fnof");

        // Spacing, punctuation and modifier characters
        m.put(8194, "ensp");
        m.put(8195, "emsp");
        m.put(8201, "thinsp");
        m.put(8204, "zwnj");
        m.put(8205, "zwj");
        m.put(8206, "lrm");
        m.put(8207, "rlm");
        m.put(8211, "ndash");
        m.put(8212, "mdash");
        m.put(8216, "lsquo");
        m.put(8217, "rsquo");
        m.put(8218, "sbquo");
        m.put(8220, "ldquo");
        m.put(8221, "rdquo");
        m.put(8222, "bdquo");
        m.put(8224, "dagger");
        m.put(8225, "Dagger");
        m.put(8230, "hellip");
        m.put(8240, "permil");
        m.put(8249, "lsaquo");
        m.put(8250, "rsaquo");
        m.put(710, "circ");
        m.put(732, "tilde");

        return m;
    }
}