// XMLEncoder.java
/*
* $Id: XMLEncoder.java,v 1.210 2005/10/24 09:54:53 agoubard Exp $
*/
package org.znerd.xmlenc;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
/**
* Encodes character streams for an XML document.
*
* <p>The following encodings are supported:
*
* <ul>
* <li><code>UTF-8</code>
* <li><code>UTF-16</code>
* <li><code>US-ASCII</code>, with alias <code>ASCII</code>
* <li>all <code>ISO-8859</code> encodings
* </ul>
*
* @version $Revision: 1.210 $ $Date: 2005/10/24 09:54:53 $
* @author Ernst de Haan (<a href="mailto:ernst.dehaan@nl.wanadoo.com">ernst.dehaan@nl.wanadoo.com</a>)
* @author Jochen Schwoerer (j.schwoerer [at] web.de)
* @author Anthony Goubard (<a href="mailto:anthony.goubard@nl.wanadoo.com">anthony.goubard@nl.wanadoo.com</a>)
*
* @since xmlenc 0.1
*/
public class XMLEncoder {

   // For this encoder, different Unicode characters are treated differently.
   //
   // Within attribute values, the following applies:
   //
   // ID   Dec     Description          Escaping
   // ___  ______  ___________________  ______________________________
   //
   // A    0-8     Control characters   -- Not allowed in XML 1.0 --
   // B    9-10    Normal characters    Never needed
   // C    11-12   Control characters   -- Not allowed in XML 1.0 --
   // D    13      Normal character     Never needed
   // E    14-31   Control characters   -- Not allowed in XML 1.0 --
   // F    32-33   Normal characters    Never needed
   // G    34      Quote (")            If quotation mark
   // H    35-37   Normal characters    Never needed
   // I    38      Ampersand (&)        If escapeAmpersands=true
   // J    39      Apostrophe (')       If quotation mark
   // K    40-59   Normal characters    Never needed
   // L    60      Less than (<)        Always
   // M    61      Normal character     Never needed
   // N    62      Greater than (>)     Always
   // O    63-127  Normal characters    Never needed
   // P    128+    Normal characters    If encoding is treated as 7-bit
   //
   // Outside attribute values, the following applies:
   //
   // ID   Dec     Description          Escaping
   // ___  ______  ___________________  ______________________________
   //
   // A    0-8     Control characters   -- Not allowed in XML 1.0 --
   // B    9-10    Normal characters    Never needed
   // C    11-12   Control characters   -- Not allowed in XML 1.0 --
   // D    13      Normal character     Never needed
   // E    14-31   Control characters   -- Not allowed in XML 1.0 --
   // FGH  32-37   Normal characters    Never needed
   // I    38      Ampersand (&)        If escapeAmpersands=true
   // JK   39-59   Normal characters    Never needed
   // L    60      Less than (<)        Always
   // M    61      Normal character     Never needed
   // N    62      Greater than (>)     Always
   // O    63-127  Normal characters    Never needed
   // P    128+    Normal characters    If encoding is treated as 7-bit
   //
   // The following characters are expected to be encountered the most often:
   //
   // 32       Space                    Part of range F
   // 10       Linefeed                 Part of range B
   // 13       Carriage return          Range D
   // 48-57    Digits 0-9               Part of range K
   // 65-90    Uppercase letters A-Z    Part of range O
   // 97-122   Lowercase letters a-z    Part of range O
   //
   // After that, the following characters are expected to be encountered the
   // most often:
   //
   // 9        Tab                      Part of range B
   // 33       Exclamation mark         Part of range F
   // 34       Quote                    Range G
   // 35-37    Hash, dollar, percent    Range H
   // 38       Ampersand                Range I
   // 39       Apostrophe               Range J
   // 40-47    Punctuation, etc.        Part of range K
   // 58-59    Punctuation, etc.        Part of range K
   // 60       Less-than                Range L
   // 61       Equals                   Range M
   // 62       Greater-than             Range N
   // 63-64    Question, at-sign        Part of range O
   // 91-96    Punctuation, etc.        Part of range O
   // 123-127  Punctuation              Part of range O
   //
   // And the following characters are expected to be encountered the least:
   //
   // 128+     High characters          Range P
   //
   //
   // See:
   // http://www.w3.org/TR/REC-xml
   // http://www.jimprice.com/ascii-0-127.gif

   //-------------------------------------------------------------------------
   // Class functions
   //-------------------------------------------------------------------------

   /**
    * Retrieves an <code>XMLEncoder</code> for the specified encoding. If no
    * suitable instance can be returned, then an exception is thrown.
    *
    * @param encoding
    *    the name of the encoding, not <code>null</code>.
    *
    * @return
    *    an <code>XMLEncoder</code> instance that matches the specified
    *    encoding, never <code>null</code>.
    *
    * @throws IllegalArgumentException
    *    if <code>encoding == null</code>.
    *
    * @throws UnsupportedEncodingException
    *    if the specified encoding is not supported.
    */
   public static final XMLEncoder getEncoder(String encoding)
   throws IllegalArgumentException, UnsupportedEncodingException {
      return new XMLEncoder(encoding);
   }


   //-------------------------------------------------------------------------
   // Class fields
   //-------------------------------------------------------------------------

   /**
    * The first part of a declaration, before the encoding.
    */
   private static final char[] DECLARATION_START = "<?xml version=\"1.0\" encoding=\"".toCharArray();

   /**
    * The length of <code>DECLARATION_START</code>.
    */
   private static final int DECLARATION_START_LENGTH = DECLARATION_START.length;

   /**
    * The last part of a declaration, after the encoding.
    */
   private static final char[] DECLARATION_END = "\"?>".toCharArray();

   /**
    * The length of <code>DECLARATION_END</code>.
    */
   private static final int DECLARATION_END_LENGTH = DECLARATION_END.length;

   /**
    * Character array representing the string <code>"&amp;gt;"</code>.
    */
   private static final char[] ESC_GREATER_THAN = new char[] { '&', 'g', 't', ';' };

   /**
    * Character array representing the string <code>"&amp;lt;"</code>.
    */
   private static final char[] ESC_LESS_THAN = new char[] { '&', 'l', 't', ';' };

   /**
    * Character array representing the string <code>"&amp;amp;"</code>.
    */
   private static final char[] ESC_AMPERSAND = new char[] { '&', 'a', 'm', 'p', ';' };

   /**
    * Character array representing the string <code>"&amp;apos;"</code>.
    */
   private static final char[] ESC_APOSTROPHE = new char[] { '&', 'a', 'p', 'o', 's', ';' };

   /**
    * Character array representing the string <code>"&amp;quot;"</code>.
    */
   private static final char[] ESC_QUOTE = new char[] { '&', 'q', 'u', 'o', 't', ';' };

   /**
    * Character array representing the string <code>"&amp;#"</code>.
    */
   private static final char[] AMPERSAND_HASH = new char[] { '&', '#' };

   /**
    * Character array representing the string <code>"='"</code>.
    */
   private static final char[] EQUALS_APOSTROPHE = new char[] { '=', '\'' };

   /**
    * Character array representing the string <code>"=\""</code>.
    */
   private static final char[] EQUALS_QUOTE = new char[] { '=', '"' };

   /**
    * Prefix shared by the names of all ISO-8859 encodings.
    */
   private static final String ISO_8859_PREFIX = "ISO-8859-";

   /**
    * Marker passed to the escaping helpers when neither the quote nor the
    * apostrophe needs to be escaped (that is, outside attribute values).
    */
   private static final int NO_QUOTE = -1;


   //-------------------------------------------------------------------------
   // Fields
   //-------------------------------------------------------------------------

   /**
    * The name of the encoding. Cannot be <code>null</code>.
    */
   private final String _encoding;

   /**
    * The name of the encoding as a character array. Cannot be
    * <code>null</code>.
    */
   private final char[] _encodingCharArray;

   /**
    * Flag that indicates whether the encoding is treated as a 7-bit
    * (ISO 646-based) encoding. The value is <code>true</code> for
    * <code>US-ASCII</code>, <code>ASCII</code> and the <code>ISO-8859</code>
    * encodings, in which case all characters above 127 are written as
    * numeric character references, or <code>false</code> if the encoding
    * supports all Unicode characters.
    */
   private final boolean _sevenBitEncoding;


   //-------------------------------------------------------------------------
   // Constructor
   //-------------------------------------------------------------------------

   /**
    * Constructs a new <code>XMLEncoder</code> instance.
    *
    * @param encoding
    *    the name of the encoding, not <code>null</code>; it is matched
    *    case-insensitively against the supported encodings.
    *
    * @throws IllegalArgumentException
    *    if <code>encoding == null</code>.
    *
    * @throws UnsupportedEncodingException
    *    if the specified encoding is not supported.
    *
    * @deprecated
    *    Deprecated since xmlenc 0.47.
    *    Use the factory method {@link #getEncoder(String)} instead.
    */
   public XMLEncoder(String encoding)
   throws IllegalArgumentException, UnsupportedEncodingException {

      // Check argument
      if (encoding == null) {
         throw new IllegalArgumentException("encoding == null");
      }

      // The comparisons below are deliberately done without
      // String.toUpperCase(): that method depends on the default locale, so
      // under e.g. a Turkish locale "iso-8859-1" would become uppercase with
      // a dotted capital I and a supported encoding would be rejected.

      // Check if the encoding supports all Unicode characters
      if (encoding.equalsIgnoreCase("UTF-8") || encoding.equalsIgnoreCase("UTF-16")) {
         _sevenBitEncoding = false;

      // Check if this is treated as an ISO 646-based character set
      } else if (encoding.equalsIgnoreCase("US-ASCII")
              || encoding.equalsIgnoreCase("ASCII")
              || encoding.regionMatches(true, 0, ISO_8859_PREFIX, 0, ISO_8859_PREFIX.length())) {
         _sevenBitEncoding = true;

      // Otherwise fail
      } else {
         throw new UnsupportedEncodingException(encoding);
      }

      // Store encoding literally as passed
      _encoding          = encoding;
      _encodingCharArray = encoding.toCharArray();
   }


   //-------------------------------------------------------------------------
   // Methods
   //-------------------------------------------------------------------------

   /**
    * Returns the encoding.
    *
    * @return
    *    the encoding passed to the constructor, never <code>null</code>.
    */
   public String getEncoding() {
      return _encoding;
   }

   /**
    * Writes an XML declaration, using XML version 1.0 and the encoding name
    * exactly as it was passed to the constructor.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @throws NullPointerException
    *    if <code>out == null</code>.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void declaration(Writer out)
   throws NullPointerException, IOException {
      out.write(DECLARATION_START, 0, DECLARATION_START_LENGTH);
      out.write(_encodingCharArray);
      out.write(DECLARATION_END, 0, DECLARATION_END_LENGTH);
   }

   /**
    * Writes the specified text. Any characters that cannot be written
    * literally in this encoding will be escaped.
    *
    * <p>It must be specified whether ampersands should be escaped. Unless
    * ampersands are escaped, entity references can be written.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param text
    *    the text to be written, not <code>null</code>.
    *
    * @param escapeAmpersands
    *    flag that indicates whether ampersands should be escaped.
    *
    * @throws NullPointerException
    *    if <code>out == null || text == null</code>.
    *
    * @throws InvalidXMLException
    *    if the specified text contains an invalid character.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void text(Writer out, String text, boolean escapeAmpersands)
   throws NullPointerException, InvalidXMLException, IOException {
      text(out, text.toCharArray(), 0, text.length(), escapeAmpersands);
   }

   /**
    * Writes text from the specified character array. Any characters that
    * cannot be written literally in this encoding will be escaped.
    *
    * <p>It must be specified whether ampersands should be escaped. Unless
    * ampersands are escaped, entity references can be written.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param ch
    *    the character array from which to retrieve the text to be written,
    *    not <code>null</code>.
    *
    * @param start
    *    the start index into <code>ch</code>, must be &gt;= 0.
    *
    * @param length
    *    the number of characters to take from <code>ch</code>, starting at
    *    the <code>start</code> index.
    *
    * @param escapeAmpersands
    *    flag that indicates if ampersands should be escaped.
    *
    * @throws NullPointerException
    *    if <code>out == null || ch == null</code>.
    *
    * @throws IndexOutOfBoundsException
    *    if <code>start &lt; 0
    *          || start + length &gt; ch.length</code>; this may not be
    *    checked before the character stream is written to, so this may
    *    cause a <em>partial</em> failure.
    *
    * @throws InvalidXMLException
    *    if the specified text contains an invalid character; characters
    *    preceding the invalid one may already have been written.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void text(Writer  out,
                    char[]  ch,
                    int     start,
                    int     length,
                    boolean escapeAmpersands)
   throws NullPointerException,
          IndexOutOfBoundsException,
          InvalidXMLException,
          IOException {
      writeEscaped(out, ch, start, start + length, escapeAmpersands, NO_QUOTE);
   }

   /**
    * Writes the specified character. If the character cannot be written
    * literally in this encoding, then it will be escaped. Ampersands are
    * never escaped by this method.
    *
    * <p>Since a single <code>char</code> cannot hold a surrogate pair, a
    * surrogate passed to this method is not combined with its counterpart.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param c
    *    the character to be written.
    *
    * @throws InvalidXMLException
    *    if the specified character is invalid.
    *
    * @throws IOException
    *    if an I/O error occurs.
    *
    * @deprecated
    *    Deprecated since xmlenc 0.51.
    *    Use the text method {@link #text(Writer, char, boolean)} instead.
    */
   public void text(Writer out, char c) throws InvalidXMLException, IOException {
      if (needsEscaping(c, false, NO_QUOTE)) {
         writeEscape(out, c);
      } else {
         out.write(c);
      }
   }

   /**
    * Writes the specified character. If the character cannot be written
    * literally in this encoding, then it will be escaped.
    *
    * <p>Since a single <code>char</code> cannot hold a surrogate pair, a
    * surrogate passed to this method is not combined with its counterpart.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param c
    *    the character to be written.
    *
    * @param escapeAmpersands
    *    flag that indicates if ampersands should be escaped; if
    *    <code>true</code> an ampersand is written as
    *    <code>"&amp;amp;"</code>, otherwise it is written as is.
    *
    * @throws InvalidXMLException
    *    if the specified character is invalid.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void text(Writer out, char c, boolean escapeAmpersands)
   throws InvalidXMLException, IOException {
      // An ampersand is written raw only when escaping was NOT requested,
      // consistent with the array-based text method and with attribute(...)
      if (needsEscaping(c, escapeAmpersands, NO_QUOTE)) {
         writeEscape(out, c);
      } else {
         out.write(c);
      }
   }

   /**
    * Writes the specified whitespace string.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param s
    *    the character string to be written, not <code>null</code>.
    *
    * @throws NullPointerException
    *    if <code>out == null || s == null</code>.
    *
    * @throws InvalidXMLException
    *    if the specified character string contains a character that is
    *    invalid as whitespace.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void whitespace(Writer out, String s)
   throws NullPointerException, InvalidXMLException, IOException {
      char[] ch = s.toCharArray();
      whitespace(out, ch, 0, ch.length);
   }

   /**
    * Writes whitespace from the specified character array. The characters
    * are first passed to <code>XMLChecker.checkS</code> and then written
    * unchanged in a single call.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param ch
    *    the character array from which to retrieve the text to be written,
    *    not <code>null</code>.
    *
    * @param start
    *    the start index into <code>ch</code>, must be &gt;= 0.
    *
    * @param length
    *    the number of characters to take from <code>ch</code>, starting at
    *    the <code>start</code> index.
    *
    * @throws NullPointerException
    *    if <code>out == null || ch == null</code>.
    *
    * @throws IndexOutOfBoundsException
    *    if <code>start &lt; 0
    *          || start + length &gt; ch.length</code>; this may not be
    *    checked before the character stream is written to, so this may
    *    cause a <em>partial</em> failure.
    *
    * @throws InvalidXMLException
    *    if the specified character array contains a character that is invalid
    *    as whitespace.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void whitespace(Writer out,
                          char[] ch,
                          int    start,
                          int    length)
   throws NullPointerException,
          IndexOutOfBoundsException,
          InvalidXMLException,
          IOException {

      // Check the string
      XMLChecker.checkS(ch, start, length);

      // Write the complete character string at once
      out.write(ch, start, length);
   }

   /**
    * Writes an attribute assignment: a space, the attribute name, an equals
    * sign and the escaped attribute value enclosed in the specified
    * quotation mark.
    *
    * <p>Within the value, only the character that is used as the quotation
    * mark is escaped; the other one (quote or apostrophe) is written as is.
    * The attribute name is written without any checking or escaping.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param name
    *    the name of the attribute, not <code>null</code>.
    *
    * @param value
    *    the value of the attribute, not <code>null</code>.
    *
    * @param quotationMark
    *    the quotation mark, must be either the apostrophe (<code>'\''</code>)
    *    or the quote character (<code>'"'</code>).
    *
    * @param escapeAmpersands
    *    flag that indicates if ampersands in the value should be escaped.
    *
    * @throws NullPointerException
    *    if <code>out == null || name == null || value == null</code>.
    *
    * @throws IllegalArgumentException
    *    if <code>quotationMark != '\'' &amp;&amp; quotationMark != '"'</code>;
    *    nothing has been written in this case.
    *
    * @throws InvalidXMLException
    *    if the specified value contains an invalid character; the part of
    *    the attribute preceding that character may already have been
    *    written.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   public void attribute(Writer  out,
                         String  name,
                         String  value,
                         char    quotationMark,
                         boolean escapeAmpersands)
   throws NullPointerException,
          IllegalArgumentException,
          InvalidXMLException,
          IOException {

      // Fails with a NullPointerException for a null value before anything
      // has been written
      char[] ch = value.toCharArray();

      // Validate the quotation mark and pick the matching "=<mark>" sequence
      char[] equalsAndMark;
      if (quotationMark == '"') {
         equalsAndMark = EQUALS_QUOTE;
      } else if (quotationMark == '\'') {
         equalsAndMark = EQUALS_APOSTROPHE;
      } else {
         String error = "Character 0x"
                      + Integer.toHexString((int) quotationMark)
                      + " ('"
                      + quotationMark
                      + "') is not a valid quotation mark.";
         throw new IllegalArgumentException(error);
      }

      out.write(' ');
      out.write(name);
      out.write(equalsAndMark, 0, 2);
      writeEscaped(out, ch, 0, ch.length, escapeAmpersands, quotationMark);
      out.write(quotationMark);
   }

   /**
    * Determines whether the specified character cannot be written literally
    * and must be handed to {@link #writeEscape(Writer, int)} instead. This
    * includes characters that are not allowed at all, for which
    * <code>writeEscape</code> throws an exception.
    *
    * @param c
    *    the character (UTF-16 code unit) to examine.
    *
    * @param escapeAmpersands
    *    flag that indicates if ampersands should be escaped.
    *
    * @param quoteToEscape
    *    the quotation character (<code>'"'</code> or <code>'\''</code>) that
    *    must be escaped, or {@link #NO_QUOTE} if neither must be.
    *
    * @return
    *    <code>true</code> if the character must not be written literally,
    *    <code>false</code> if it can be written as is.
    */
   private boolean needsEscaping(int c, boolean escapeAmpersands, int quoteToEscape) {

      // High characters: only representable literally in full-Unicode encodings
      if (c > 127) {
         return _sevenBitEncoding;
      }

      // Range 63-127 (letters and most punctuation) is by far the most common
      if (c >= 63) {
         return false;
      }

      switch (c) {
         case '<':
         case '>':
            return true;
         case '&':
            return escapeAmpersands;
         case '"':
         case '\'':
            return c == quoteToEscape;
         case 9:
         case 10:
         case 13:
            return false;
         default:
            // 32-62 are normal characters; everything below 32 that is not
            // tab, linefeed or carriage return is not allowed in XML 1.0
            return c < 32;
      }
   }

   /**
    * Writes the escaped form of a character for which
    * {@link #needsEscaping(int, boolean, int)} returned <code>true</code>.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param c
    *    the character or, for a combined surrogate pair, the code point to
    *    escape.
    *
    * @throws InvalidXMLException
    *    if the character is a control character that is not allowed.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   private void writeEscape(Writer out, int c)
   throws InvalidXMLException, IOException {
      switch (c) {
         case '<':
            out.write(ESC_LESS_THAN, 0, 4);
            return;
         case '>':
            out.write(ESC_GREATER_THAN, 0, 4);
            return;
         case '"':
            out.write(ESC_QUOTE, 0, 6);
            return;
         case '\'':
            out.write(ESC_APOSTROPHE, 0, 6);
            return;
         case '&':
            out.write(ESC_AMPERSAND, 0, 5);
            return;
         default:
            if (c > 127) {
               // Decimal numeric character reference
               out.write(AMPERSAND_HASH, 0, 2);
               out.write(Integer.toString(c));
               out.write(';');
            } else {
               throw new InvalidXMLException("The character 0x" + Integer.toHexString(c) + " is not valid.");
            }
      }
   }

   /**
    * Writes the characters <code>ch[start]</code> up to (not including)
    * <code>ch[end]</code>, escaping where needed. Runs of characters that
    * need no escaping are written with a single call.
    *
    * <p>When a character must be written as a numeric character reference
    * and it is a high surrogate directly followed by a low surrogate, the
    * pair is combined into one reference to the supplementary code point,
    * since XML 1.0 does not allow references to individual surrogates.
    *
    * @param out
    *    the character stream to write to, not <code>null</code>.
    *
    * @param ch
    *    the characters to write, not <code>null</code>.
    *
    * @param start
    *    the index of the first character to write.
    *
    * @param end
    *    the index after the last character to write.
    *
    * @param escapeAmpersands
    *    flag that indicates if ampersands should be escaped.
    *
    * @param quoteToEscape
    *    the quotation character (<code>'"'</code> or <code>'\''</code>) that
    *    must be escaped, or {@link #NO_QUOTE} if neither must be.
    *
    * @throws InvalidXMLException
    *    if an invalid character is found; the characters before it have
    *    been written already.
    *
    * @throws IOException
    *    if an I/O error occurs.
    */
   private void writeEscaped(Writer  out,
                             char[]  ch,
                             int     start,
                             int     end,
                             boolean escapeAmpersands,
                             int     quoteToEscape)
   throws InvalidXMLException, IOException {

      // The position after the last escaped character
      int lastEscaped = start;

      for (int i = start; i < end; i++) {
         int c = (int) ch[i];
         if (!needsEscaping(c, escapeAmpersands, quoteToEscape)) {
            continue;
         }

         // Flush the unescaped run that precedes this character
         out.write(ch, lastEscaped, i - lastEscaped);

         // Combine a surrogate pair (U+D800-U+DBFF followed by
         // U+DC00-U+DFFF) into the code point it represents and consume the
         // low surrogate as well
         if (c >= 0xD800 && c <= 0xDBFF && i + 1 < end) {
            int low = (int) ch[i + 1];
            if (low >= 0xDC00 && low <= 0xDFFF) {
               c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
               i++;
            }
         }

         writeEscape(out, c);
         lastEscaped = i + 1;
      }

      // Write whatever follows the last escaped character
      out.write(ch, lastEscaped, end - lastEscaped);
   }
}