需要解决的问题是 根据一输入流读取一段XML内容,然后对其进行过滤截取,最后写回输出流中。具体说明如下:
1.对XML根据特定需求,过滤标签(如SCRIPT,FRAME等不允许输出的HTML标签),过滤属性(如onclick,onblur等事件属性)
2.对XML进行长度截取,具体做法如下:
(1)对start标签的处理: 若加上start标签长度后超过最大允许长度,则去除该标签,且同时去除后面和该标签同一等级的所有标签。
(2)对text内容的处理:若加上text内容的长度后超过最大允许的长度,则从中截取text长度,并加上省略号......
(3)对end标签内容的处理:不做长度截取,且要做到自动补齐end标签。
有关SAX的详细介绍,请查看最好的参考资料
http://www.saxproject.org/ 。其中有一个很重要的类 DefaultHandler, 该类中的startElement, endElement, characters 3个方法尤为重要。 为解决上述问题,需要设计2个类:HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子类,HTMLWriter继承了DefaultHandler,其中最为关键的是要重写上述3个关键方法。
一.HTMLWriter类的代码:
这个类主要用于写操作,最重要是理解变量strippedElementLevel 的用法。上面问题的具体业务逻辑处理(标签的过滤和长度截取)将在子类HTMLFilter 解决。
package org.util.sax.html;

import java.io.*;
import java.util.Locale;
import openxml.parser.HTMLSAXParser;
import openxml.parser.HTMLdtd;
import org.xml.sax.Attributes;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;
public class HTMLWriter extends DefaultHandler implements LexicalHandler {
private ErrorHandler errorHandler;
private Writer out;
private int strippedElementLevel = 0; //用来作为start标签和end标签成对出现的标记(极为重要),具体算法思路类似于堆栈
private boolean inRawElement;
public void filter(String htmlContent) throws IOException, SAXException {
filter(new StringReader(htmlContent));
}
public void filter(Reader in) throws IOException, SAXException {
filter(new InputSource(in));
}
public void filter(InputSource in) throws IOException, SAXException {
HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);
parser.setLexicalHandler(this);
XMLReader htmlReader = new HTMLParserAdapter(parser);
htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);
htmlReader.setContentHandler(this);
prepare();
htmlReader.parse(in);
}
protected void prepare() {
if (out == null) {
out = new StringWriter();
}
}
public void setErrorHandler(ErrorHandler errorHandler) {
this.errorHandler = errorHandler;
}
public void setOut(Writer out) {
this.out = out;
}
public Writer getOut() {
return out;
}
public String getResultAsString() {
if (out instanceof StringWriter) {
return out.toString();
}
throw new IllegalStateException("Not a buffered target");
}
@Override
public void startDocument() throws SAXException {
prepare();
}
@Override
public final void startElement(String namespaceURI,
String localName,
String qName,
Attributes attrs) throws SAXException {
if (strippedElementLevel > 0) {
strippedElementLevel++;
return;
}
// features/namespace is false
if (!startTag(qName, attrs)) {
strippedElementLevel = 1;
}
}
@Override
public final void endElement(String namespaceURI,
String localName,
String qName) throws SAXException {
if (strippedElementLevel > 0) {
strippedElementLevel--;
return;
}
// features/namespace is false
endTag(qName);
}
protected boolean startTag(String tagName, Attributes attrs) throws SAXException {
String tagUpper = tagName.toUpperCase();
inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);
write('<');
write(tagName);
for (int i = 0; i < attrs.getLength(); i++) {
// features/namespace is false
String attrName = attrs.getQName(i);
attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));
}
write('>');
return true;
}
protected void endTag(String tagName) throws SAXException {
inRawElement = false;
if (!isEmptyTag(tagName.toUpperCase())) {
write("</");
write(tagName);
write('>');
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (strippedElementLevel != 0) {
return;
}
if (inRawElement) {
write(ch, start, length);
return;
}
text(ch, start, length);
}
protected void text(char[] ch, int start, int length) throws SAXException {
writeText(ch, start, length);
}
public void startDTD(String tagName, String publicId, String systemId) throws SAXException {
write("<!DOCTYPE ");
write(tagName);
write(" PUBLIC ");
write('"');
write(publicId);
write('"');
write('>');
}
public void endDTD() {}
public void startEntity(String name) {}
public void endEntity(String name) {}
public void startCDATA() {}
public void endCDATA() {}
public void comment(char ch[], int start, int length) throws SAXException {
/**//*
if (strippedElementLevel == 0) {
write("<!--");
write(ch, start, length);
write("-->");
}
*/
}
@Override
public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {
if (strippedElementLevel == 0) {
write(ch, start, length);
}
}
protected void attribute(final String tagUpper, // 规范化的 TAG 名称 - 使用大写字母
final String attrLower, // 规范化的 属性 名称 - 使用小写字母
String attrName,
String attrValue) throws SAXException {
write(' ');
write(attrName);
if (!isBoolean(attrLower, tagUpper)) {
write('=');
write('"');
for (int i = 0; i < attrValue.length(); i++) {
writeEncoded(attrValue.charAt(i), true);
}
write('"');
}
}
protected final void writeText(char[] ch, int start, int length) throws SAXException {
writeTextWithEnd(ch, start, start + length);
}
protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException {
for (int i = begin; i < end; i++) {
writeEncoded(ch[i], false);
}
}
protected void writeEncoded(char c, boolean isAttr) throws SAXException {
switch (c) {
case '<':
write("<");
break;
case '>':
write(">");
break;
case '&':
write("&");
break;
case 0xa0: // NBSP
// 暂时只特殊处理特殊字符 NBSP
// 当组信 NBSP 在转换到纯文本时可变成空格
// 但其它特殊字符没有简单的Ascii字符可替代, 因而这里也不执行替代
write(" ");
break;
case '"':
if (isAttr) {
write(""");
break;
}
default:
write(c);
}
}
protected void write(char c) throws SAXException {
try {
out.write(c);
} catch (IOException e) {
throw new SAXException(e);
}
}
protected void write(char ch[], int start, int length) throws SAXException {
try {
out.write(ch, start, length);
} catch (IOException e) {
throw new SAXException(e);
}
}
protected void write(String s) throws SAXException {
try {
out.write(s);
} catch (IOException e) {
throw new SAXException(e);
}
}
private static boolean isBoolean(String attrLower, String tagUpper) {
return HTMLdtd.isBoolean(attrLower, tagUpper);
}
private static boolean isEmptyTag(String tagUpper) {
return HTMLdtd.isEmptyTag(tagUpper);
}
}
二. HTMLFilter 类的代码:
主要解决标签过滤,即哪些标签和属性需要过滤,解决长度截取问题,即断点出现在startTag,text,endTag的情况应该如何解决。
主要理解重写父类HTMLWriter的几个方法:startTag(),characters(),comment(),attribute()。另外需要一个成员变量currentLen记录当前已写入的长度,每次调用write()方法时都要对currentLen变量进行累加。
package org.util.sax.html;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import java.util.Map;
import java.io.Writer;
import java.io.CharArrayWriter;
import java.io.IOException;
/**
 * {@link HTMLWriter} that filters out unwanted tags and attributes and limits the number of
 * characters written.
 *
 * <p>Length limiting works as follows:
 * <ul>
 *   <li>a start tag that would exceed the limit is dropped, and so is everything after it;</li>
 *   <li>text that would exceed the limit is cut and followed by an ellipsis;</li>
 *   <li>end tags are never cut, so every element that was opened is also closed.</li>
 * </ul>
 */
public class HTMLFilter extends HTMLWriter {

    /** Marker written at the point where the output is cut off. */
    private static final String ELLIPSIS = "...";

    ConfigManager conf = CM.getConfig();

    /** Maps a content id ({@code cid}) to the file name of the inline image it refers to. */
    Map<String, String> cidMap;

    /** Number of characters written through the {@code write} overloads so far. */
    private int currentLen;

    /** Maximum number of characters allowed in the output. */
    private int maxLen;

    /** Set once the output has been cut; all later start tags, text and comments are dropped. */
    private boolean ignore = false;

    /**
     * @param map                  cid to image file name mapping; may be {@code null}
     * @param allowMessage_BodyLen maximum number of characters to write
     */
    public HTMLFilter(Map<String, String> map, int allowMessage_BodyLen) {
        this.maxLen = allowMessage_BodyLen;
        this.cidMap = map;
    }

    /**
     * Writes the start tag if it is allowed and still fits into the remaining budget.
     *
     * <p>The tag is first rendered into a scratch buffer so its exact length is known before
     * anything reaches the real output.
     *
     * @return {@code false} if the tag was filtered or dropped for length, which makes the
     *         parent strip the whole element
     */
    @Override
    protected boolean startTag(String tagName, Attributes attrs) throws SAXException {
        if (ignore || !isTagAllowed(tagName, attrs)) {
            return false;
        }
        int remaining = getRemainChars();
        if (remaining <= 0) {
            truncate();
            return false;
        }

        Writer target = getOut();
        int lenBefore = currentLen;
        CharArrayWriter buffer = new CharArrayWriter();
        boolean rendered;
        setOut(buffer);
        try {
            rendered = super.startTag(tagName, attrs);
        } finally {
            setOut(target);
        }

        // "<=": a tag is dropped only when it would exceed the limit, not when it exactly fits.
        if (rendered && buffer.size() <= remaining) {
            try {
                // currentLen was already advanced while rendering into the buffer.
                buffer.writeTo(target);
            } catch (IOException e) {
                throw new SAXException(e);
            }
            return true;
        }

        // The tag never reached the output, so take its length out of the count again.
        currentLen = lenBefore;
        truncate();
        return false;
    }

    /**
     * Writes text, cutting it (and appending an ellipsis) when it does not fit into the
     * remaining budget.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (ignore) {
            return;
        }
        int remaining = getRemainChars();
        if (remaining <= 0) {
            truncate();
            return;
        }
        if (remaining >= length) {
            super.characters(ch, start, length);
            return;
        }

        int cut = remaining;
        // Do not cut between the two halves of a surrogate pair.
        if (Character.isHighSurrogate(ch[start + cut - 1])) {
            cut--;
        }
        int lenBefore = currentLen;
        super.characters(ch, start, cut);
        if (cut > 0 && currentLen == lenBefore) {
            // The parent wrote nothing, i.e. this text belongs to a stripped element
            // (for example a filtered <a> or <script>). That is not a truncation.
            return;
        }
        truncate();
    }

    /** End tags are never length-limited, so open elements are always closed. */
    @Override
    protected void endTag(String tagName) throws SAXException {
        super.endTag(tagName);
    }

    /**
     * Passes a comment to the parent, limited to the remaining budget. The output is marked
     * as truncated only if the parent actually wrote part of a comment that had to be cut.
     */
    @Override
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignore) {
            return;
        }
        int remaining = getRemainChars();
        if (remaining <= 0) {
            // Nothing fits any more; the next start tag or text run will flag the truncation.
            return;
        }
        boolean cut = remaining < length;
        int lenBefore = currentLen;
        super.comment(ch, start, cut ? remaining : length);
        if (cut && currentLen != lenBefore) {
            truncate();
        }
    }

    /**
     * Writes an attribute after filtering:
     * <ul>
     *   <li>attributes whose name starts with {@code on} (event handlers) are dropped;</li>
     *   <li>an {@code IMG src} containing {@code cid:} is replaced by {@code #{fileName}},
     *       using {@link #cidMap}, or by <code>#{ }</code> when the cid is unknown;</li>
     *   <li>a value starting with {@code javascript:} is replaced by an empty string.</li>
     * </ul>
     */
    @Override
    protected void attribute(final String tagUpper,
                             final String attrLower,
                             final String attrName,
                             String attrValue) throws SAXException {
        if (attrLower.startsWith("on")) {
            return;
        }
        if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src")) {
            String trimmed = attrValue.trim();
            int cidIdx = trimmed.indexOf("cid:");
            if (cidIdx != -1) {
                String cid = trimmed.substring(cidIdx + 4);
                String photoName = (cidMap == null) ? null : cidMap.get(cid);
                String placeholder = "#{" + (photoName != null ? photoName : " ") + "}";
                super.attribute(tagUpper, attrLower, attrName, placeholder);
                return;
            }
        }
        super.attribute(tagUpper, attrLower, attrName, transformScript(attrValue));
    }

    /** Returns an empty string for values that start with {@code javascript:}, else the value. */
    private String transformScript(final String data) {
        if (mySubstringAfterIgnoreCase(data.trim(), "javascript:") != null) {
            return "";
        }
        return data;
    }

    /**
     * Decides whether an element may be written. Rejected: SCRIPT, A (hyperlinks are not
     * pushed), PARAM with name "movie" or "src", LINK rel="stylesheet", FRAME and FRAMESET.
     * All comparisons ignore case.
     */
    protected boolean isTagAllowed(String tagName, Attributes attrs) {
        if (tagName.equalsIgnoreCase("SCRIPT")) {
            return false;
        }
        if (tagName.equalsIgnoreCase("A")) {
            return false;
        }
        if (tagName.equalsIgnoreCase("PARAM")) {
            String name = getAttrIgnoreCase(attrs, "name");
            if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name)) {
                return false;
            }
        }
        if (tagName.equalsIgnoreCase("LINK")
                && "stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel"))) {
            return false;
        }
        if (tagName.equalsIgnoreCase("FRAME") || tagName.equalsIgnoreCase("FRAMESET")) {
            return false;
        }
        return true;
    }

    /** Returns the value of the first attribute whose qName matches, ignoring case, or null. */
    private static String getAttrIgnoreCase(Attributes attrs, String name) {
        for (int i = 0, len = attrs.getLength(); i < len; i++) {
            if (name.equalsIgnoreCase(attrs.getQName(i))) {
                return attrs.getValue(i);
            }
        }
        return null;
    }

    /**
     * Tests whether {@code source} starts with {@code prefix}, ignoring case and skipping any
     * control characters (below U+0020) in {@code source}, and returns the part after the match.
     *
     * <p>Control characters are skipped to counter an IE security hole.
     *
     * @param source the string to examine
     * @param prefix the prefix to look for
     * @return the remainder of {@code source} after the prefix, or {@code null} if it does not
     *         match; an empty prefix matches and returns {@code source} unchanged
     */
    static String mySubstringAfterIgnoreCase(String source, String prefix) {
        int sourceLength = source.length();
        int targetLength = prefix.length();
        if (targetLength == 0) {
            return source;
        }
        if (sourceLength < targetLength) {
            return null;
        }
        int targetOffset = 0;
        char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));
        for (int sourceOffset = 0; sourceOffset < sourceLength; sourceOffset++) {
            char c = source.charAt(sourceOffset);
            if (c < ' ') {
                // skip control characters
                continue;
            }
            if (Character.toUpperCase(c) != targetChar) {
                break;
            }
            targetOffset++;
            if (targetOffset == targetLength) {
                return source.substring(sourceOffset + 1);
            }
            targetChar = Character.toUpperCase(prefix.charAt(targetOffset));
        }
        return null;
    }

    /** Marks the output as cut off and writes the ellipsis marker. */
    private void truncate() throws SAXException {
        ignore = true;
        write(ELLIPSIS);
    }

    @Override
    protected void write(char c) throws SAXException {
        super.write(c);
        currentLen++;
    }

    @Override
    protected void write(char[] ch, int start, int length) throws SAXException {
        super.write(ch, start, length);
        currentLen += length;
    }

    @Override
    protected void write(String s) throws SAXException {
        super.write(s);
        currentLen += s.length();
    }

    /**
     * Returns how many more characters may be written. The value can be negative once the
     * limit has been passed (entity encoding and the ellipsis are not pre-budgeted).
     */
    protected int getRemainChars() {
        return maxLen - currentLen;
    }
}