需要解决的问题是:从一个输入流中读取一段XML内容,对其进行过滤和截取,最后写回输出流中。具体说明如下:
1.对XML根据特定需求,过滤标签(如SCRIPT,FRAME等非标准HTML标签),过滤属性(如onclick,onblur等)
2.对XML进行长度截取,具体做法如下:
(1)对start标签的处理: 若加上start标签长度后超过最大允许长度,则去除该标签,且同时去除后面和该标签同一等级的所有标签。
(2)对text内容的处理:若加上text内容的长度后超过最大允许的长度,则只保留text中尚能容纳的部分,并在其后加上省略号......
(3)对end标签内容的处理:不做长度截取,且要做到自动补齐end标签。
有关SAX的详细介绍,请查看最好的参考资料
http://www.saxproject.org/ 。其中有一个很重要的类 DefaultHandler, 该类中的startElement, endElement, characters 3个方法尤为重要。 为解决上述问题,需要设计2个类:HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子类,HTMLWriter继承了DefaultHandler,其中最为关键的是要重写上述3个关键方法。
一.HTMLWriter类的代码:
这个类主要用于写操作,最重要是理解变量strippedElementLevel 的用法。上面问题的具体业务逻辑处理(标签的过滤和长度截取)将在子类HTMLFilter 解决。
package org.util.sax.html

import openxml.parser.HTMLdtd;
import openxml.parser.HTMLSAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.ErrorHandler;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

import java.io.*;




public class HTMLWriter extends DefaultHandler implements LexicalHandler
{

private ErrorHandler errorHandler;

private Writer out;

private int strippedElementLevel = 0; //用来作为start标签和end标签成对出现的标记(极为重要),具体算法思路类似于堆栈
private boolean inRawElement;



public void filter(String htmlContent) throws IOException, SAXException
{
filter(new StringReader(htmlContent));
}


public void filter(Reader in) throws IOException, SAXException
{
filter(new InputSource(in));
}


public void filter(InputSource in) throws IOException, SAXException
{
HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);
parser.setLexicalHandler(this);

XMLReader htmlReader = new HTMLParserAdapter(parser);
htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);
htmlReader.setContentHandler(this);

prepare();
htmlReader.parse(in);
}



protected void prepare()
{

if (out == null)
{
out = new StringWriter();
}
}



public void setErrorHandler(ErrorHandler errorHandler)
{
this.errorHandler = errorHandler;
}


public void setOut(Writer out)
{
this.out = out;
}


public Writer getOut()
{
return out;
}


public String getResultAsString()
{

if (out instanceof StringWriter)
{
return out.toString();
}
throw new IllegalStateException("Not a buffered target");
}


@Override

public void startDocument() throws SAXException
{
prepare();
}


@Override
public final void startElement(String namespaceURI,
String localName,
String qName,

Attributes attrs) throws SAXException
{

if (strippedElementLevel > 0)
{
strippedElementLevel++;
return;
}

// features/namespace is false

if (!startTag(qName, attrs))
{
strippedElementLevel = 1;
}
}


@Override
public final void endElement(String namespaceURI,
String localName,

String qName) throws SAXException
{

if (strippedElementLevel > 0)
{
strippedElementLevel--;
return;
}

// features/namespace is false
endTag(qName);
}



protected boolean startTag(String tagName, Attributes attrs) throws SAXException
{

String tagUpper = tagName.toUpperCase();

inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);

write('<');
write(tagName);

for (int i = 0; i < attrs.getLength(); i++)
{
// features/namespace is false
String attrName = attrs.getQName(i);
attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));
}
write('>');

return true;
}



protected void endTag(String tagName) throws SAXException
{
inRawElement = false;

if (!isEmptyTag(tagName.toUpperCase()))
{
write("</");
write(tagName);
write('>');
}
}


@Override

public void characters(char[] ch, int start, int length) throws SAXException
{

if (strippedElementLevel != 0)
{
return;
}


if (inRawElement)
{
write(ch, start, length);
return;
}

text(ch, start, length);
}



protected void text(char[] ch, int start, int length) throws SAXException
{
writeText(ch, start, length);
}



public void startDTD(String tagName, String publicId, String systemId) throws SAXException
{
write("<!DOCTYPE ");
write(tagName);
write(" PUBLIC ");
write('"');
write(publicId);
write('"');
write('>');
}



public void endDTD()
{}

public void startEntity(String name)
{}

public void endEntity(String name)
{}

public void startCDATA()
{}

public void endCDATA()
{}


public void comment(char ch[], int start, int length) throws SAXException
{

/**//*
if (strippedElementLevel == 0) {
write("<!--");
write(ch, start, length);
write("-->");
}
*/
}


@Override

public void ignorableWhitespace(char ch[], int start, int length) throws SAXException
{

if (strippedElementLevel == 0)
{
write(ch, start, length);
}
}


protected void attribute(final String tagUpper, // 规范化的 TAG 名称 - 使用大写字母
final String attrLower, // 规范化的 属性 名称 - 使用小写字母
String attrName,

String attrValue) throws SAXException
{
write(' ');
write(attrName);

if (!isBoolean(attrLower, tagUpper))
{
write('=');
write('"');

for (int i = 0; i < attrValue.length(); i++)
{
writeEncoded(attrValue.charAt(i), true);
}
write('"');
}
}



protected final void writeText(char[] ch, int start, int length) throws SAXException
{
writeTextWithEnd(ch, start, start + length);
}



protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException
{

for (int i = begin; i < end; i++)
{
writeEncoded(ch[i], false);
}
}



protected void writeEncoded(char c, boolean isAttr) throws SAXException
{

switch (c)
{
case '<':
write("<");
break;
case '>':
write(">");
break;
case '&':
write("&");
break;
case 0xa0: // NBSP
// 暂时只特殊处理特殊字符 NBSP
// 当组信 NBSP 在转换到纯文本时可变成空格
// 但其它特殊字符没有简单的Ascii字符可替代, 因而这里也不执行替代
write(" ");
break;
case '"':

if (isAttr)
{
write(""");
break;
}
default:
write(c);
}
}


protected void write(char c) throws SAXException
{

try
{
out.write(c);

} catch (IOException e)
{
throw new SAXException(e);
}
}



protected void write(char ch[], int start, int length) throws SAXException
{

try
{
out.write(ch, start, length);

} catch (IOException e)
{
throw new SAXException(e);
}
}


protected void write(String s) throws SAXException
{

try
{
out.write(s);

} catch (IOException e)
{
throw new SAXException(e);
}
}



private static boolean isBoolean(String attrLower, String tagUpper)
{
return HTMLdtd.isBoolean(attrLower, tagUpper);
}


private static boolean isEmptyTag(String tagUpper)
{
return HTMLdtd.isEmptyTag(tagUpper);
}

}

二. HTMLFilter 类的代码:
主要解决标签过滤,即哪些标签和属性需要过滤,解决长度截取问题,即断点出现在startTag,text,endTag的情况应该如何解决。
主要理解重写父类HTMLWriter的几个方法:startTag(),characters(),comment(),attribute()。另外需要一个成员变量currentLen记录当前已写入的长度,每次调用write()方法时都要对currentLen进行累加。
package org.util.sax.html;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import java.util.Map;
import java.io.Writer;
import java.io.CharArrayWriter;
import java.io.IOException;


public class HTMLFilter extends HTMLWriter
{

ConfigManager conf = CM.getConfig();
Map<String, String> cidMap; //cid 和 正文内容图片 filename的 映射
private int currentLen; //当前已经写入out的长度
private int maxLen; //允许push的最大长度
private boolean ignore=false; //当出现要截取时,就设为 true ,意味着如果ignore 为true时, 就以后的内容都要忽略。


public HTMLFilter(Map<String,String> map,int allowMessage_BodyLen)
{
//super.setAllowContentLen(allowMessage_BodyLen);
this.maxLen=allowMessage_BodyLen;
this.cidMap=map;
}

@Override

protected boolean startTag(String tagName, Attributes attrs) throws SAXException
{

if (!isTagAllowed(tagName, attrs))
{
return false;
}


if (ignore)
{
return false;
}

Writer originalOutput = getOut();
int remainChars = getRemainChars();


if(remainChars == 0)
{
ignore = true;
write("
");
return false;
}

CharArrayWriter capturedOutput = new CharArrayWriter();
setOut(capturedOutput);


try
{

if (super.startTag(tagName, attrs))
{

if (capturedOutput.toCharArray().length < remainChars)
{

try
{
originalOutput.write(capturedOutput.toCharArray());
return true;

} catch (IOException e)
{
throw new SAXException(e);
}
}
}

} finally
{
setOut(originalOutput);
}

ignore = true;
write("
");
return false;
}


@Override

public void characters(char[] ch, int start, int length) throws SAXException
{

if (ignore)
{ //如果长度已经超出限制,则不写
return;
}
int remainChars = getRemainChars();


if (remainChars == 0)
{
ignore = true;
write("
");
return;
}


if (remainChars < length)
{ //当将要写入的 text 长度 大于 remainChars 时, 就写入所能够写入的字符,然后添加省略号

ignore = true;
super.characters(ch, start, remainChars);
write("
");

} else
{
super.characters(ch, start, length);
}
}

@Override

protected void endTag(String tagName) throws SAXException
{
super.endTag(tagName);
}


public void comment(char ch[], int start, int length) throws SAXException
{

if(ignore)
{
return;
}
int remainChars = getRemainChars();


if (remainChars == 0)
{
ignore = true;
write("
");
return;
}

if (remainChars < length)
{
ignore=true;
super.comment(ch, start, remainChars);

} else
{
super.comment(ch, start, length);
}

}

@Override
protected void attribute(final String tagUpper,
final String attrLower,
final String attrName,

String attrValue) throws SAXException
{


if (attrLower.startsWith("on"))
{
return;
}

if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src") && attrValue.trim().indexOf("cid:") != -1)
{
attrValue=attrValue.trim();
int cid_idx = attrValue.indexOf("cid:");
String cid = attrValue.substring(cid_idx + 4);
// System.out.println("cid is: "+ cid);
String photoName = cidMap.get(cid);
// System.out.println("photoName is: "+ photoName);

if (photoName != null)
{
super.attribute(tagUpper, attrLower, attrName, "#{" + photoName + "}");

} else
{
super.attribute(tagUpper, attrLower, attrName, "#{" + " " + "}");
}



} else
{
attrValue = transformScript(attrValue);
super.attribute(tagUpper, attrLower, attrName, attrValue);
}
}


private String transformScript(final String data)
{

if (true)
{
final String trimedData = data.trim();
final String scriptData = mySubstringAfterIgnoreCase(trimedData, "javascript:");

if (scriptData != null)
{
return "";
}
}
return data;
}


protected boolean isTagAllowed(String tagName, Attributes attrs)
{

if (tagName.equalsIgnoreCase("SCRIPT"))
{
return false;
}

if(tagName.equalsIgnoreCase("A"))
{ //超链接标签不push
return false;
}

if (tagName.equalsIgnoreCase("PARAM"))
{
String name = getAttrIgnoreCase(attrs, "name");

if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name))
{
return false;
}
}

/**//*
if (tagName.equalsIgnoreCase("STYLE")) {
return false;
}
*/
if (tagName.equalsIgnoreCase("LINK") &&

"stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel")))
{
return false;
}

if (tagName.equals("FRAME") || tagName.equals("FRAMESET"))
{
return false;
}
return true;
}



private static String getAttrIgnoreCase(Attributes attrs, String name)
{

for (int i = 0, len = attrs.getLength(); i < len; i++)
{

if (name.equalsIgnoreCase(attrs.getQName(i)))
{
return attrs.getValue(i);
}
}
return null;
}



/** *//**
* 忽略控制字符后, 判断是否以某字符串开始, 并返回匹配后的截取部分.
* <p/>
* <p/>
* 注: 忽略控制字符是为了对付IE的安全漏洞
*
* @param source 源字符串
* @param prefix 要匹配的前缀字符串
* @return 如果测试成功, 返回截取后的字符串; 否则, 返回 null;
*/

static String mySubstringAfterIgnoreCase(String source, String prefix)
{
int sourceLength = source.length();
int targetLength = prefix.length();


if (sourceLength < targetLength)
{
return null;
}

int sourceOffset = 0;
int targetOffset = 0;
char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));


for (; sourceOffset < sourceLength; sourceOffset++)
{
char c = source.charAt(sourceOffset);

if (c < ' ')
{
// 忽略控制字符
continue;
}


if (Character.toUpperCase(c) != targetChar)
{
break;
}

targetOffset++;

if (targetOffset == targetLength)
{
return source.substring(sourceOffset + 1);
}

targetChar = Character.toUpperCase(prefix.charAt(targetOffset));
}

return null;
}


protected void write(char c) throws SAXException
{
super.write(c);
currentLen++;
}


protected void write(char ch[], int start, int length) throws SAXException
{
super.write(ch, start, length);
currentLen += length;
}


protected void write(String s) throws SAXException
{
super.write(s);
currentLen += s.length();
}


protected int getRemainChars()
{ //求出还剩多少个字符可以写入
return (maxLen - currentLen);
}


}