使用SAX对XML根据具体需求过滤标签和长度截取

需要解决的问题是：从一个输入流中读取一段XML（HTML）内容，对其进行标签过滤和长度截取，最后把结果写回到输出流中。具体说明如下：
1.对XML根据特定需求过滤标签（如SCRIPT、FRAME等不允许出现的HTML标签），并过滤属性（如onclick、onblur等事件属性）。
2.对XML进行长度截取，具体做法如下：
（1）对start标签的处理：若加上start标签长度后超过最大允许长度，则去除该标签，且同时去除后面和该标签同一等级的所有标签。
（2）对text内容的处理：若加上text内容的长度后超过最大允许长度，则只保留text中仍能容纳的前一部分，并在其后加上省略号（......）。
（3）对end标签内容的处理：不做长度截取，且要做到自动补齐end标签。

有关SAX的详细介绍，请查看最好的参考资料 http://www.saxproject.org/ 。其中有一个很重要的类 DefaultHandler，该类中的startElement, endElement, characters 3个方法尤为重要。为解决上述问题，需要设计2个类：HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子类，HTMLWriter继承了DefaultHandler，其中最为关键的是要重写上述3个关键方法。

一.HTMLWriter类的代码：
这个类主要用于写操作，最重要是理解变量strippedElementLevel 的用法。上面问题的具体业务逻辑处理（标签的过滤和长度截取）将在子类HTMLFilter 解决。

package org.util.sax.html;

import java.io.*;
import java.util.Locale;

import openxml.parser.HTMLdtd;
import openxml.parser.HTMLSAXParser;

import org.xml.sax.Attributes;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

public class HTMLWriter extends DefaultHandler implements LexicalHandler {

private ErrorHandler errorHandler;

private Writer out;

private int strippedElementLevel = 0; //用来作为start标签和end标签成对出现的标记（极为重要），具体算法思路类似于堆栈

private boolean inRawElement;

public void filter(String htmlContent) throws IOException, SAXException {

filter(new StringReader(htmlContent));

}

public void filter(Reader in) throws IOException, SAXException {

filter(new InputSource(in));

}

public void filter(InputSource in) throws IOException, SAXException {

HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);

parser.setLexicalHandler(this);

XMLReader htmlReader = new HTMLParserAdapter(parser);

htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);

htmlReader.setContentHandler(this);

prepare();

htmlReader.parse(in);

}

protected void prepare() {

if (out == null) {

out = new StringWriter();

}

public void setErrorHandler(ErrorHandler errorHandler) {

this.errorHandler = errorHandler;

}

public void setOut(Writer out) {

this.out = out;

}

public Writer getOut() {

return out;

}

public String getResultAsString() {

if (out instanceof StringWriter) {

return out.toString();

}

throw new IllegalStateException("Not a buffered target");

}

@Override

public void startDocument() throws SAXException {

prepare();

}

@Override

public final void startElement(String namespaceURI,

String localName,

String qName,

Attributes attrs) throws SAXException {

if (strippedElementLevel > 0) {

strippedElementLevel++;

return;

}

// features/namespace is false

if (!startTag(qName, attrs)) {

strippedElementLevel = 1;

}

@Override

public final void endElement(String namespaceURI,

String localName,

String qName) throws SAXException {

if (strippedElementLevel > 0) {

strippedElementLevel--;

return;

}

// features/namespace is false

endTag(qName);

}

protected boolean startTag(String tagName, Attributes attrs) throws SAXException {

String tagUpper = tagName.toUpperCase();

inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);

write('<');

write(tagName);

for (int i = 0; i < attrs.getLength(); i++) {

// features/namespace is false

String attrName = attrs.getQName(i);

attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));

}

write('>');

return true;

}

protected void endTag(String tagName) throws SAXException {

inRawElement = false;

if (!isEmptyTag(tagName.toUpperCase())) {

write("</");

write(tagName);

write('>');

}

@Override

public void characters(char[] ch, int start, int length) throws SAXException {

if (strippedElementLevel != 0) {

return;

}

if (inRawElement) {

write(ch, start, length);

return;

}

text(ch, start, length);

}

protected void text(char[] ch, int start, int length) throws SAXException {

writeText(ch, start, length);

}

public void startDTD(String tagName, String publicId, String systemId) throws SAXException {

write("<!DOCTYPE ");

write(tagName);

write(" PUBLIC ");

write('"');

write(publicId);

write('"');

write('>');

}

public void endDTD() {}

public void startEntity(String name) {}

public void endEntity(String name) {}

public void startCDATA() {}

public void endCDATA() {}

public void comment(char ch[], int start, int length) throws SAXException {

if (strippedElementLevel == 0) {

write("<!--");

write(ch, start, length);

write("-->");

}

@Override

public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {

if (strippedElementLevel == 0) {

write(ch, start, length);

}

protected void attribute(final String tagUpper, // 规范化的 TAG 名称 - 使用大写字母

final String attrLower, // 规范化的属性名称 - 使用小写字母

String attrName,

String attrValue) throws SAXException {

write(' ');

write(attrName);

if (!isBoolean(attrLower, tagUpper)) {

write('=');

write('"');

for (int i = 0; i < attrValue.length(); i++) {

writeEncoded(attrValue.charAt(i), true);

}

write('"');

}

protected final void writeText(char[] ch, int start, int length) throws SAXException {

writeTextWithEnd(ch, start, start + length);

}

protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException {

for (int i = begin; i < end; i++) {

writeEncoded(ch[i], false);

}

protected void writeEncoded(char c, boolean isAttr) throws SAXException {

switch (c) {

case '<':

write("<");

break;

case '>':

write(">");

break;

case '&':

write("&");

break;

case 0xa0: // NBSP

// 暂时只特殊处理特殊字符 NBSP

// 当组信 NBSP 在转换到纯文本时可变成空格

// 但其它特殊字符没有简单的Ascii字符可替代, 因而这里也不执行替代

write(" ");

break;

case '"':

if (isAttr) {

write(""");

break;

}

default:

write(c);

}

protected void write(char c) throws SAXException {

try {

out.write(c);

} catch (IOException e) {

throw new SAXException(e);

}

protected void write(char ch[], int start, int length) throws SAXException {

try {

out.write(ch, start, length);

} catch (IOException e) {

throw new SAXException(e);

}

protected void write(String s) throws SAXException {

try {

out.write(s);

} catch (IOException e) {

throw new SAXException(e);

}

private static boolean isBoolean(String attrLower, String tagUpper) {

return HTMLdtd.isBoolean(attrLower, tagUpper);

}

private static boolean isEmptyTag(String tagUpper) {

return HTMLdtd.isEmptyTag(tagUpper);

}

二. HTMLFilter 类的代码：
主要解决标签过滤，即哪些标签和属性需要过滤，解决长度截取问题，即断点出现在startTag,text,endTag的情况应该如何解决。
主要理解重写父类HTMLWriter的几个方法：startTag()、characters()、comment()、attribute()。另外需要一个成员变量currentLen记录当前已写入的长度：每次调用write()方法时，都要把本次写入的字符数累加到currentLen上。

package org.util.sax.html;

import org.xml.sax.Attributes;

import org.xml.sax.SAXException;

import java.util.Map;

import java.io.Writer;

import java.io.CharArrayWriter;

import java.io.IOException;

public class HTMLFilter extends HTMLWriter {

ConfigManager conf = CM.getConfig();

Map<String, String> cidMap; //cid 和正文内容图片 filename的映射

private int currentLen; //当前已经写入out的长度

private int maxLen; //允许push的最大长度

private boolean ignore=false; //当出现要截取时，就设为 true ,意味着如果ignore 为true时，就以后的内容都要忽略。

public HTMLFilter(Map<String,String> map,int allowMessage_BodyLen) {

//super.setAllowContentLen(allowMessage_BodyLen);

this.maxLen=allowMessage_BodyLen;

this.cidMap=map;

}

@Override

protected boolean startTag(String tagName, Attributes attrs) throws SAXException {

if (!isTagAllowed(tagName, attrs)) {

return false;

}

if (ignore) {

return false;

}

Writer originalOutput = getOut();

int remainChars = getRemainChars();

if(remainChars == 0){

ignore = true;

write("

");

return false;

}

CharArrayWriter capturedOutput = new CharArrayWriter();

setOut(capturedOutput);

try {

if (super.startTag(tagName, attrs)) {

if (capturedOutput.toCharArray().length < remainChars) {

try {

originalOutput.write(capturedOutput.toCharArray());

return true;

} catch (IOException e) {

throw new SAXException(e);

}

} finally {

setOut(originalOutput);

}

ignore = true;

write("

");

return false;

}

@Override

public void characters(char[] ch, int start, int length) throws SAXException {

if (ignore) { //如果长度已经超出限制，则不写

return;

}

int remainChars = getRemainChars();

if (remainChars == 0) {

ignore = true;

write("

");

return;

}

if (remainChars < length) { //当将要写入的 text 长度大于 remainChars 时，就写入所能够写入的字符，然后添加省略号

ignore = true;

super.characters(ch, start, remainChars);

write("

");

} else {

super.characters(ch, start, length);

}

@Override

protected void endTag(String tagName) throws SAXException {

super.endTag(tagName);

}

public void comment(char ch[], int start, int length) throws SAXException{

if(ignore){

return;

}

int remainChars = getRemainChars();

if (remainChars == 0) {

ignore = true;

write("

");

return;

}

if (remainChars < length) {

ignore=true;

super.comment(ch, start, remainChars);

} else {

super.comment(ch, start, length);

}

@Override

protected void attribute(final String tagUpper,

final String attrLower,

final String attrName,

String attrValue) throws SAXException {

if (attrLower.startsWith("on")) {

return;

}

if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src") && attrValue.trim().indexOf("cid:") != -1) {

attrValue=attrValue.trim();

int cid_idx = attrValue.indexOf("cid:");

String cid = attrValue.substring(cid_idx + 4);

// System.out.println("cid is: "+ cid);

String photoName = cidMap.get(cid);

// System.out.println("photoName is: "+ photoName);

if (photoName != null) {

super.attribute(tagUpper, attrLower, attrName, "#{" + photoName + "}");

} else{

super.attribute(tagUpper, attrLower, attrName, "#{" + " " + "}");

}

} else {

attrValue = transformScript(attrValue);

super.attribute(tagUpper, attrLower, attrName, attrValue);

}

private String transformScript(final String data) {

if (true) {

final String trimedData = data.trim();

final String scriptData = mySubstringAfterIgnoreCase(trimedData, "javascript:");

if (scriptData != null) {

return "";

}

return data;

}

protected boolean isTagAllowed(String tagName, Attributes attrs) {

if (tagName.equalsIgnoreCase("SCRIPT")) {

return false;

}

if(tagName.equalsIgnoreCase("A")){ //超链接标签不push

return false;

}

if (tagName.equalsIgnoreCase("PARAM")) {

String name = getAttrIgnoreCase(attrs, "name");

if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name)) {

return false;

}

if (tagName.equalsIgnoreCase("STYLE")) {

return false;

}

if (tagName.equalsIgnoreCase("LINK") &&

"stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel"))) {

return false;

}

if (tagName.equals("FRAME") || tagName.equals("FRAMESET")) {

return false;

}

return true;

}

private static String getAttrIgnoreCase(Attributes attrs, String name) {

for (int i = 0, len = attrs.getLength(); i < len; i++) {

if (name.equalsIgnoreCase(attrs.getQName(i))) {

return attrs.getValue(i);

}

return null;

}

/**

* 忽略控制字符后, 判断是否以某字符串开始, 并返回匹配后的截取部分.

* <p/>

* 注: 忽略控制字符是为了对付IE的安全漏洞

* @param source 源字符串

* @param prefix 要匹配的前缀字符串

* @return 如果测试成功, 返回截取后的字符串; 否则, 返回 null;

static String mySubstringAfterIgnoreCase(String source, String prefix) {

int sourceLength = source.length();

int targetLength = prefix.length();

if (sourceLength < targetLength) {

return null;

}

int sourceOffset = 0;

int targetOffset = 0;

char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));

for (; sourceOffset < sourceLength; sourceOffset++) {

char c = source.charAt(sourceOffset);

if (c < ' ') {

// 忽略控制字符

continue;

}

if (Character.toUpperCase(c) != targetChar) {

break;

}

targetOffset++;

if (targetOffset == targetLength) {

return source.substring(sourceOffset + 1);

}

targetChar = Character.toUpperCase(prefix.charAt(targetOffset));

}

return null;

}

protected void write(char c) throws SAXException {

super.write(c);

currentLen++;

}

protected void write(char ch[], int start, int length) throws SAXException {

super.write(ch, start, length);

currentLen += length;

}

protected void write(String s) throws SAXException {

super.write(s);

currentLen += s.length();

}

protected int getRemainChars(){ //求出还剩多少个字符可以写入

return (maxLen - currentLen);

}

posted on 2008-09-01 21:26 cong 阅读(934) 评论(0) 编辑收藏所属分类: JAVA

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园 IT新闻 Chat2DB C++博客博问管理

lycong

使用SAX对XML根据具体需求过滤标签和长度截取

My Links

Blog Stats

常用链接

留言簿(1)

随笔分类

随笔档案

搜索

最新评论

阅读排行榜

评论排行榜