readFile(id)：输入 id，依次读取 id_1.html 至 id_10.html 的内容，并去掉其中的 HTML 标签

package com.yesky.wstsearch.common;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;
/**
 * Loads the stored HTML parts of an article and reduces them to plain text.
 *
 * <p>{@link #readFile(long)} takes an article id, concatenates the files
 * {@code <id>_1.html}, {@code <id>_2.html}, ... and returns their content with
 * the HTML markup removed.
 *
 * @author yu
 */
public class FileToCon {
    /** Base directory used by {@link #readFile(long)}. */
    private static final String DEFAULT_BASE_DIR = "e:/home00/art/";

    /** Articles live in the sub-directory named {@code id % BUCKET_COUNT}. */
    private static final int BUCKET_COUNT = 500;

    /** Highest part number ({@code <id>_<part>.html}) that is probed. */
    private static final int MAX_PARTS = 10;

    // The patterns are constant, so they are compiled once instead of on every call.

    /** A complete {@code <script ...> ... </script>} element, including its body. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style ...> ... </style>} element, including its body. */
    private static final Pattern STYLE_BLOCK = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Any properly closed tag. */
    private static final Pattern COMPLETE_TAG = Pattern.compile("<[^>]+>");

    /** A tag that was opened but never closed; matches from {@code <} to the end of the input. */
    private static final Pattern UNCLOSED_TAG = Pattern.compile("<[^>]+");

    /**
     * Reads a whole text file.
     *
     * <p>The file is decoded with the platform default charset. Every line is
     * terminated with {@code "\n"} in the result, whatever line separator the
     * file uses and whether or not its last line ends with one.
     *
     * @param file the file to read
     * @return the file content
     * @throws RuntimeException wrapping the {@link IOException} if the file
     *         cannot be opened or read
     */
    public static StringBuffer readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();

            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }

            return content;
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            // Always release the file handle, including when reading failed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Strips HTML markup from a string.
     *
     * <p>Removal happens in four passes: {@code <script>} elements with their
     * bodies, {@code <style>} elements with their bodies, every remaining closed
     * tag, and finally any tag left unclosed at the end of the input. Character
     * entities such as {@code &amp;} are not decoded.
     *
     * @param inputString text that may contain HTML tags; may be {@code null}
     * @return the text without tags, or an empty string when the input is {@code null}
     */
    public static String Html2Text(String inputString) {
        if (inputString == null) {
            return "";
        }

        String text = SCRIPT_BLOCK.matcher(inputString).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = COMPLETE_TAG.matcher(text).replaceAll("");
        return UNCLOSED_TAG.matcher(text).replaceAll("");
    }

    /**
     * Reads the article with the given id from the default base directory and
     * returns it as plain text.
     *
     * @param id the article id
     * @return the concatenated content of the article's parts with HTML tags
     *         removed; empty if the first part does not exist
     * @throws RuntimeException if an existing part cannot be read
     * @see #readFile(String, long)
     */
    public static String readFile(long id) {
        return readFile(DEFAULT_BASE_DIR, id);
    }

    /**
     * Reads the article with the given id from below {@code baseDir} and returns
     * it as plain text.
     *
     * <p>Parts are looked up as {@code <baseDir>/<id % 500>/<id>_<part>.html} for
     * part numbers 1 to 10, in order. Reading stops at the first part that does
     * not exist, so later parts are ignored once there is a gap.
     *
     * @param baseDir the directory that contains the bucket sub-directories
     * @param id the article id
     * @return the concatenated content of the parts found, with HTML tags
     *         removed; empty if the first part does not exist
     * @throws IllegalArgumentException if {@code baseDir} is {@code null}
     * @throws RuntimeException if an existing part cannot be read
     */
    public static String readFile(String baseDir, long id) {
        if (baseDir == null) {
            throw new IllegalArgumentException("baseDir must not be null");
        }

        File bucketDir = new File(baseDir, String.valueOf(id % BUCKET_COUNT));
        StringBuilder merged = new StringBuilder();
        for (int part = 1; part <= MAX_PARTS; part++) {
            File partFile = new File(bucketDir, id + "_" + part + ".html");
            if (!partFile.exists()) {
                break;
            }
            merged.append(readFileContent(partFile));
        }
        return Html2Text(merged.toString());
    }

    /**
     * Prints the plain text of article 160600 to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(readFile(160600));
    }
}

发表于 2010-07-01 10:45 西瓜阅读(381) 评论(0) 编辑收藏所属分类: 正则表达式

常用链接

留言簿(2)

随笔分类(116)

随笔档案(114)

文章分类(1)

文章档案(1)

搜索

最新评论

阅读排行榜

评论排行榜

西瓜地儿沈阳求职（java3年以上经验）！ashutc@126.com
BlogJava \| 首页 \| 发新随笔 \| 发新文章 \| 联系 \| 聚合 \| 管理	随笔：114 文章：1 评论：45 引用：0