为了支持全文检索,有必要将HTML格式的文章转化为纯文本格式,因此我设计了一个基本的WebFormatter类,提供一个简单的public static String html2text(String html),将HTML格式转化为Text: /* * File: WebFormatter.java * Created on 2005-6-24 * Author: Liao Xuefeng, asklxf@163.com * Copyright (C) 2005, Liao Xuefeng. */ package com.mboker.blog.web.util; import java.util.*; import java.text.SimpleDateFormat; /** * Do some format on web display. * * @author Xuefeng */ public class WebFormatter { public static String html2text(String html) { StringBuffer sb = new StringBuffer(html.length()); char[] data = html.toCharArray(); int start = 0; boolean previousIsPre = false; Token token = null; for(;;) { token = parse(data, start, previousIsPre); if(token==null) break; previousIsPre = token.isPreTag(); sb = sb.append(token.getText()); start += token.getLength(); } return sb.toString(); } private static Token parse(char[] data, int start, boolean previousIsPre) { if(start>=data.length) return null; // try to read next char: char c = data[start]; if(c=='<') { // this is a tag or comment or script: int end_index = indexOf(data, start+1, '>'); if(end_index==(-1)) { // the left is all text! return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre); } String s = new String(data, start, end_index-start+1); // now we got s="<...>": if(s.startsWith(""); if(end_comment_index==(-1)) { // illegal end, but treat as comment: return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre); } else return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre); } String s_lowerCase = s.toLowerCase(); if(s_lowerCase.startsWith(""); if(end_script_index==(-1)) // illegal end, but treat as script: return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre); else return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre); } else { // this is a tag: return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre); } } // this is a text: int next_tag_index = indexOf(data, start+1, '<'); if(next_tag_index==(-1)) return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre); return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre); } private static int indexOf(char[] data, int start, String s) { char[] ss = s.toCharArray(); // TODO: performance can improve! for(int i=start; i<(data.length-ss.length); i++) { // compare from data[i] with ss[0]: boolean match = true; for(int j=0; j match = false; break; } } if(match) return i; } return (-1); } private static int indexOf(char[] data, int start, char c) { for(int i=start; i return i; } return (-1); } } class Token { public static final int TOKEN_TEXT = 0; // html text. public static final int TOKEN_COMMENT = 1; // comment like public static final int TOKEN_TAG = 2; // tag like , , etc. | ".toCharArray();
|小黑屋|最新主题|手机版|微赢网络技术论坛 ( 苏ICP备08020429号 )
GMT+8, 2024-9-30 03:28 , Processed in 0.130444 second(s), 12 queries , Gzip On, MemCache On.
Powered by Discuz! X3.5
© 2001-2023 Discuz! Team.