html转换text-分段落,实现富文本导入word的格式转换,标签过滤
一、工具类 Html2Text
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.*;
/**
 * Strips markup from an HTML fragment and collects the text content.
 *
 * <p>The class is a {@link HTMLEditorKit.ParserCallback}: the Swing HTML parser calls
 * {@link #handleText(char[], int)} for each run of text it finds, and the runs are
 * appended to an internal buffer that {@link #getText()} exposes.
 *
 * <p>An instance keeps the text of its most recent {@link #parse(String)} call, so a single
 * instance must not be shared between concurrent parses. {@link #getContent(String)} creates
 * a fresh instance per call and therefore does not share the buffer between callers.
 */
public class Html2Text extends HTMLEditorKit.ParserCallback {

    /** Text collected by the most recent parse; starts empty so getText() is safe before any parse. */
    StringBuffer s = new StringBuffer();

    public Html2Text() {
    }

    /**
     * Parses the given HTML and stores its text content, replacing any previously stored text.
     *
     * @param str the HTML to parse; must not be null
     * @throws IOException if the parser fails while reading the input
     * @throws NullPointerException if {@code str} is null
     */
    public void parse(String str) throws IOException {
        // Feed the characters to the parser directly. Going through getBytes() and an
        // InputStreamReader would depend on the platform charset and can corrupt
        // non-ASCII text when that charset cannot represent it.
        try (Reader in = new StringReader(str)) {
            s = new StringBuffer();
            ParserDelegator delegator = new ParserDelegator();
            // the third parameter is true to ignore charset directive
            delegator.parse(in, this, true);
        }
    }

    /** Line-ending notifications are ignored. */
    @Override
    public void handleEndOfLineString(String eol) {
    }

    /**
     * Called by the parser for each run of text found between tags; appends it to the buffer.
     *
     * @param text the characters of the text run
     * @param pos position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] text, int pos) {
        s.append(text);
    }

    /**
     * Returns the text collected by the most recent parse.
     *
     * @return the collected text, or an empty string if nothing has been parsed yet
     */
    public String getText() {
        return s.toString();
    }

    /**
     * Convenience method that extracts the text content of an HTML string.
     *
     * <p>A parse failure is logged and whatever text was collected up to that point is
     * returned, so this method does not throw {@link IOException}.
     *
     * @param str the HTML to convert; null is treated as empty input
     * @return the text content, or an empty string for null input
     */
    public static String getContent(String str) {
        if (str == null) {
            return "";
        }
        // A fresh instance per call keeps callers from overwriting each other's buffer.
        Html2Text converter = new Html2Text();
        try {
            converter.parse(str);
        } catch (IOException e) {
            java.util.logging.Logger.getLogger(Html2Text.class.getName()).log(
                    java.util.logging.Level.WARNING,
                    "Failed to parse HTML; returning the text collected so far",
                    e);
        }
        return converter.getText();
    }
}
二、分段实现
/**
 * Converts rich-text HTML into a list of plain-text paragraphs.
 *
 * <p>The input is cut in front of every opening {@code <p>} or {@code <div>} tag
 * (matched case-insensitively), each piece is converted to text with
 * {@code Html2Text.getContent}, and pieces that yield no text are dropped. Text that
 * appears before the first paragraph tag is kept as its own entry.
 *
 * <p>Original author: gaogushenling, 2021/10/23.
 *
 * @param xmlStr the rich-text HTML; null is treated as empty input
 * @return the paragraph texts in document order; if no text was found, a list
 *         containing a single placeholder message
 */
private List<String> getTextContentP(String xmlStr) {
    List<String> textList = new ArrayList<>();
    if (xmlStr != null) {
        // Split with a lookahead so each piece keeps its own opening tag and can be parsed
        // as-is. "\\b" stops "<p" from matching other tags such as <pre> or <param>, and
        // nothing outside of tags is rewritten (a blanket "div" -> "p" replacement would
        // also alter visible text and attribute values).
        String[] segments = xmlStr.split("(?i)(?=<(?:p|div)\\b)");
        for (String segment : segments) {
            String text = Html2Text.getContent(segment);
            if (StringUtil.isNotEmpty(text)) {
                textList.add(text);
            }
        }
    }
    if (textList.isEmpty()) {
        textList.add("富文本文件是空的");
    }
    return textList;
}
调用
// Example call: the string literal stands in for the actual rich-text HTML to convert.
List<String> textList = getTextContentP("富文本(html格式)");