Java 字符串编码类别获取
Java 字符串编码类型获取
package com.poi.examples; /** *//** * <p> * Title: LoonFramework * </p> * <p> * Description:编码基本类型集合 * </p> * <p> * Copyright: Copyright (c) 2008 * </p> * <p> * Company: LoonFramework * </p> * <p> * License: http://www.apache.org/licenses/LICENSE-2.0 * </p> * * @author chenpeng * @email:ceponline@yahoo.com.cn * @version 0.1 */ public class Encoding { // 支持的字符格式 public static int GB2312 = 0; public static int GBK = 1; public static int BIG5 = 2; public static int UTF8 = 3; public static int UNICODE = 4; public static int EUC_KR = 5; public static int SJIS = 6; public static int EUC_JP = 7; public static int ASCII = 8; public static int UNKNOWN = 9; public static int TOTALT = 10; public final static int SIMP = 0; public final static int TRAD = 1; // 解析名称用 public static String[] javaname; // 编码用 public static String[] nicename; // 应用于html中的字符集 public static String[] htmlname; public Encoding() { javaname = new String[TOTALT]; nicename = new String[TOTALT]; htmlname = new String[TOTALT]; javaname[GB2312] = "GB2312"; javaname[GBK] = "GBK"; javaname[BIG5] = "BIG5"; javaname[UTF8] = "UTF8"; javaname[UNICODE] = "Unicode"; javaname[EUC_KR] = "EUC_KR"; javaname[SJIS] = "SJIS"; javaname[EUC_JP] = "EUC_JP"; javaname[ASCII] = "ASCII"; javaname[UNKNOWN] = "ISO8859_1"; // 分配编码名称 htmlname[GB2312] = "GB2312"; htmlname[GBK] = "GBK"; htmlname[BIG5] = "BIG5"; htmlname[UTF8] = "UTF-8"; htmlname[UNICODE] = "UTF-16"; htmlname[EUC_KR] = "EUC-KR"; htmlname[SJIS] = "Shift_JIS"; htmlname[EUC_JP] = "EUC-JP"; htmlname[ASCII] = "ASCII"; htmlname[UNKNOWN] = "ISO8859-1"; // 分配可读名称 nicename[GB2312] = "GB-2312"; nicename[GBK] = "GBK"; nicename[BIG5] = "Big5"; nicename[UTF8] = "UTF-8"; nicename[UNICODE] = "Unicode"; nicename[EUC_KR] = "EUC-KR"; nicename[SJIS] = "Shift-JIS"; nicename[EUC_JP] = "EUC-JP"; nicename[ASCII] = "ASCII"; nicename[UNKNOWN] = "UNKNOWN"; } public String toEncoding(final int type) { return (javaname[type] + "," + nicename[type] + "," + htmlname[type]) .intern(); } } package 
com.poi.examples;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Guesses the character encoding of a byte sequence by scoring it against
 * every encoding listed in {@link Encoding} and picking the highest score.
 *
 * <p>
 * The per-encoding frequency tables declared here are allocated but never
 * filled in by this class (the {@code init()} call in the constructor is
 * commented out), so unless something else populates them only the
 * byte-range part of each score is effective.
 * </p>
 */
public class ParseEncoding extends Encoding {

    // Number of leading bytes sampled from an InputStream.
    private static final int SAMPLE_SIZE = 8192;

    int[][] GB2312format = new int[94][94];

    int[][] GBKformat = new int[126][191];

    int[][] Big5format = new int[94][158];

    int[][] EUC_KRformat = new int[94][94];

    int[][] JPformat = new int[94][94];

    public ParseEncoding() {
        super();
        // Initialize the encoding format tables.
        // init();
    }

    /**
     * Detects the encoding of the resource named by {@code path}.
     *
     * @param path
     *            an {@code http://} or {@code https://} URL, otherwise a file
     *            path
     * @return readable name of the detected encoding, "UNKNOWN" on failure
     */
    public String getEncoding(final String path) {
        return check(getEncodeValue(path));
    }

    /**
     * Detects the encoding of the data supplied by a stream. The stream is
     * closed by this call.
     *
     * @param in
     *            stream to sample
     * @return readable name of the detected encoding, "UNKNOWN" on failure
     */
    public String getEncoding(final InputStream in) {
        return check(getEncodeValue(in));
    }

    /**
     * Detects the encoding of a byte array.
     *
     * @param buffer
     *            bytes to inspect, may be null
     * @return readable name of the detected encoding, "UNKNOWN" on failure
     */
    public String getEncoding(final byte[] buffer) {
        return check(getEncodeValue(buffer));
    }

    /**
     * Detects the encoding of the data behind a URL.
     *
     * @param url
     *            location to open
     * @return readable name of the detected encoding, "UNKNOWN" on failure
     */
    public String getEncoding(final URL url) {
        return check(getEncodeValue(url));
    }

    /**
     * Maps a detection result to its readable name. Any value that is not a
     * valid table index (including the -1 failure marker) yields the name of
     * UNKNOWN.
     */
    private String check(final int result) {
        if (result < 0 || result >= nicename.length) {
            return nicename[UNKNOWN];
        }
        return nicename[result];
    }

    /**
     * Detects the encoding of the resource named by a path string.
     *
     * @param path
     *            URL (http or https) or file path
     * @return encoding index, or -1 when the resource cannot be read
     */
    private int getEncodeValue(String path) {
        if (path.startsWith("http://") || path.startsWith("https://")) {
            try {
                return getEncodeValue(new URL(path));
            } catch (MalformedURLException e) {
                return -1;
            }
        }
        return getEncodeValue(new File(path));
    }

    /**
     * Detects the encoding used by an InputStream and returns the index of
     * the most probable encoding. At most the first 8192 bytes are sampled,
     * and only the bytes actually read are scored. The stream is always
     * closed.
     *
     * @param in
     *            stream to sample, may be null
     * @return encoding index, or -1 when the stream is null or reading fails
     */
    public int getEncodeValue(InputStream in) {
        if (in == null) {
            return -1;
        }
        final byte[] sample = new byte[SAMPLE_SIZE];
        int filled = 0;
        try {
            int count;
            while ((count = in.read(sample, filled, sample.length - filled)) > 0) {
                filled += count;
            }
        } catch (Exception e) {
            // Detection is best effort: any read failure is reported as -1.
            return -1;
        } finally {
            closeQuietly(in);
        }
        // Drop the unused tail so that padding zeros are not scored.
        final byte[] content = new byte[filled];
        System.arraycopy(sample, 0, content, 0, filled);
        return getEncodeValue(content);
    }

    /**
     * Detects the encoding of the data found at a URL and returns the index
     * of the most probable encoding.
     *
     * @param url
     *            location to open
     * @return encoding index, or -1 when the URL cannot be opened or read
     */
    public int getEncodeValue(URL url) {
        InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            stream = null;
        }
        return getEncodeValue(stream);
    }

    /**
     * Detects the encoding of a file and returns the index of the most
     * probable encoding. The whole file is read.
     *
     * @param file
     *            file to read
     * @return encoding index, or -1 when the file cannot be opened or read
     */
    public int getEncodeValue(File file) {
        byte[] buffer;
        try {
            buffer = read(new FileInputStream(file));
        } catch (FileNotFoundException e) {
            buffer = null;
        }
        return getEncodeValue(buffer);
    }

    /**
     * Reads an InputStream completely into a byte array and closes it.
     *
     * @param inputStream
     *            stream to drain
     * @return all bytes of the stream, or null when reading fails
     */
    private final byte[] read(final InputStream inputStream) {
        final ByteArrayOutputStream collected = new ByteArrayOutputStream();
        // A fixed, non-empty chunk is required: with a zero-length buffer
        // read() returns 0 and never signals end of stream.
        final byte[] chunk = new byte[SAMPLE_SIZE];
        try {
            int count;
            while ((count = inputStream.read(chunk)) >= 0) {
                collected.write(chunk, 0, count);
            }
            return collected.toByteArray();
        } catch (IOException e) {
            return null;
        } finally {
            closeQuietly(inputStream);
        }
    }

    /** Closes a stream, ignoring a failure of close() itself. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails.
        }
    }

    /**
     * Detects the encoding of a byte array and returns the index of the most
     * probable encoding.
     *
     * @param content
     *            bytes to inspect, may be null
     * @return -1 for null input; otherwise the index with the highest score,
     *         or UNKNOWN when no score is above 50
     */
    public int getEncodeValue(byte[] content) {
        if (content == null) {
            return -1;
        }
        final int[] scores = new int[TOTALT];
        // Assign a probability to every candidate.
        scores[GB2312] = gb2312probability(content);
        scores[GBK] = gbkprobability(content);
        scores[BIG5] = big5probability(content);
        scores[UTF8] = utf8probability(content);
        scores[UNICODE] = utf16probability(content);
        scores[EUC_KR] = euc_krprobability(content);
        scores[ASCII] = asciiprobability(content);
        scores[SJIS] = sjisprobability(content);
        scores[EUC_JP] = euc_jpprobability(content);
        scores[UNKNOWN] = 0;

        // Compare the probabilities; on a tie the lower index wins.
        int encoding = UNKNOWN;
        int maxscore = 0;
        for (int index = 0; index < TOTALT; index++) {
            if (scores[index] > maxscore) {
                encoding = index;
                maxscore = scores[index];
            }
        }
        // Only accept a result whose probability is above 50%.
        if (maxscore <= 50) {
            encoding = UNKNOWN;
        }
        return encoding;
    }

    /**
     * Combines the two halves of a double-byte score: up to 50 points for
     * the share of high bytes that formed a valid pair and up to 50 points
     * for the accumulated table weight.
     */
    private static int combinedScore(int matched, int highBytes, long weight,
            long totalWeight) {
        final float rangeval = 50 * ((float) matched / (float) highBytes);
        final float formatval = 50 * ((float) weight / (float) totalWeight);
        return (int) (rangeval + formatval);
    }

    /**
     * Shared scorer for encodings whose lead byte lies in 0xA1..maxLead and
     * whose trail byte lies in 0xA1..0xFE.
     *
     * @param content
     *            bytes to score
     * @param maxLead
     *            highest accepted lead byte (unsigned)
     * @param table
     *            frequency table indexed by [lead - 0xA1][trail - 0xA1]
     * @param fallbackWeight
     *            weight added for rows 15..54 when the table entry is zero
     * @return score from the range and table components
     */
    private int eucStyleProbability(byte[] content, int maxLead,
            int[][] table, int fallbackWeight) {
        // Counters start at 1 so the ratios stay defined without high bytes.
        int highBytes = 1;
        int matched = 1;
        long weight = 0;
        long totalWeight = 1;
        final int last = content.length - 1;
        for (int i = 0; i < last; i++) {
            if (content[i] >= 0) {
                continue;
            }
            highBytes++;
            final int lead = content[i] & 0xFF;
            final int trail = content[i + 1] & 0xFF;
            if (0xA1 <= lead && lead <= maxLead && 0xA1 <= trail
                    && trail <= 0xFE) {
                matched++;
                totalWeight += 500;
                final int row = lead - 0xA1;
                final int column = trail - 0xA1;
                if (table[row][column] != 0) {
                    weight += table[row][column];
                } else if (15 <= row && row < 55) {
                    weight += fallbackWeight;
                }
            }
            // A high byte always consumes the byte that follows it.
            i++;
        }
        return combinedScore(matched, highBytes, weight, totalWeight);
    }

    /**
     * Computes the probability that the data is GB2312. A GB2312 character
     * here is a lead byte 0xA1..0xF7 followed by a trail byte 0xA1..0xFE.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int gb2312probability(byte[] content) {
        return eucStyleProbability(content, 0xF7, GB2312format, 200);
    }

    /**
     * Computes the probability that the data is GBK: either a GB2312 pair or
     * a pair from the extension area (lead 0x81..0xFE, trail 0x40..0x7E or
     * 0x80..0xFE). The result is lowered by one.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int gbkprobability(byte[] content) {
        int highBytes = 1;
        int matched = 1;
        long weight = 0;
        long totalWeight = 1;
        final int last = content.length - 1;
        for (int i = 0; i < last; i++) {
            if (content[i] >= 0) {
                continue;
            }
            highBytes++;
            final int lead = content[i] & 0xFF;
            final int trail = content[i + 1] & 0xFF;
            if (0xA1 <= lead && lead <= 0xF7 && 0xA1 <= trail
                    && trail <= 0xFE) {
                // GB2312 range.
                matched++;
                totalWeight += 500;
                final int row = lead - 0xA1;
                final int column = trail - 0xA1;
                if (GB2312format[row][column] != 0) {
                    weight += GB2312format[row][column];
                } else if (15 <= row && row < 55) {
                    weight += 200;
                }
            } else if (0x81 <= lead && lead <= 0xFE
                    && ((0x80 <= trail && trail <= 0xFE)
                            || (0x40 <= trail && trail <= 0x7E))) {
                // GBK extension area.
                matched++;
                totalWeight += 500;
                final int row = lead - 0x81;
                final int column = trail - 0x40;
                if (GBKformat[row][column] != 0) {
                    weight += GBKformat[row][column];
                }
            }
            i++;
        }
        return combinedScore(matched, highBytes, weight, totalWeight) - 1;
    }

    /**
     * Computes the probability that the data is Big5: lead byte 0xA1..0xF9
     * followed by a trail byte 0x40..0x7E or 0xA1..0xFE.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int big5probability(byte[] content) {
        int highBytes = 1;
        int matched = 1;
        long weight = 0;
        long totalWeight = 1;
        final int last = content.length - 1;
        for (int i = 0; i < last; i++) {
            if (content[i] >= 0) {
                continue;
            }
            highBytes++;
            final int lead = content[i] & 0xFF;
            final int trail = content[i + 1] & 0xFF;
            final boolean lowTrail = 0x40 <= trail && trail <= 0x7E;
            final boolean highTrail = 0xA1 <= trail && trail <= 0xFE;
            if (0xA1 <= lead && lead <= 0xF9 && (lowTrail || highTrail)) {
                matched++;
                totalWeight += 500;
                final int row = lead - 0xA1;
                // The two trail ranges map to adjacent column ranges.
                final int column = lowTrail ? trail - 0x40 : trail - 0x61;
                if (Big5format[row][column] != 0) {
                    weight += Big5format[row][column];
                } else if (3 <= row && row <= 37) {
                    weight += 200;
                }
            }
            i++;
        }
        return combinedScore(matched, highBytes, weight, totalWeight);
    }

    /**
     * Computes the probability that the data is UTF-8 by counting the bytes
     * that belong to well-formed two- and three-byte sequences.
     *
     * @param content
     *            bytes to score
     * @return the share (0..100) of non-ASCII bytes inside valid sequences
     *         when it is above 98, or above 95 with more than 30 such bytes;
     *         otherwise 0
     */
    private int utf8probability(byte[] content) {
        final int length = content.length;
        int goodbytes = 0;
        int asciibytes = 0;
        for (int i = 0; i < length; i++) {
            final int current = content[i] & 0xFF;
            if (current <= 0x7F) {
                asciibytes++;
            } else if (0xC0 <= current && current <= 0xDF && i + 1 < length
                    && isContinuation(content[i + 1])) {
                goodbytes += 2;
                i++;
            } else if (0xE0 <= current && current <= 0xEF && i + 2 < length
                    && isContinuation(content[i + 1])
                    && isContinuation(content[i + 2])) {
                goodbytes += 3;
                i += 2;
            }
        }
        if (asciibytes == length) {
            return 0;
        }
        final int score = (int) (100 * ((float) goodbytes / (float) (length - asciibytes)));
        // Anything that is not almost entirely well-formed is reduced to zero.
        if (score > 98) {
            return score;
        }
        if (score > 95 && goodbytes > 30) {
            return score;
        }
        return 0;
    }

    /** Tells whether a byte lies in the range 0x80..0xBF. */
    private static boolean isContinuation(byte value) {
        final int unsigned = value & 0xFF;
        return 0x80 <= unsigned && unsigned <= 0xBF;
    }

    /**
     * Computes the probability that the data is UTF-16 by looking for a
     * leading FE FF or FF FE byte pair.
     *
     * @param content
     *            bytes to score
     * @return 100 when the data starts with one of the two pairs, else 0;
     *         input shorter than two bytes yields 0
     */
    private int utf16probability(byte[] content) {
        if (content.length < 2) {
            return 0;
        }
        final int first = content[0] & 0xFF;
        final int second = content[1] & 0xFF;
        final boolean bigEndianMark = first == 0xFE && second == 0xFF;
        final boolean littleEndianMark = first == 0xFF && second == 0xFE;
        return (bigEndianMark || littleEndianMark) ? 100 : 0;
    }

    /**
     * Computes the probability that the data is plain ASCII. The score
     * starts at 75 and loses 5 for every byte with the high bit set and for
     * every ESC byte.
     *
     * @param content
     *            bytes to score
     * @return remaining score, never below 0
     */
    private int asciiprobability(byte[] content) {
        int score = 75;
        for (int i = 0; i < content.length; i++) {
            final byte current = content[i];
            // 0x1B is ESC (used by ISO 2022).
            if (current < 0 || current == (byte) 0x1B) {
                score -= 5;
            }
            if (score <= 0) {
                return 0;
            }
        }
        return score;
    }

    /**
     * Computes the probability that the data is EUC-KR: lead and trail byte
     * both in 0xA1..0xFE.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int euc_krprobability(byte[] content) {
        return eucStyleProbability(content, 0xFE, EUC_KRformat, 0);
    }

    /**
     * Computes the probability that the data is EUC-JP: lead and trail byte
     * both in 0xA1..0xFE.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int euc_jpprobability(byte[] content) {
        return eucStyleProbability(content, 0xFE, JPformat, 0);
    }

    /**
     * Computes the probability that the data is Shift-JIS: lead byte
     * 0x81..0x9F or 0xE0..0xEF followed by a trail byte 0x40..0x7E or
     * 0x80..0xFC. The result is lowered by one. Unlike the other double-byte
     * scorers, the following byte is skipped only after a valid pair.
     *
     * @param content
     *            bytes to score
     * @return probability score
     */
    private int sjisprobability(byte[] content) {
        int highBytes = 1;
        int matched = 1;
        long weight = 0;
        long totalWeight = 1;
        final int last = content.length - 1;
        for (int i = 0; i < last; i++) {
            if (content[i] >= 0) {
                continue;
            }
            highBytes++;
            final int lead = content[i] & 0xFF;
            final int trail = content[i + 1] & 0xFF;
            final boolean leadOk = (0x81 <= lead && lead <= 0x9F)
                    || (0xE0 <= lead && lead <= 0xEF);
            final boolean trailOk = (0x40 <= trail && trail <= 0x7E)
                    || (0x80 <= trail && trail <= 0xFC);
            if (!leadOk || !trailOk) {
                // Bytes 0xA1..0xDF and everything else are only counted.
                continue;
            }
            matched++;
            totalWeight += 500;
            int row = content[i] + 256;
            // NOTE(review): 256 is added to the signed byte, so a trail byte
            // in 0x40..0x7E becomes 0x140..0x17E here and always takes the
            // "adjust = 0" branch. Kept as in the original; confirm intent.
            int column = content[i + 1] + 256;
            int adjust;
            if (column < 0x9f) {
                adjust = 1;
                if (column > 0x7f) {
                    column -= 0x20;
                } else {
                    column -= 0x19;
                }
            } else {
                adjust = 0;
                column -= 0x7e;
            }
            if (row < 0xa0) {
                row = ((row - 0x70) << 1) - adjust;
            } else {
                row = ((row - 0xb0) << 1) - adjust;
            }
            row -= 0x20;
            // NOTE(review): this overwrites the column computed above with a
            // constant; possibly "column -= 0x20" was meant. Kept unchanged.
            column = 0x20;
            if (row < JPformat.length && column < JPformat[row].length
                    && JPformat[row][column] != 0) {
                weight += JPformat[row][column];
            }
            i++;
        }
        return combinedScore(matched, highBytes, weight, totalWeight) - 1;
    }
}

package com.poi.examples;

/**
 * Command-line demo that prints the encoding detected for a few sample
 * strings and web sites. The strings are converted with the platform default
 * charset, and the site checks need network access.
 */
public class EncodingTest {

    public static void main(String argc[]) {
        final ParseEncoding parse = new ParseEncoding();

        System.out.println("*:");
        System.out.println("测试字符串,编码格式=" + parse.getEncoding("百度".getBytes()));
        System.out.println("测试站点,编码格式=" + parse.getEncoding("http://www.baidu.com"));
        System.out.println();

        System.out.println("中国*:");
        System.out.println("测试字符串,编码格式=" + parse.getEncoding("い地チ瓣".getBytes()));
        System.out.println("测试站点,编码格式=" + parse.getEncoding("http://tw.yahoo.com/"));
        System.out.println("测试站点(繁体字,UTF编码),编码格式=" + parse.getEncoding("http://www.javaworld.com.tw/jute"));
        System.out.println();

        System.out.println("日本:");
        System.out.println("测试字符串,编码格式=" + parse.getEncoding("その機能".getBytes()));
        System.out.println("测试站点,编码格式=" + parse.getEncoding("http://www.4gamer.net"));
        System.out.println();

        System.out.println("自称蚩尤后代那群……:");
        System.out.println("测试站点,编码格式=" + parse.getEncoding("http://www.easyjava.co.kr/"));
    }
}