基于Trie树结构中文分词字典(1)
基于Trie树构造中文分词字典(1)
首先,要去定义一个单词树结构,先考虑我们需要的信息,每个节点应该包含的属性,节点常用属性定义如下
public class WordTree implements Comparable<WordTree>{ public WordTree leaf; //数组用来存储该节点所包含的所有子节点 public WordTree[] leafs; //当前节点存储了什么单词字符 public char c; //status 设三种状态,1: 不是单词,继续查找, 2:是单词,但是还可能匹配到更长的单词 3:是单词,当前已经是匹配到的最长单词 public int status; //改字段主要存储单词的一些附加信息,如词性等额外信息,用数组存储 public String[] params; //定义根节点数组容量 public final int MAX_SIZE = 65536;
考虑构造方法
//其实就是构造root节点 public WordTree(){ this.leafs = new WordTree [MAX_SIZE]; } public WordTree(char c){ this.c = c; } //这个主要用来构造子节点 public WordTree(char c, int status, String[] params){ this.c = c; this.status = status; this.params = params; }
在我们构造、查询字典树的时候,我们会经常需要遍历节点,这时就会频繁的查询该节点是否包含某个子节点,下面两个方法可以帮助我们实现这个功能
//根据字符获取当前节点的子节点编号 public int getLeafIdByChar(char c){ if(leafs == null){ return -1; } if(leafs.length == MAX_SIZE){ return c; } return WordTree.binarySearch(leafs, c); } /** * 根据字符c获取子节点对象 * @param c * @return */ public WordTree getLeafByChar(char c){ if(this.leafs == null){ return null; } if(leafs.length == MAX_SIZE){ return this.leafs[c]; } System.out.println(this.leafs.length); int id = WordTree.binarySearch(this.leafs, c); if(id<0){ return null; }else{ if(this.leafs[id] == null){ return null; }else{ return this.leafs[id]; } } }
最重要的就是为节点添加子节点了,需要考虑同事更新节点状态,来标识到哪个节点是一个完整的单词
/** * 为当前节点添加一个子节点 * @return */ @SuppressWarnings("unchecked") public WordTree addSubLeaf(WordTree subleaf){ WordTree leaf = this; WordTree tmpleaf = null; int subLeafId = leaf.getLeafIdByChar(subleaf.getC()); if(leaf.leafs == null){ leaf.leafs = new WordTree [0]; } //if this leaf always exists in this tree, wee need to updated status if needed if(subLeafId > -1){ tmpleaf = leaf.leafs[subLeafId]; if(tmpleaf == null){ tmpleaf = subleaf; } switch(subleaf.getStatus()){ case -1: tmpleaf.setStatus((char)1); break; case 1: if(tmpleaf.getStatus() == 3){ tmpleaf.setStatus(2); } break; case 3: if(tmpleaf.getStatus() != 2){ tmpleaf.setStatus(3); } tmpleaf.setParams(subleaf.getParams()); } leaf.leafs[subLeafId] = tmpleaf; return tmpleaf; } //if this leaf also not exists in this tree else{ WordTree[] tmpleafs = leaf.leafs; leaf.leafs = new WordTree [tmpleafs.length + 1]; int insertId = -(subLeafId+1); System.arraycopy(tmpleafs, 0, leaf.leafs, 0, insertId); System.arraycopy(tmpleafs, insertId, leaf.leafs, insertId+1, tmpleafs.length-insertId); leaf.leafs[insertId] = subleaf; return leaf.leafs[insertId]; } }