DFA算法应用之敏感词过滤

公司在做一个社区应用，由于我朝特色，众所周知社区应用有一个很重要的就是要进行敏感词的过滤，这块由一个同事负责，听他说，有一个算法叫DFA，可以做这个，个人比较感兴趣，就到网上查了一些资料，有一篇文章写的特别好，老夫的这篇文章就是把其核心的部分（就是怎么应用，老夫一直有一个观点，理论弱于实践，理论懂得再多不会用一点用没有，所以老夫认为应用是核心）摘出来，留作笔记，如果有想了解其原理的，请点击下方的参考资料，好了，既然是应用那么就直接上代码了：


package cn.bridgeli.dfa;

import java.util.HashSet;  
import java.util.Iterator;  
import java.util.Map;  
import java.util.Set;

public class SensitivewordFilter {  
@SuppressWarnings("rawtypes")  
private Map sensitiveWordMap = null;  
public static int minMatchTYpe = 1; // 最小匹配规则  
public static int maxMatchType = 2; // 最大匹配规则

/**  
* 初始化敏感词库  
*/  
public SensitivewordFilter() {  
sensitiveWordMap = new SensitiveWordInit().initKeyWord();  
}

/**  
* 判断文字是否包含敏感字符  
*  
* @param txt  
* 文字  
* @param matchType  
* 匹配规则 1：最小匹配规则，2：最大匹配规则  
* @return 若包含返回true，否则返回false  
*/  
public boolean isContaintSensitiveWord(String txt, int matchType) {  
boolean flag = false;  
for (int i = 0; i < txt.length(); i++) {  
int matchFlag = this.CheckSensitiveWord(txt, i, matchType); // 判断是否包含敏感字符  
if (matchFlag > 0) {  
flag = true;  
}  
}  
return flag;  
}

/**  
* 获取文字中的敏感词  
*  
* @param txt  
* 文字  
* @param matchType  
* 匹配规则&nbsp;1：最小匹配规则，2：最大匹配规则  
* @return  
*/  
public Set<String> getSensitiveWord(String txt, int matchType) {  
Set<String> sensitiveWordList = new HashSet<String>();

for (int i = 0; i < txt.length(); i++) {  
int length = CheckSensitiveWord(txt, i, matchType);  
if (length > 0) {  
sensitiveWordList.add(txt.substring(i, i + length));  
i = i + length &#8211; 1; // 减1的原因，是因为for会自增  
}  
}

return sensitiveWordList;  
}

/**  
* 替换敏感字字符  
*  
* @param txt  
* @param matchType  
* @param replaceChar  
\* 替换字符，默认\*  
*/  
public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {  
String resultTxt = txt;  
Set<String> set = getSensitiveWord(txt, matchType); // 获取所有的敏感词  
Iterator<String> iterator = set.iterator();  
String word = null;  
String replaceString = null;  
while (iterator.hasNext()) {  
word = iterator.next();  
replaceString = getReplaceChars(replaceChar, word.length());  
resultTxt = resultTxt.replaceAll(word, replaceString);  
}

return resultTxt;  
}

/**  
* 获取替换字符串  
*  
* @param replaceChar  
* @param length  
* @return  
*/  
private String getReplaceChars(String replaceChar, int length) {  
String resultReplace = replaceChar;  
for (int i = 1; i < length; i++) {  
resultReplace += replaceChar;  
}

return resultReplace;  
}

/**  
* 检查文字中是否包含敏感字符，检查规则如下：<br>  
*  
* @param txt  
* @param beginIndex  
* @param matchType  
* @return，如果存在，则返回敏感词字符的长度，不存在返回0  
*/  
@SuppressWarnings({ "rawtypes" })  
public int CheckSensitiveWord(String txt, int beginIndex, int matchType) {  
boolean flag = false; // 敏感词结束标识位：用于敏感词只有1位的情况  
int matchFlag = 0; // 匹配标识数默认为0  
char word = 0;  
Map nowMap = sensitiveWordMap;  
for (int i = beginIndex; i < txt.length(); i++) {  
word = txt.charAt(i);  
nowMap = (Map) nowMap.get(word); // 获取指定key  
if (nowMap != null) { // 存在，则判断是否为最后一个  
matchFlag++; // 找到相应key，匹配标识+1  
if ("1".equals(nowMap.get("isEnd"))) { // 如果为最后一个匹配规则,结束循环，返回匹配标识数  
flag = true; // 结束标志位为true  
if (SensitivewordFilter.minMatchTYpe == matchType) { // 最小规则，直接返回,最大规则还需继续查找  
break;  
}  
}  
} else { // 不存在，直接返回  
break;  
}  
}  
if (matchFlag < 2 || !flag) { // 长度必须大于等于1，为词  
matchFlag = 0;  
}  
return matchFlag;  
}

public static void main(String[] args) {  
SensitivewordFilter filter = new SensitivewordFilter();  
System.out.println("敏感词的数量：" + filter.sensitiveWordMap.size());  
String string = "太多的伤感情怀也许只局限于饲养基地 荧幕中的情节，主人公尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的伤感。"  
+ "然后法轮功 我们的扮演的角色就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把自己的情感也附加于银幕情节中，然后感动就流泪，"  
+ "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个人一杯红酒一部电影在夜三级片 深人静的晚上，关上电话静静的发呆着。";  
Set<String> set = filter.getSensitiveWord(string, 1);  
System.out.println("语句中包含敏感词的个数为：" + set.size() + "。包含：" + set);  
}  
}

这个主要是应用，DFA的核心是下面：


package cn.bridgeli.dfa;

import java.io.BufferedReader;  
import java.io.File;  
import java.io.FileInputStream;  
import java.io.InputStreamReader;  
import java.util.HashMap;  
import java.util.HashSet;  
import java.util.Map;  
import java.util.Set;

/**  
* @Description: 初始化敏感词库，将敏感词加入到HashMap中，构建DFA算法模型  
*/  
public class SensitiveWordInit {  
private String ENCODING = "UTF-8";  
@SuppressWarnings("rawtypes")  
public HashMap sensitiveWordMap;

public SensitiveWordInit() {  
super();  
}

@SuppressWarnings("rawtypes")  
public Map initKeyWord() {  
try {  
// 读取敏感词库  
Set<String> keyWordSet = readSensitiveWordFile();  
// 将敏感词库加入到HashMap中  
addSensitiveWordToHashMap(keyWordSet);  
} catch (Exception e) {  
e.printStackTrace();  
}  
return sensitiveWordMap;  
}

/**  
* 读取敏感词库，将敏感词放入HashSet中，构建一个DFA算法模型  
*  
* @param keyWordSet  
* 敏感词库  
*/  
@SuppressWarnings({ "rawtypes", "unchecked" })  
private void addSensitiveWordToHashMap(Set<String> keyWordSet) {  
sensitiveWordMap = new HashMap(keyWordSet.size()); // 初始化敏感词容器，减少扩容操作  
Map nowMap = null;  
Map<String, String> newWorMap = null;

for (String key : keyWordSet) {  
nowMap = sensitiveWordMap;  
for (int i = 0; i < key.length(); i++) {  
char keyChar = key.charAt(i); // 转换成char型  
Object wordMap = nowMap.get(keyChar); // 获取

if (wordMap != null) { // 如果存在该key，直接赋值  
nowMap = (Map) wordMap;  
} else { // 不存在则，则构建一个map，同时将isEnd设置为0，因为他不是最后一个  
newWorMap = new HashMap<String, String>();  
newWorMap.put("isEnd", "0"); // 不是最后一个  
nowMap.put(keyChar, newWorMap);  
nowMap = newWorMap;  
}

if (i == key.length() &#8211; 1) {  
nowMap.put("isEnd", "1"); // 最后一个  
}  
}  
}  
}

/**  
* 读取敏感词库中的内容，将内容添加到set集合中  
*  
* @return  
* @throws Exception  
*/  
@SuppressWarnings("resource")  
private Set<String> readSensitiveWordFile() throws Exception {  
Set<String> set = null;

File file = new File("D:/SensitiveWord.txt");  
InputStreamReader read = new InputStreamReader(new FileInputStream(file), ENCODING);  
try {  
if (file.isFile() && file.exists()) { // 文件流是否存在  
set = new HashSet<String>();  
BufferedReader bufferedReader = new BufferedReader(read);  
String txt = null;  
while ((txt = bufferedReader.readLine()) != null) { // 读取文件，将文件内容放入到set中  
set.add(txt);  
}  
}  
} catch (Exception e) {  
e.printStackTrace();  
} finally {  
read.close();  
}  
return set;  
}  
}

其实这么多已经说完了，但是有一个点需要给大家说一下，这里面有一个敏感词文件列表：SensitiveWord.txt，很明显是一行一个这个不需要多说，需要说明的是，老夫这里使用的UTF-8流读取的，也就是说，文件的编码是UTF-8码，所以如果用windows的记事本编辑这个文件的同学一定要注意：保存文件时请用UTF-8无BOM格式，否则记事本会自作聪明的在文件首部添加一些看不见的东西，导致出错。另外老夫是vim党，在windows下用的也是vim，知道怎么设置vim，保存时为UTF-8无BOM格式的同学，还请留言指点，谢谢。
最后说一下，该方法的不足之处，这种方法的过滤纯靠SensitiveWord.txt这个敏感词列表，并不能语义的分析，而且还有一定的滞后性，所以做一个小系统还行，大系统这么做就不行了。

参考资料：http://www.cnblogs.com/chenssy/p/3751221.html