利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-14 01:54:19 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词,是不是很好玩,哈哈。写这篇文章就是让你简单、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜) 项目结构 当然,text.da
|
EntropyJudger.java计算熵值 package grid.text.evolution;
import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;
/**
 * Decides whether a candidate string should be accepted as a word, based on
 * two statistics read from a {@link TextIndexer}: the candidate's "solid
 * rate" (how often it occurs relative to its individual characters) and the
 * entropy of the characters found immediately before and after it.
 */
public class EntropyJudger {

    /** Minimum number of occurrences a candidate needs to be considered. */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Threshold for the solid rate calculated from the candidate's occurrence
     * count and the count of every single character in it.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater this value is, the fewer new words you will
     * get, but with higher accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Threshold for the entropy calculated from the counts of the characters
     * preceding and following the candidate.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater this value is, the fewer new words you will
     * get, but with higher accuracy.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    private TextIndexer indexer;

    /**
     * @param indexer index over the document the candidates come from
     */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * Applies the solid-rate check first and the entropy check second.
     *
     * @param candidate text to evaluate
     * @return {@code false} if either statistic is below its threshold,
     *         {@code true} otherwise
     */
    public boolean judge(String candidate) {
        double solidRate = getSolidRate(candidate);
        if (solidRate < SOLID_RATE_THRESHOLD) {
            return false;
        }
        double entropy = getEntropy(candidate);
        if (entropy < ENTROPY_THRESHOLD) {
            return false;
        }
        return true;
    }

    /**
     * Walks every occurrence of the candidate reported by the indexer,
     * tallies the character just before and just after each occurrence, and
     * returns the smaller of the two resulting entropies.
     */
    private double getEntropy(String candidate) {
        Pos pos = new Pos(candidate);
        CountMap<Character> frontCountMap = new CountMap<Character>();
        CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();
        while (indexer.find(pos).isFound()) {
            final int off = pos.getPos();
            // Only neighbours accepted by TextUtils.isCnLetter are tallied.
            final char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }
            final char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }
        return Math.min(entropyOf(frontCountMap), entropyOf(backCountMap));
    }

    /** Computes -sum(p * ln p) over the relative frequencies of the keys. */
    private static double entropyOf(CountMap<Character> counts) {
        double entropy = 0;
        for (char key : counts.keySet()) {
            double rate = (double) counts.get(key) / counts.count();
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Computes the solid rate of a candidate: the geometric mean of
     * {@code count(candidate) / count(character)} over its characters,
     * multiplied by the square root of the candidate length.
     *
     * @param candidate text to evaluate
     * @return 1 for candidates shorter than two characters; 0 when the
     *         candidate occurs fewer than {@code LEAST_COUNT_THRESHOLD}
     *         times or one of its characters is reported with a zero count;
     *         otherwise the solid rate
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }
        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }
        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            final int charCount = indexer.count(String.valueOf(candidate.charAt(i)));
            if (charCount == 0) {
                // Avoid dividing by zero: the ratio would become Infinity and
                // the candidate would pass the solid-rate check unconditionally.
                return 0;
            }
            rate *= (double) count / charCount;
        }
        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    /**
     * @param indexer replacement index used by subsequent calls
     */
    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }
}
NewWordDiscover.java抽词程序 package grid.text.evolution;
import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;
import java.util.HashSet;
import java.util.Set;
/**
 * Extracts new-word candidates from a document: a selector proposes
 * candidates, known words are confirmed through the dictionary, and the rest
 * are accepted or rejected by an {@link EntropyJudger}.
 */
public class NewWordDiscover {

    /** Minimum word length */
    private final static int MIN_CANDIDATE_LEN = 2;

    /** Maximum word length */
    private final static int MAX_CANDIDATE_LEN = 6;

    /** Characters that disqualify a candidate when they appear at either end. */
    private static char[] structuralLetters = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' };

    /** Lookup set built from {@code structuralLetters}. */
    private static Set<Character> structuralLetterSet = new HashSet<Character>();

    static {
        for (int i = 0; i < structuralLetters.length; i++) {
            structuralLetterSet.add(structuralLetters[i]);
        }
    }

    private CnDictionary dictionary;

    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * New word discovery is based on statistics and entropy; the document
     * should be around the 100kb level, otherwise the result may be
     * unsatisfying.
     *
     * @param document text to scan
     * @return the set of candidates accepted by the judger
     */
    public Set<String> discover(String document) {
        final Set<String> discovered = new HashSet<String>();
        final TextIndexer indexer = new CnPreviewTextIndexer(document);
        final TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        final EntropyJudger judger = new EntropyJudger(indexer);

        while (!selector.end()) {
            final String word = selector.next();
            if (TextUtils.isBlank(word) || hasStructuralEdge(word)) {
                continue;
            }
            // Use only "discovered.contains(word)" here if you want to find
            // new words without any dictionary.
            final boolean alreadyKnown = dictionary.contains(word) || discovered.contains(word);
            if (alreadyKnown) {
                selector.select();
                continue;
            }
            if (judger.judge(word)) {
                discovered.add(word);
            }
        }
        return discovered;
    }

    /** Tells whether the first or the last character is a structural letter. */
    private static boolean hasStructuralEdge(String word) {
        if (structuralLetterSet.contains(word.charAt(0))) {
            return true;
        }
        return structuralLetterSet.contains(word.charAt(word.length() - 1));
    }
}
index
这几个类用于给词创建索引,方便从词典中找出 CnPreviewTextIndexer.java package grid.text.index;
import grid.common.TextUtils;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;
/**
 * {@link TextIndexer} that pre-indexes a document by character: for every
 * character accepted by {@code TextUtils.isCnLetter} it records all offsets
 * at which that character occurs, so a lookup only has to test the offsets
 * where the first character of the searched text appears.
 */
public class CnPreviewTextIndexer implements TextIndexer {

    /**
     * Initial capacity of the position map and divisor for the per-character
     * list size estimate. NOTE(review): presumably the number of commonly
     * used Chinese characters - confirm.
     */
    private final static int CN_LETTER_COUNT = 5021;

    private final String document;

    /** Character -> ascending offsets of that character in the document. */
    private Map<Character, Vector<Integer>> posMap;

    /**
     * @param document text to index; the index is built immediately
     */
    public CnPreviewTextIndexer(String document) {
        this.document = document;
        init();
    }

    /** Builds {@code posMap} with a single pass over the document. */
    private void init() {
        final int len = document.length();
        // Rough initial capacity for each per-character offset list.
        final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                + 1);
        posMap = new HashMap<Character, Vector<Integer>>(CN_LETTER_COUNT);
        for (int i = 0; i < len; i++) {
            final char c = document.charAt(i);
            if (!TextUtils.isCnLetter(c)) {
                continue;
            }
            Vector<Integer> posVector = posMap.get(c);
            if (null == posVector) {
                posVector = new Vector<Integer>(supposedMinCount);
                posMap.put(c, posVector);
            }
            posVector.add(i);
        }
    }

    /**
     * @param text text to count
     * @return 0 for blank text or when the first character is not indexed;
     *         otherwise the number of indexed offsets of the first character
     *         at which {@code TextUtils.match} accepts the text
     */
    @Override
    public int count(String text) {
        if (TextUtils.isBlank(text)) {
            return 0;
        }
        Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return 0;
        }
        if (1 == text.length()) {
            // A single character matches at every one of its own offsets.
            return vector.size();
        }
        final int size = vector.size();
        int count = 0;
        for (int i = 0; i < size; i++) {
            if (TextUtils.match(document, vector.get(i), text)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Advances {@code pos} to the next occurrence of its target, resuming
     * after the offset-list entry recorded in {@code pos.arrayIndex}.
     *
     * @param pos cursor holding the target and the previous match, if any
     * @return the same {@code pos}; its found flag is {@code false} when the
     *         target is blank, its first character is not indexed, or no
     *         further occurrence exists
     */
    @Override
    public Pos find(Pos pos) {
        String text = pos.getTarget();
        pos.setFound(false);
        if (TextUtils.isBlank(text)) {
            return pos;
        }
        Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return pos;
        }
        final int arraySize = vector.size();
        final int arrayIndex = pos.arrayIndex + 1;
        for (int i = arrayIndex; i < arraySize; i++) {
            final int offset = vector.get(i);
            // Test the text at this candidate offset, as count() does.
            if (TextUtils.match(document, offset, text)) {
                pos.setFound(true);
                pos.setPos(offset);
                pos.arrayIndex = i;
                break;
            }
        }
        return pos;
    }

    /** @return length of the indexed document */
    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested substring, or {@code ""} when the range does not
     *         lie entirely inside the document
     */
    @Override
    public String sub(int off, int len) {
        // A range ending exactly at the end of the document is valid.
        if (off < 0 || len < 0 || off + len > document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    /**
     * @param index offset into the document
     * @return the character at {@code index}, or the NUL character when the
     *         index is out of range
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}
Pos.java package grid.text.index;
/**
 * Search cursor: carries the text being looked for together with the state
 * of the most recent lookup (offset, offset-list index and found flag).
 */
public class Pos {

    /** Index in position array for current matched full target text. */
    int arrayIndex = -1;

    /** Text being searched for. */
    private final String target;

    /** Offset of the current matched full target text; starts at -1. */
    private int pos = -1;

    /** Result of the most recent lookup; starts as {@code false}. */
    private boolean found;

    /**
     * @param target text to search for
     */
    public Pos(String target) {
        this.target = target;
    }

    /** @return the text being searched for */
    public String getTarget() {
        return this.target;
    }

    /** @return the stored match offset */
    public int getPos() {
        return this.pos;
    }

    /** @return the stored found flag */
    public boolean isFound() {
        return this.found;
    }

    /** Stores a new match offset. */
    void setPos(int pos) {
        this.pos = pos;
    }

    /** Stores a new found flag. */
    void setFound(boolean found) {
        this.found = found;
    }
}
SimpleTextIndexer.java package grid.text.index;
/**
 * {@link TextIndexer} without any pre-built index: every lookup scans the
 * document with {@link String#indexOf(String, int)}.
 */
public class SimpleTextIndexer implements TextIndexer {

    private final String document;

    /**
     * @param document text to search in
     */
    public SimpleTextIndexer(String document) {
        this.document = document;
    }

    /**
     * @param text text to count
     * @return number of non-overlapping occurrences of {@code text}; 0 for
     *         {@code null} or empty text
     */
    @Override
    public int count(String text) {
        // indexOf("") always matches and would never advance the offset.
        if (text == null || text.isEmpty()) {
            return 0;
        }
        final int len = text.length();
        int count = 0;
        int off = document.indexOf(text, 0);
        while (off > -1) {
            count++;
            off = document.indexOf(text, off + len);
        }
        return count;
    }

    /**
     * Advances {@code pos} to the next occurrence of its target, searching
     * from just past the previous match (or from the start when there is no
     * previous match).
     *
     * @param pos cursor holding the target and the previous match offset
     * @return the same {@code pos}; its found flag is {@code false} when the
     *         target is {@code null} or empty, or no further occurrence exists
     */
    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();
        pos.setFound(false);
        // An empty target would be "found" at the same offset forever.
        if (text == null || text.isEmpty()) {
            return pos;
        }
        final int from = pos.getPos() < 0 ? 0 : pos.getPos() + text.length();
        final int off = document.indexOf(text, from);
        if (off > -1) {
            pos.setFound(true);
            pos.setPos(off);
        }
        return pos;
    }

    /** @return length of the document */
    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested substring, or {@code ""} when the range does not
     *         lie entirely inside the document
     */
    @Override
    public String sub(int off, int len) {
        // Return "" for an out-of-range request, as charAt() returns NUL.
        if (off < 0 || len < 0 || off + len > document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    /**
     * @param index offset into the document
     * @return the character at {@code index}, or the NUL character when the
     *         index is out of range
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}
TextIndexer.java package grid.text.index;
/**
 * Read-only lookup operations over a single document.
 */
public interface TextIndexer {

    /**
     * Counts a text in the document.
     *
     * @param text text to look for
     * @return count for the specific text
     */
    int count(String text);

    /**
     * Moves a search cursor forward.
     *
     * @param pos current position
     * @return next position for the current pos
     */
    Pos find(Pos pos);

    /**
     * @return original document length
     */
    int len();

    /**
     * Extracts part of the document.
     *
     * @param off start offset
     * @param len length of the requested part
     * @return the sub string starting from <b>off</b> with a length of
     *         <b>len</b>
     */
    String sub(int off, int len);

    /**
     * Reads one character of the document.
     *
     * @param index offset into the document
     * @return the character at the specified index
     */
    char charAt(int index);
}
participle (编辑:成都站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |



