数据挖掘-基于贝叶斯算法及KNN算法.docx-道客多多

资源描述

1、数据挖掘-基于贝叶斯算法及 KNN 算法的 newsgroup18828 文档分类器的 JAVA 实现（上）。本分类器的完整工程可以通过“点击打开链接”下载，其中附有详细的运行方法说明，用 eclipse 可以运行，学习数据挖掘的朋友可以跑一下，有问题可以联系我，欢迎交流 :) 上文中描述了 newsgroup18828 文档集的预处理及贝叶斯算法的 JAVA 实现，下面我们来看看如何实现基于 KNN 算法的 newsgroup 文本分类器。1 KNN 算法的描述。KNN 算法描述如下：STEP ONE: 文本向量化表示，由特征词的 TF*IDF 值计算。STEP TWO: 在新

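2、文本到达后，根据特征词确定新文本的向量。STEP THREE: 在训练文本集中选出与新文本最相似的 K 个文本，相似度用向量夹角余弦度量，计算公式为：$\mathrm{Sim}(d_i, d_j) = \dfrac{\sum_{k=1}^{M} w_{ik}\, w_{jk}}{\sqrt{\sum_{k=1}^{M} w_{ik}^{2}}\;\sqrt{\sum_{k=1}^{M} w_{jk}^{2}}}$，其中 $w_{ik}$ 为文档 $d_i$ 的第 $k$ 个特征词的 TF*IDF 权重，$M$ 为特征词总数。K 值的确定目前没有很好的方法，一般采用先定一个初始值，然后根据实验测试的结果调整 K 值，本项目中 K 取 20。STEP FOUR: 在新文本的 K 个邻居中，依次计算每类的权重，每类的权重等于 K 个邻居中属于该类的训练样本与测试样本的相似度之和。STEP FIVE: 比较类的权重，将文本分到权重最大的那个类别中。2 文档 TF-IDF 计算及向量化表示。实现 KNN 算法首先要实现文档的向量化表示，即计算特征词的 TF*IDF，每个文档的向量由包含所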

3、有特征词的 TF*IDF 值组成，每一维对应一个特征词。TF 及 IDF 的计算公式如下，分别为特征词的特征项频率和逆文档频率：$TF(t, d) = \dfrac{n_{t,d}}{\sum_{k} n_{k,d}}$，$IDF(t) = \log\dfrac{N}{|\{d : t \in d\}|}$（IDF 取常用的对数形式），其中 $n_{t,d}$ 为特征词 $t$ 在文档 $d$ 中出现的次数，$N$ 为文档总数。文档向量计算类 ComputeWordsVector.java 如下1. package com.pku.yangliu; 2. import java.io.BufferedReader; 3. import java.io.File; 4. import java.io.FileReader; 5. import java.io.FileWriter; 6. import java.io.IOException; 7. im

4、port java.util.SortedMap; 8. import java.util.Map; 9. import java.util.Set; 10. import java.util.TreeMap; 11. import java.util.Iterator; 12. 13. /*计算文档的属性向量，将所有文档向量化 14. * author yangliu 15. * qq 772330184 16. * mail 17. * 18. */ 19. public class ComputeWordsVector 20. 21. /*计算文档的 TF 属性向量,直接写成二维数组遍

5、历形式即可，没必要递归 22. * param strDir 处理好的 newsgroup 文件目录的绝对路径 23. * param trainSamplePercent 训练样例集占每个类目的比例 24. * param indexOfSample 测试样例集的起始的测试样例编号 25. * param wordMap 属性词典 map 26. * throws IOException 27. */ 28. public void computeTFMultiIDF(String strDir, double trainSamplePercent, int indexOfSample, M

6、ap iDFPerWordMap, Map wordMap) throws IOException 29. File fileDir = new File(strDir); 30. String word; 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机31. SortedMap TFPerDocMap = new TreeMap();32. /注意可以用两个写文件，一个专门写测试样例，一个专门写训练样例，用sampleType 的值来表示 33. String trainFileDir = “F:/DataMiningSample/docVector/wordTFIDFMapTrainSamp

7、le“+indexOfSample; 34. String testFileDir = “F:/DataMiningSample/docVector/wordTFIDFMapTestSample“+indexOfSample; 35. FileWriter tsTrainWriter = new FileWriter(new File(trainFileDir); 36. FileWriter tsTestWrtier = new FileWriter(new File(testFileDir); 37. FileWriter tsWriter = tsTrainWriter; 38. Fil

8、e sampleDir = fileDir.listFiles(); 39. for(int i = 0; i = testBeginIndex 75. for(Iterator mt = tempTF.iterator(); mt.hasNext();) 76. Map.Entry me = mt.next(); 77. /wordWeight = (me.getValue() / wordSumPerDoc) * IDFPerWordMap.get(me.getKey(); 78. /这里 IDF 暂时设为 1，具体的计算 IDF 算法改进和实现见我的博客中关于 kmeans 聚类的博文

9、79. wordWeight = (me.getValue() / wordSumPerDoc) * 1.0; 80. TFPerDocMap.put(me.getKey(), wordWeight); 81. 82. tsWriter.append(cateShortName + “ “); 83. String keyWord = fileShortName.substring(0,5); 84. tsWriter.append(keyWord+ “ “); 85. Set tempTF2 = TFPerDocMap.entrySet(); 86. for(Iterator mt = te

10、mpTF2.iterator(); mt.hasNext();) 87. Map.Entry ne = mt.next(); 88. tsWriter.append(ne.getKey() + “ “ + ne.getValue() + “ “); 89. 90. tsWriter.append(“n“); 91. tsWriter.flush(); 92. 93. 94. tsTrainWriter.close(); 95. tsTestWrtier.close(); 96. tsWriter.close(); 97. 98. 99. /*统计每个词的总的出现次数，返回出现次数大于 3 次的

11、词汇构成最终的属性词典 100. * param strDir 处理好的 newsgroup 文件目录的绝对路径吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机101. * throws IOException 102. */ 103. public SortedMap countWords(String strDir,Map wordMap) throws IOException 104. File sampleFile = new File(strDir); 105. File sample = sampleFile.listFiles(); 106. String word; 107. f

12、or(int i = 0; i newWordMap = new TreeMap();127. Set allWords = wordMap.entrySet(); 128. for(Iterator it = allWords.iterator(); it.hasNext();) 129. Map.Entry me = it.next(); 130. if(me.getValue() = 1) 131. newWordMap.put(me.getKey(),me.getValue(); 132. 133. 134. return newWordMap; 135. 136. 137. /*打印

13、属性词典 138. * param SortedMap 属性词典 139. * throws IOException 140. */ 141. void printWordMap(Map wordMap) throws IOException 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机142. / TODO Auto-generated method stub 143. System.out.println(“printWordMap“); 144. int countLine = 0; 145. File outPutFile = new File(“F:/DataMiningSample

14、/docVector/allDicWordCountMap.txt“); 146. FileWriter outPutFileWriter = new FileWriter(outPutFile); 147. Set allWords = wordMap.entrySet(); 148. for(Iterator it = allWords.iterator(); it.hasNext();) 149. Map.Entry me = it.next(); 150. outPutFileWriter.write(me.getKey()+“ “+me.getValue()+“n“); 151. c

15、ountLine+; 152. 153. System.out.println(“WordMap size“ + countLine); 154. 155. 156. /*计算 IDF，即属性词典中每个词在多少个文档中出现过 157. * param SortedMap 属性词典 158. * return 单词的 IDFmap 159. * throws IOException 160. */ 161. SortedMap computeIDF(String string, Map wordMap) throws IOException 162. / TODO Auto-generated

16、method stub 163. File fileDir = new File(string); 164. String word; 165. SortedMap IDFPerWordMap = new TreeMap(); 166. Set wordMapSet = wordMap.entrySet(); 167. for(Iterator pt = wordMapSet.iterator();pt.hasNext();) 168. Map.Entry pe = pt.next(); 169. Double coutDoc = 0.0; 170. String dicWord = pe.g

17、etKey(); 171. File sampleDir = fileDir.listFiles(); 172. for(int i = 0; i 保存测试集和训练集；(2) 注意要以“类目_文件名”作为每个文件的 key，才能避免同名不同内容的文件出现；(3) 注意设置 JVM 参数，否则会出现 JAVA heap 溢出错误；(4) 本程序用向量夹角余弦计算相似度。KNN 算法实现类 KNNClassifier.java 如下：1. package com.pku.yangliu; 2. 3. import java.io.BufferedReader; 4. import java.io.File; 5.

18、 import java.io.FileReader; 6. import java.io.FileWriter; 7. import java.io.IOException; 8. import java.util.Comparator; 9. import java.util.HashMap; 10. import java.util.Iterator; 11. import java.util.Map; 12. import java.util.Set; 13. import java.util.TreeMap; 14. 15. /*KNN 算法的实现类，本程序用向量夹角余弦计算相似度

19、16. * author yangliu 17. * qq 772330184 18. * mail 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机19. * 20. */ 21. 22. public class KNNClassifier 23. 24. /*用 KNN 算法对测试文档集分类,读取测试样例和训练样例集 25. * param trainFiles 训练样例的所有向量构成的文件 26. * param testFiles 测试样例的所有向量构成的文件 27. * param kNNResultFile KNN 分类结果文件路径 28. * return double 分类准确

20、率 29. * throws IOException 30. */ 31. private double doProcess(String trainFiles, String testFiles, 32. String kNNResultFile) throws IOException 33. / TODO Auto-generated method stub 34. /首先读取训练样本和测试样本，用 map保存测试集和训练集，注意训练样本的类目信息也得保存， 35. /然后遍历测试样本，对于每一个测试样本去计算它与所有训练样本的相似度，相似度保存入 map有 36. /序 map 中去，然

21、后取前 K 个样本，针对这 k 个样本来给它们所属的类目计算权重得分，对属于同一个类目的权重求和进而得到 37. /最大得分的类目，就可以判断测试样例属于该类目下， K 值可以反复测试，找到分类准确率最高的那个值 38. /！注意要以“类目_文件名“作为每个文件的 key，才能避免同名不同内容的文件出现39. /！注意设置 JM 参数，否则会出现 JAVA heap 溢出错误 40. /！本程序用向量夹角余弦计算相似度 41. File trainSamples = new File(trainFiles); 42. BufferedReader trainSamplesBR = new Bu

22、fferedReader(new FileReader(trainSamples); 43. String line; 44. String lineSplitBlock; 45. Map trainFileNameWordTFMap = new TreeMap (); 46. TreeMap trainWordTFMap = new TreeMap(); 47. while(line = trainSamplesBR.readLine() != null) 48. lineSplitBlock = line.split(“ “); 49. trainWordTFMap.clear(); 50

23、. for(int i = 2; i tempMap = new TreeMap(); 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机54. tempMap.putAll(trainWordTFMap); 55. trainFileNameWordTFMap.put(lineSplitBlock0+“_“+lineSplitBlock1, tempMap); 56. 57. trainSamplesBR.close(); 58. 59. File testSamples = new File(testFiles); 60. BufferedReader testSamplesBR = new B

24、ufferedReader(new FileReader(testSamples); 61. Map testFileNameWordTFMap = new TreeMap (); 62. Map testClassifyCateMap = new TreeMap();/分类形成的对 63. Map testWordTFMap = new TreeMap(); 64. while(line = testSamplesBR.readLine() != null) 65. lineSplitBlock = line.split(“ “); 66. testWordTFMap.clear(); 67

25、. for(int i = 2; i tempMap = new TreeMap(); 71. tempMap.putAll(testWordTFMap); 72. testFileNameWordTFMap.put(lineSplitBlock0+“_“+lineSplitBlock1,tempMap); 73. 74. testSamplesBR.close(); 75. /下面遍历每一个测试样例计算与所有训练样本的距离，做分类 76. String classifyResult; 77. FileWriter testYangliuWriter = new FileWriter(new

26、File(“F:/DataMiningSample/docVector/yangliuTest“); 78. FileWriter KNNClassifyResWriter = new FileWriter(kNNResultFile); 79. Set testFileNameWordTFMapSet =testFileNameWordTFMap.entrySet(); 80. for(Iterator it = testFileNameWordTFMapSet.iterator(); it.hasNext();) 81. Map.Entry me = it.next(); 82. clas

27、sifyResult = KNNComputeCate(me.getKey(), me.getValue(), trainFileNameWordTFMap, testYangliuWriter); 83. KNNClassifyResWriter.append(me.getKey()+“ “+classifyResult+“n“); 84. KNNClassifyResWriter.flush(); 85. testClassifyCateMap.put(me.getKey(), classifyResult); 86. 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机87. KNNClassi

28、fyResWriter.close(); 88. /计算分类的准确率 89. double righteCount = 0; 90. Set testClassifyCateMapSet = testClassifyCateMap.entrySet(); 91. for(Iterator it = testClassifyCateMapSet.iterator(); it.hasNext();) 92. Map.Entry me = it.next(); 93. String rightCate = me.getKey().split(“_“)0; 94. if(me.getValue().e

29、quals(rightCate) 95. righteCount+; 96. 97. 98. testYangliuWriter.close(); 99. return righteCount / testClassifyCateMap.size(); 100. 101. 102. /*对于每一个测试样本去计算它与所有训练样本的向量夹角余弦相似度 103. * 相似度保存入 map有序 map 中去，然后取前 K 个样本， 104. * 针对这 k 个样本来给它们所属的类目计算权重得分，对属于同一个类 105. * 目的权重求和进而得到最大得分的类目，就可以判断测试样例属于该 106. * 类

30、目下。K 值可以反复测试，找到分类准确率最高的那个值 107. * param testWordTFMap 当前测试文件的向量 108. * param trainFileNameWordTFMap 训练样本Map 109. * param testYangliuWriter 110. * return String K 个邻居权重得分最大的类目 111. * throws IOException 112. */ 113. private String KNNComputeCate( 114. String testFileName, 115. Map testWordTFMap, 116.

31、Map trainFileNameWordTFMap, FileWriter testYangliuWriter) throws IOException 117. / TODO Auto-generated method stub 118. HashMap simMap = new HashMap();/ 后面需要将该 HashMap 按照 value 排序 119. double similarity; 120. Set trainFileNameWordTFMapSet = trainFileNameWordTFMap.entrySet(); 121. for(Iterator it =

32、trainFileNameWordTFMapSet.iterator(); it.hasNext();) 122. Map.Entry me = it.next(); 123. similarity = computeSim(testWordTFMap, me.getValue(); 124. simMap.put(me.getKey(),similarity); 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机125. 126. /下面对 simMap 按照 value 排序 127. ByValueComparator bvc = new ByValueComparator(simMap);

33、128. TreeMap sortedSimMap = new TreeMap(bvc); 129. sortedSimMap.putAll(simMap); 130. 131. /在 disMap 中取前 K 个最近的训练样本对其类别计算距离之和，K 的值通过反复试验而得 132. Map cateSimMap = new TreeMap();/K 个最近训练样本所属类目的距离之和 133. double K = 20; 134. double count = 0; 135. double tempSim; 136. 137. Set simMapSet = sortedSimMap.ent

34、rySet();138. for(Iterator it = simMapSet.iterator(); it.hasNext();) 139. Map.Entry me = it.next(); 140. count+; 141. String categoryName = me.getKey().split(“_“)0; 142. if(cateSimMap.containsKey(categoryName) 143. tempSim = cateSimMap.get(categoryName); 144. cateSimMap.put(categoryName, tempSim + me

35、.getValue(); 145. 146. else cateSimMap.put(categoryName, me.getValue(); 147. if (count K) break; 148. 149. /下面到 cateSimMap 里面把 sim 最大的那个类目名称找出来 150. /testYangliuWriter.flush(); 151. /testYangliuWriter.close(); 152. double maxSim = 0; 153. String bestCate = null; 154. Set cateSimMapSet = cateSimMap.e

36、ntrySet();155. for(Iterator it = cateSimMapSet.iterator(); it.hasNext();) 156. Map.Entry me = it.next(); 157. if(me.getValue() maxSim) 158. bestCate = me.getKey(); 159. maxSim = me.getValue(); 160. 161. 吸氧机，家用吸氧机价格制氧机，鱼跃家庭制氧机162. return bestCate; 163. 164. 165. /*计算测试样本向量和训练样本向量的相似度 166. * param tes

37、tWordTFMap 当前测试文件的向量 167. * param trainWordTFMap 当前训练样本向量 168. * return Double 向量之间的相似度以向量夹角余弦计算 169. * throws IOException 170. */ 171. private double computeSim(Map testWordTFMap, 172. Map trainWordTFMap) 173. / TODO Auto-generated method stub 174. double mul = 0, testAbs = 0, trainAbs = 0; 175. S

38、et testWordTFMapSet = testWordTFMap.entrySet(); 176. for(Iterator it = testWordTFMapSet.iterator(); it.hasNext();) 177. Map.Entry me = it.next(); 178. if(trainWordTFMap.containsKey(me.getKey() 179. mul += me.getValue()*trainWordTFMap.get(me.getKey(); 180. 181. testAbs += me.getValue() * me.getValue(

39、); 182. 183. testAbs = Math.sqrt(testAbs); 184. 185. Set trainWordTFMapSet = trainWordTFMap.entrySet(); 186. for(Iterator it = trainWordTFMapSet.iterator(); it.hasNext();) 187. Map.Entry me = it.next(); 188. trainAbs += me.getValue()*me.getValue(); 189. 190. trainAbs = Math.sqrt(trainAbs); 191. retu

40、rn mul / (testAbs * trainAbs); 192. 193. 194. /*根据 KNN 算法分类结果文件生成正确类目文件，而正确率和混淆矩阵的计算可以复用贝叶斯算法类中的方法 195. * param kNNRightFile 分类正确类目文件 196. * param kNNResultFile 分类结果文件 197. * throws IOException 198. */ 199. private void createRightFile(String kNNResultFile, String kNNRightFile)throws IOException 吸氧机

41、，家用吸氧机价格制氧机，鱼跃家庭制氧机200. / TODO Auto-generated method stub 201. String rightCate; 202. FileReader fileR = new FileReader(kNNResultFile); 203. FileWriter KNNRrightResult = new FileWriter(new File(kNNRightFile); 204. BufferedReader fileBR = new BufferedReader(fileR); 205. String line; 206. String lineB

42、lock; 207. while(line = fileBR.readLine() != null) 208. lineBlock = line.split(“ “); 209. rightCate = lineBlock0.split(“_“)0; 210. KNNRrightResult.append(lineBlock0+“ “+rightCate+“n“); 211. 212. KNNRrightResult.flush(); 213. KNNRrightResult.close(); 214. 215. 216. 217. /* 218. * param args 219. * th

43、rows IOException 220. */ 221. public void KNNClassifierMain(String args) throws IOException 222. / TODO Auto-generated method stub 223. /wordMap 是所有属性词的词典 224. double accuracyOfEveryExp = new double10; 225. double accuracyAvg,sum = 0; 226. KNNClassifier knnClassifier = new KNNClassifier(); 227. Naiv

44、eBayesianClassifier nbClassifier = new NaiveBayesianClassifier();228. Map wordMap = new TreeMap(); 229. Map IDFPerWordMap = new TreeMap(); 230. ComputeWordsVector computeWV = new ComputeWordsVector(); 231. wordMap = computeWV.countWords(“F:/DataMiningSample/processedSampleOnlySpecial“, wordMap); 232. IDFPerWordMap = computeWV.computeIDF(“F:/DataMiningSample/processedSampleOnlySpecial“,wordMap); 233. computeWV.printWordMap(wordMap); 234. /首先生成 KNN 算法 10 次试验需要的文档 TF 矩阵文件 235. for(int i = 0; i 253.

展开阅读全文