Weka is a well-known data mining tool; there is a detailed introduction here, and IDMer's blog also describes its usage fairly thoroughly. Using the Weka tools directly is of course fine, but what if you want to use Weka's functionality inside your own platform or framework? Here I share my notes from studying the Weka source code, focusing mainly on how to call the Weka API. They are for reference only; if you find any problems in the code, feel free to contact me by email.
A quick walkthrough of the flow: the constructor first loads an ARFF file and then calls doCluster() to run the clustering.
The ARFF file used in this post is cpu.arff, one of the standard datasets shipped in Weka's data directory; a short excerpt of it appears after the code listing.
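Distilled to its essentials, the class below only needs a handful of Weka API calls. Here is a minimal, self-contained sketch of that call sequence (the class name MinimalXMeansDemo is made up for illustration; it assumes cpu.arff sits in the working directory and simply lets main() throw any exception):

import java.io.File;

import weka.clusterers.XMeans;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class MinimalXMeansDemo {
    public static void main(String[] args) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File("cpu.arff"));      // the sample dataset from Weka's data directory
        Instances dataSet = loader.getDataSet();   // read all instances into memory
        XMeans xmeans = new XMeans();              // X-means decides the number of clusters itself
        xmeans.buildClusterer(dataSet);            // run the clustering
        System.out.println("clusters found: " + xmeans.numberOfClusters());
        System.out.println("first instance falls into cluster "
                + xmeans.clusterInstance(dataSet.instance(0)));
    }
}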
/**
 *
 */
package edu.tju.ikse.mi.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Scanner;

import edu.tju.ikse.mi.anno.util.CfUtil;

import weka.clusterers.XMeans;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * @author Jia Yu
 * @date 2010-5-28
 */
public class WekaCluster {

    /**
     * @param args
     */
    private ArffLoader loader;
    private Instances dataSet;
    private weka.clusterers.Clusterer cluster;
    private int numOfClusters;
    private String newAttribute;
    private File arffFile;
    private int sizeOfDataset;

    public WekaCluster(File arffFile) {
        this.arffFile = arffFile;
        doCluster();
    }

    private void doCluster() {
        loader = new ArffLoader();
        newAttribute = "";
        try {
            loader.setFile(arffFile);
            dataSet = loader.getDataSet();
            cluster = new XMeans();
            cluster.buildClusterer(dataSet);
            numOfClusters = cluster.numberOfClusters();
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < numOfClusters; i++) {
                sb.append("s" + (i + 1) + " ");
            }
            newAttribute = sb.toString().trim();
            sizeOfDataset = dataSet.numInstances();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void newArffWriter() {
        int lineNum = 0;
        try {
            Scanner input = new Scanner(arffFile);
            PrintWriter out = new PrintWriter(CfUtil
                    .GetFileNameNoExtFromFileName(arffFile.getName())
                    + "_classification.arff");

            while (input.hasNext()) {
                String line = input.nextLine();
                if (line.startsWith("@relation")) {
                    out.println("@relation" + line.substring(9)
                            + "_classification");
                } else if (line.startsWith("@data")) {
                    out.println("@attribute shape {" + newAttribute + "}");
                    out.println("@data");
                } else if (line.startsWith("@attribute")) {
                    out.println(line);
                } else if (line.isEmpty()) {
                    out.println();
                } else {
                    line += ",class"
                            + (cluster.clusterInstance(dataSet
                                    .instance(lineNum)) + 1);
                    out.println(line);
                    lineNum++;
                }
            }
            out.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public int clusterNewInstance(weka.core.Instance instance) {
        int indexOfCluster = -1;
        try {
            indexOfCluster = cluster.clusterInstance(instance);
            // System.out.println("cluster " + indexOfCluster);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return indexOfCluster;
    }

    public double[] frequencyOfCluster() {
        int[] sum = new int[this.numOfClusters];
        try {
            for (int i = 0; i < this.sizeOfDataset; i++) {
                sum[cluster.clusterInstance(dataSet.instance(i))]++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        double[] fre = new double[sum.length];
        for (int i = 0; i < sum.length; i++) {
            fre[i] = (double) sum[i] / (double) this.sizeOfDataset;
        }
        return fre;
    }

    public static void main(String[] args) {
        File file = new File("cpu.arff");
        WekaCluster wc = new WekaCluster(file);
        double[] fre = wc.frequencyOfCluster();
        for (int i = 0; i < fre.length; i++)
            System.out.println(fre[i]);
        // wc.newArffWriter(file);
        double[] feature = { 125, 256, 6000, 256, 16, 128, 199 };
        weka.core.Instance ins = new weka.core.Instance(7);
        for (int i = 0; i < ins.numAttributes(); i++) {
            ins.setValue(i, feature[i]);
            // System.out.println(ins.attribute(i).getLowerNumericBound());
        }
        System.out.println("cluster in : " + wc.clusterNewInstance(ins));
    }

}
@relation 'cpu'
@attribute MYCT real
@attribute MMIN real
@attribute MMAX real
@attribute CACH real
@attribute CHMIN real
@attribute CHMAX real
@attribute class real
@data
125,256,6000,256,16,128,199
29,8000,32000,32,8,32,253
29,8000,32000,32,8,32,253
Only three data rows are excerpted here. Running the program produces the following output:
0.03827751196172249
0.16267942583732056
0.69377990430622
0.10526315789473684
cluster in : 0
This shows that the clusterer split the dataset into four clusters and that the instance constructed in the program was assigned to the first cluster (index 0). The fraction of the dataset falling into each cluster is listed above.
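If you would rather have Weka print its own summary of the clustering than compute the ratios by hand as frequencyOfCluster() does, weka.clusterers.ClusterEvaluation can generate one. A minimal sketch of a method that could be added to WekaCluster (the method name clusterSummary is my own; cluster and dataSet are the fields built in doCluster()):

    public String clusterSummary() {
        try {
            weka.clusterers.ClusterEvaluation eval = new weka.clusterers.ClusterEvaluation();
            eval.setClusterer(cluster);        // the XMeans model built in doCluster()
            eval.evaluateClusterer(dataSet);   // assign every training instance to a cluster
            return eval.clusterResultsToString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }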
I will not go into the data mining itself here; the point is simply to show one way of calling Weka from Java, so that it is easier to use Weka inside your own programs.
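One caveat on the API: the listing above was written against the Weka of 2010 (the 3.6 line). In Weka 3.7 and later, weka.core.Instance became an interface, so new weka.core.Instance(7) no longer compiles, and, if I remember correctly, XMeans now ships as an optional package installed through the package manager rather than inside the main weka.jar. On a newer Weka, the end of main() could be adapted roughly like this (a hedged sketch, not part of the original class):

    // Weka 3.7+: Instance is an interface, so construct a DenseInstance instead.
    double[] feature = { 125, 256, 6000, 256, 16, 128, 199 };
    weka.core.Instance ins = new weka.core.DenseInstance(7);   // 7 attribute slots, initially missing
    for (int i = 0; i < ins.numAttributes(); i++) {
        ins.setValue(i, feature[i]);                            // fill in the numeric values
    }
    System.out.println("cluster in : " + wc.clusterNewInstance(ins));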