/* $RCSfile$
* $Author$
* $Date$
* $Revision$
*
* Copyright (C) 1997-2007 The Chemistry Development Kit (CDK) project
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
package org.openscience.cdk.similarity;
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import java.util.BitSet;
/**
 * Calculates the Tanimoto coefficient for a given pair of two
 * fingerprint bitsets or real valued feature vectors.
 *
 * <p>The Tanimoto coefficient is one way to
 * quantitatively measure the "distance" or similarity of
 * two chemical structures.
 *
 * <p>You can use the FingerPrinter class to retrieve two fingerprint bitsets.
 * We assume that you have two structures stored in cdk.Molecule objects.
 * A tanimoto coefficient can then be calculated like:
 * <pre>
 *   BitSet fingerprint1 = Fingerprinter.getFingerprint(molecule1);
 *   BitSet fingerprint2 = Fingerprinter.getFingerprint(molecule2);
 *   float tanimoto_coefficient = Tanimoto.calculate(fingerprint1, fingerprint2);
 * </pre>
 *
 * <p>The FingerPrinter assumes that hydrogens are explicitly given, if this
 * is desired!
 * <p>Note that the continuous Tanimoto coefficient does not lead to a metric space.
 *
 * @author         steinbeck
 * @cdk.githash
 * @cdk.created    2005-10-19
 * @cdk.keyword    jaccard
 * @cdk.keyword    similarity, tanimoto
 * @cdk.module     fingerprint
 */
@TestClass("org.openscience.cdk.similarity.TanimotoTest")
public class Tanimoto
{

    /**
     * Evaluates Tanimoto coefficient for two bit sets.
     *
     * <p>The coefficient is the number of bits set in both bit sets divided by
     * the number of bits set in at least one of them. If neither bit set has
     * any bit set, the division is 0/0 and the result is {@link Float#NaN}.
     *
     * <p>The length check compares {@link BitSet#size()}, i.e. the number of
     * bits of storage currently in use by each bit set, not
     * {@link BitSet#length()}.
     *
     * @param bitset1 A bitset (such as a fingerprint) for the first molecule
     * @param bitset2 A bitset (such as a fingerprint) for the second molecule
     * @return The Tanimoto coefficient
     * @throws org.openscience.cdk.exception.CDKException if bitsets are not of the same length
     */
    @TestMethod("testTanimoto1,testTanimoto2")
    public static float calculate(BitSet bitset1, BitSet bitset2) throws CDKException
    {
        // Reject mismatched inputs before doing any counting work.
        if (bitset1.size() != bitset2.size()) {
            throw new CDKException("Bitsets must have the same bit length");
        }

        // Work on a copy so that the caller's bit set is left untouched.
        BitSet intersection = (BitSet) bitset1.clone();
        intersection.and(bitset2);

        // Keep the counts integral (and in long, so the sum cannot overflow);
        // only the final division is carried out in floating point.
        long commonBitCount = intersection.cardinality();
        long unionBitCount = (long) bitset1.cardinality() + bitset2.cardinality() - commonBitCount;

        // Floating point division on purpose: 0/0 yields NaN instead of throwing.
        return (float) commonBitCount / (float) unionBitCount;
    }

    /**
     * Evaluates the continuous Tanimoto coefficient for two real valued vectors.
     *
     * <p>With {@code ab} the dot product of the two vectors and {@code a2},
     * {@code b2} their squared norms, the result is
     * {@code ab / (a2 + b2 - ab)}. If both vectors contain only zeros the
     * division is 0/0 and the result is {@link Float#NaN}.
     *
     * @param features1 The first feature vector
     * @param features2 The second feature vector
     * @return The continuous Tanimoto coefficient
     * @throws org.openscience.cdk.exception.CDKException if the features are not of the same length
     */
    @TestMethod("testTanimoto3")
    public static float calculate(double[] features1, double[] features2) throws CDKException {
        if (features1.length != features2.length) {
            throw new CDKException("Features vectors must be of the same length");
        }

        double ab = 0.0;
        double a2 = 0.0;
        double b2 = 0.0;
        for (int i = 0; i < features1.length; i++) {
            double x = features1[i];
            double y = features2[i];
            ab += x * y;
            a2 += x * x;
            b2 += y * y;
        }

        // Divide in double precision and narrow once. Narrowing numerator and
        // denominator separately loses precision and turns large (but finite)
        // double sums into Infinity/Infinity = NaN.
        return (float) (ab / (a2 + b2 - ab));
    }
}
通过源码可以看出,calculate(BitSet bitset1, BitSet bitset2)方法是通过比较两个分子的fingerprint的位来计算相似度的:先通过BitSet的and操作得到两者共同为true的位的个数,然后再除以两者中为true的位的总数(即并集的位数,等于两个cardinality之和减去共同的个数),这样就得到相似值。
posted on 2009-10-18 13:36
周锐 阅读(464)
评论(0) 编辑 收藏 所属分类:
Chemistry 、
Java 、
CDK