COMBINATORIAL_BLAS 1.6
 
Loading...
Searching...
No Matches
Fscore.cpp
Go to the documentation of this file.
1#include <iostream>
2#include <fstream>
3#include <sstream>
4#include <vector>
5#include <numeric>
6#include <algorithm>
7#include <string.h>
8#include <assert.h>
9#include <omp.h>
10using namespace std;
11
12template <typename T>
13vector<size_t> sortIndices(const vector<T> &v) {
14
15 // initialize original index locations
16 vector<size_t> idx(v.size());
17 iota(idx.begin(), idx.end(), 0);
18
19 // sort indexes based on comparing values in v
20 sort(idx.begin(), idx.end(),
21 [&v](size_t i1, size_t i2) {return v[i1] > v[i2];});
22
23 return idx;
24}
25
26
27
28double Fscore(string mclFile, string hipmclFile, int base)
29{
30 ifstream mclStream (mclFile);
31 ifstream hipmclStream (hipmclFile);
32 string line;
33 int64_t item, clustID = 0;
34 int64_t numMCLClusters, numHipMCLClusters;
35 int64_t nproteins = 0;
36 // item 0-based
37
38 vector<vector<int64_t>> mclClusters;
39
40 if (mclStream.is_open())
41 {
42 while ( getline (mclStream,line) )
43 {
44 istringstream iss(line);
45 if(clustID >= mclClusters.size())
46 {
47 mclClusters.resize(clustID * 2 + 1);
48 }
49 while ( iss >> item)
50 {
51 if(base==1) item --;
52 mclClusters[clustID].push_back(item);
53 }
54
55 nproteins += mclClusters[clustID].size();
56 clustID++;
57 }
58 mclStream.close();
59 numMCLClusters = clustID;
60 }
61 else
62 {
63 cout << "Unable to open " << mclFile << endl;
64 return -1;
65 }
66
67 cout << "Number of clusters from MCL: " << numMCLClusters << endl;
68 vector<int64_t> clust2(nproteins, -1);
69 clustID = 0;
70 if (hipmclStream.is_open())
71 {
72 while ( getline (hipmclStream,line) )
73 {
74 istringstream iss(line);
75 while ( iss >> item)
76 {
77 if(base==1) item --;
78 clust2[item] = clustID;
79 if(item >= nproteins)
80 {
81 cout << "The number of vertices in MCL and HipMCL outputs does not match. \n Please check if there are isolated vertices in HipMCL.\n Exiting.............." << endl;
82 exit(1);
83 }
84 }
85 clustID++;
86 }
87 hipmclStream.close();
88 numHipMCLClusters = clustID;
89 }
90 else
91 {
92 cout << "Unable to open " << hipmclFile << endl;
93 return -1;
94 }
95
96 // this will account for isolated vertices. These vertices are assigned to their own clusters
97 for(int i=0; i<nproteins; i++)
98 {
99 if(clust2[i]==-1)
100 clust2[i]=numHipMCLClusters++;
101 }
102 cout << "Number of clusters from HipMCL: " << numHipMCLClusters << endl;
103
104 vector<int64_t> clusterSizes2(numHipMCLClusters,0);
105 for(int i=0; i<nproteins; i++)
106 {
107 clusterSizes2[clust2[i]]++;
108 }
109
110
111 vector<double> F(numMCLClusters);
112 vector<int64_t> nisect(numHipMCLClusters); // number of items in the intersection
113 vector<int64_t> isect(numHipMCLClusters); // clusters with nonzero intersection with the current cluster
114 int mismatch = 0;
115
116 for(int i=0; i<numMCLClusters; i++)
117 {
118 int64_t isectCount = 0;
119 fill(nisect.begin(), nisect.end(), 0);
120 for(int j=0; j<mclClusters[i].size(); j++)
121 {
122 int64_t item1 = mclClusters[i][j];
123 int64_t c2 = clust2[item1];
124 if(nisect[c2]==0) isect[isectCount++] = c2;
125 nisect[c2] ++;
126 }
127 auto maxoverlap = max_element(nisect.begin(), nisect.end());
128 int64_t c2_max = distance(nisect.begin(), maxoverlap);
129 if(*maxoverlap!=mclClusters[i].size() || *maxoverlap!=clusterSizes2[c2_max])
130 {
131 cout << "Mismatch# " << mismatch++ << ":: MCL Cluster: "<< i << " Size: " << mclClusters[i].size() << " HipMCL Cluster: "<< c2_max <<" Size: " << clusterSizes2[c2_max] << " last vertex in MCL output: " << mclClusters[i].back()<< endl;
132
133 }
134 double Fi = 0;
135 for(int j=0; j<isectCount; j++)
136 {
137 int64_t c2 = isect[j];
138 double precision = (double) nisect[c2] / clusterSizes2[c2];
139 double recall = (double) nisect[c2] / mclClusters[i].size();
140 double Fij = 2 * precision * recall / (precision + recall);
141 Fi = max(Fi, Fij);
142 }
143
144 Fi = Fi * mclClusters[i].size()/ nproteins;
145 F[i] = Fi;
146 }
147 return accumulate(F.begin(), F.end(), 0.0);
148}
149
150int main(int argc, char* argv[])
151{
152
153 string ifilename1 = "";
154 string ifilename2 = "";
155 int base = 0;
156
157
158 if(argc != 7)
159 {
160 cout << "Usage: ./fscore -M1 <MCLOut> -M2 <HipMCLOut> -base <Base of vertices (same as HipMCL) 1 or 0>\n";
161 cout << "Example: ./fscore -M1 input1.txt -M2 input2.txt -base 0 " << endl;
162 return -1;
163 }
164
165 for (int i = 1; i < argc; i++)
166 {
167 if (strcmp(argv[i],"-M1")==0)
168 {
169 ifilename1 = string(argv[i+1]);
170 printf("\nMCL output file: %s",ifilename1.c_str());
171 }
172 else if (strcmp(argv[i],"-M2")==0)
173 {
174 ifilename2 = string(argv[i+1]);
175 printf("\nHipMCL output file: %s",ifilename2.c_str());
176 }
177 else if (strcmp(argv[i],"-base")==0)
178 {
179 base = atoi(argv[i+1]);
180 printf("\nbase: %d",base);
181 }
182 }
183 printf("\n");
184 double F = Fscore(ifilename1, ifilename2, base);
185 cout << "F score: " << F << endl;
186 return 0;
187}
int main()
Definition Driver.cpp:12
double Fscore(string mclFile, string hipmclFile, int base)
Definition Fscore.cpp:28
vector< size_t > sortIndices(const vector< T > &v)
Definition Fscore.cpp:13
int size
Definition common.h:20
void iota(_ForwardIter __first, _ForwardIter __last, T __value)