Jumamosi, 30 Novemba 2013

Cosine Similarity Using C++

/*C++ program to show cosine similarities*/
#include<iostream>//include the library of input output stream
#include<fstream>//Header file for reading the word in the file
#include<vector>//Header file for storing the word in the documents
#include<map>//for relating the terms,frequencies,TF&TF_IDF and cosine smilarities
#include<cmath>//for  doing mathematical calculation
#include<algorithm>//for sorting
using namespace std;// makes every std:: name usable without qualification in this file

// Commented-out prototype of a helper meant to lowercase a term; it is not declared or used in the code below.
//string Lower(string& lowercase_term);
// Forward declaration so main() can call tf_idf_compute(), which is defined after main().
// The function fills its three reference parameters: `frequency` (occurrence count per term),
// `documents` (one vector of words per document read) and `terms` (the distinct terms).
void tf_idf_compute(map<string,int> &frequency,vector<vector<string> > &documents, vector<string> &terms);
int main()//where a c++ program starts/execution starts.
{//openining curled brac i.e main()opens
cout<<"Terms\t\t""Term frequency\t\t\t""TF_IDF\t\t""CosineSimilarity";
    cout<<"\n";
    vector<vector<string> > dc;// dictionary
    vector<string> tokens;//container for documents which represent refered vector  in prototype
    map<string,int>S;//A vector "s"that stores int type values.

    tf_idf_compute(S,dc,tokens);//call function for tf_idf
}
// Reads whitespace-separated tokens from `file`. A standalone "#" token ends
// the current document; a trailing document without "#" is still kept.
// Every non-"#" token is appended to the current document and to `terms`,
// and its counter in `frequency` is incremented.
static void read_documents(std::istream &file,
                           std::map<std::string, int> &frequency,
                           std::vector<std::vector<std::string> > &documents,
                           std::vector<std::string> &terms)
{
    std::vector<std::string> words; // words of the document currently being read
    std::string token;
    // Looping on the extraction itself (instead of on eof()) ends cleanly on
    // any stream failure, not only on end-of-file.
    while (file >> token)
    {
        if (token == "#")
        {
            if (!words.empty())
            {
                documents.push_back(words);
            }
            words.clear();
            continue;
        }
        words.push_back(token);
        terms.push_back(token);
        ++frequency[token];
    }
    if (!words.empty())
    {
        documents.push_back(words);
    }
}

// Returns, for every distinct word, the number of documents that contain it.
static std::map<std::string, int> count_document_frequency(
    const std::vector<std::vector<std::string> > &documents)
{
    std::map<std::string, int> document_frequency;
    for (std::vector<std::vector<std::string> >::const_iterator doc = documents.begin();
         doc != documents.end(); ++doc)
    {
        // Work on a de-duplicated copy so a word repeated inside one document
        // is counted only once for that document.
        std::vector<std::string> distinct(*doc);
        std::sort(distinct.begin(), distinct.end());
        distinct.erase(std::unique(distinct.begin(), distinct.end()), distinct.end());
        for (std::vector<std::string>::const_iterator word = distinct.begin();
             word != distinct.end(); ++word)
        {
            ++document_frequency[*word];
        }
    }
    return document_frequency;
}

// Reads the documents stored in "regs.txt" and prints one report row per term.
//
// Parameters (all filled / extended in place):
//   frequency - term -> total number of occurrences over everything read
//   documents - one vector of words per document (documents end at a "#" token)
//   terms     - the vocabulary; left sorted and free of duplicates
//
// Output: for every entry of `frequency` a row "term  count  tf_idf  cosine"
// is written to std::cout, where
//   tf_idf = (1 + log10(count)) * log10(N / df)
// with N = documents.size() and df = number of documents containing the term.
// If the file cannot be opened, "file not found" is printed and nothing else
// happens. After a successful run the function waits for one character on
// std::cin so a console window stays open.
void tf_idf_compute(std::map<std::string,int> &frequency,std::vector<std::vector<std::string> > &documents,std::vector<std::string> &terms)
{
    // Input only: an ifstream also works when the file is read-only, whereas
    // a plain fstream requests read/write access.
    std::ifstream file("regs.txt");
    if (!file)
    {
        std::cout << "file not found" << std::endl;
        return;
    }

    read_documents(file, frequency, documents, terms);

    // Build the vocabulary once after reading instead of re-sorting the whole
    // vector after every single token.
    std::sort(terms.begin(), terms.end());
    terms.erase(std::unique(terms.begin(), terms.end()), terms.end());

    const std::map<std::string, int> document_frequency = count_document_frequency(documents);
    const double document_count = static_cast<double>(documents.size());

    for (std::map<std::string, int>::const_iterator entry = frequency.begin();
         entry != frequency.end(); ++entry)
    {
        const int count = entry->second;
        const std::map<std::string, int>::const_iterator df =
            document_frequency.find(entry->first);

        // Each term uses its own document frequency, and the ratio N/df is
        // taken in floating point (an integer 3/2 would collapse to 1 and give
        // an IDF of 0). Terms that occur in no document, or whose count is not
        // positive, have no defined logarithm and are reported with weight 0.
        double tf_idf = 0.0;
        if (count > 0 && df != document_frequency.end())
        {
            tf_idf = (1.0 + std::log10(static_cast<double>(count)))
                   * std::log10(document_count / static_cast<double>(df->second));
        }

        // NOTE(review): this is the cosine between the two one-dimensional
        // values (tf_idf) and (count), so it can only be +1 or -1; a
        // document-to-document cosine similarity would need full term vectors.
        // A zero denominator is reported as 0 instead of printing NaN.
        const double denominator = std::fabs(tf_idf) * std::fabs(static_cast<double>(count));
        const double cosine = denominator > 0.0 ? (tf_idf * count) / denominator : 0.0;

        std::cout << entry->first << "                  " << count
                  << "                       " << tf_idf
                  << "              " << cosine << '\n';
    }

    std::cin.get(); // holds the screen until a key is pressed
}


====================================================================
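Your input .txt file should look like the example below (mine was named "regs.txt").
Words are separated by whitespace, and a standalone "#" marks the end of each document: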

Information retrieval #
Information retrieval it is a discipline #
organization and storage should provide easy access #

Hakuna maoni:

Chapisha Maoni