public static class TFIDF
{
    /// <summary>
    /// Document vocabulary, containing each word's IDF value.
    /// Populated lazily by <see cref="Transform"/> or restored via <see cref="Load"/>.
    /// </summary>
    private static Dictionary<string, double> _vocabularyIDF = new Dictionary<string, double>();

    /// <summary>
    /// Transforms a list of documents into their associated TF*IDF values.
    /// If a vocabulary does not yet exist, one will be created, based upon the documents' words.
    /// </summary>
    /// <param name="documents">Documents to transform.</param>
    /// <param name="vocabularyThreshold">Minimum number of occurrences of a term across all documents for it to enter the vocabulary.</param>
    /// <returns>One TF*IDF vector (aligned with the vocabulary) per input document.</returns>
    public static double[][] Transform(string[] documents, int vocabularyThreshold = 3)
    {
        List<List<string>> stemmedDocs;

        // Get the vocabulary and stem the documents at the same time.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, vocabularyThreshold);

        if (_vocabularyIDF.Count == 0)
        {
            // Calculate the IDF for each vocabulary term.
            // IDF = log(N / (1 + docsContainingTerm)); the +1 avoids division by zero.
            foreach (var term in vocabulary)
            {
                double numberOfDocsContainingTerm = stemmedDocs.Count(d => d.Contains(term));
                _vocabularyIDF[term] = Math.Log(stemmedDocs.Count / (1.0 + numberOfDocsContainingTerm));
            }
        }

        // Transform each document into a vector of tfidf values.
        return TransformToTFIDFVectors(stemmedDocs, _vocabularyIDF);
    }

    /// <summary>
    /// Preprocessing: converts each document into a binary presence vector over the
    /// shared vocabulary (1 = the document contains the term, 0 = it does not).
    /// </summary>
    /// <param name="documents">Documents to vectorize.</param>
    /// <returns>One presence vector per document, aligned with the vocabulary.</returns>
    public static List<List<double>> Preprocessing(string[] documents)
    {
        List<List<string>> stemmedDocs;
        var result = new List<List<double>>();

        // Threshold 0 keeps every stemmed term in the vocabulary.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, 0);

        foreach (List<string> doc in stemmedDocs)
        {
            var vector = new List<double>();
            foreach (var term in vocabulary)
            {
                vector.Add(doc.Contains(term) ? 1 : 0);
            }
            result.Add(vector);
        }

        return result;
    }

    /// <summary>
    /// Compares two strings and returns a similarity score in [0, 1] based on the
    /// angle between their term-presence vectors (1 = identical term sets,
    /// 0 = no shared terms).
    /// </summary>
    /// <param name="str1">First string.</param>
    /// <param name="str2">Second string.</param>
    /// <returns>Similarity score in [0, 1].</returns>
    public static double StringCompare(string str1, string str2)
    {
        // Extract binary feature vectors for both strings.
        List<List<double>> inputs = TFIDF.Preprocessing(new string[] { str1.ToLower(), str2.ToLower() });

        // Angle between the two vectors, in radians.
        double radians = Radian(inputs[0].ToArray(), inputs[1].ToArray());

        // Convert to degrees and map [0 deg, 90 deg] linearly onto [1, 0].
        double degrees = (180 / Math.PI) * radians;
        return (90 - degrees) / 90.0;
    }

    /// <summary>
    /// Returns an L2-normalized (unit-length) copy of a vector.
    /// An all-zero vector is returned as all zeros instead of producing NaN.
    /// </summary>
    /// <param name="d">Vector to normalize.</param>
    /// <returns>Unit-length copy of <paramref name="d"/>.</returns>
    private static double[] ToUnitVector(double[] d)
    {
        double sumOfSquares = 0;
        foreach (double item in d)
        {
            sumOfSquares += item * item;
        }
        double length = Math.Sqrt(sumOfSquares);

        var normalized = new List<double>();
        foreach (double item in d)
        {
            normalized.Add(length != 0.0 ? item / length : 0);
        }
        return normalized.ToArray();
    }

    /// <summary>
    /// Returns the angle (in radians) between two vectors of equal length.
    /// </summary>
    /// <param name="d1">First vector.</param>
    /// <param name="d2">Second vector.</param>
    /// <returns>Angle in radians, in [0, pi].</returns>
    private static double Radian(double[] d1, double[] d2)
    {
        // Normalize both vectors so their dot product equals cos(angle).
        double[] nd1 = ToUnitVector(d1);
        double[] nd2 = ToUnitVector(d2);

        // Dot product.
        double sum = 0;
        for (int i = 0; i < d1.Length; i++)
        {
            sum += nd1[i] * nd2[i];
        }

        // Round to guard against floating-point drift pushing the value just
        // outside [-1, 1], which would make Acos return NaN.
        sum = Math.Round(sum, 4);
        return Math.Acos(sum);
    }

    /// <summary>
    /// Converts a list of stemmed documents (lists of stemmed words) and their
    /// associated vocabulary + idf values, into an array of TF*IDF values.
    /// </summary>
    /// <param name="stemmedDocs">List of List of string</param>
    /// <param name="vocabularyIDF">Dictionary of string, double (term, IDF)</param>
    /// <returns>double[][]</returns>
    private static double[][] TransformToTFIDFVectors(List<List<string>> stemmedDocs, Dictionary<string, double> vocabularyIDF)
    {
        // Transform each document into a vector of tfidf values.
        var vectors = new List<List<double>>();
        foreach (var doc in stemmedDocs)
        {
            var vector = new List<double>();
            foreach (var vocab in vocabularyIDF)
            {
                // Term frequency = count how many times the term appears in this document.
                double tf = doc.Count(d => d == vocab.Key);
                vector.Add(tf * vocab.Value);
            }
            vectors.Add(vector);
        }

        return vectors.Select(v => v.ToArray()).ToArray();
    }

    /// <summary>
    /// Normalizes a TF*IDF array of vectors using L2-Norm.
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vectors">double[][]</param>
    /// <returns>double[][]</returns>
    public static double[][] Normalize(double[][] vectors)
    {
        // Normalize the vectors using L2-Norm.
        var normalizedVectors = new List<double[]>();
        foreach (var vector in vectors)
        {
            normalizedVectors.Add(Normalize(vector));
        }

        return normalizedVectors.ToArray();
    }

    /// <summary>
    /// Normalizes a TF*IDF vector using L2-Norm.
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vector">Vector to normalize.</param>
    /// <returns>Unit-length copy; an all-zero vector yields all zeros (not NaN).</returns>
    public static double[] Normalize(double[] vector)
    {
        double sumSquared = 0;
        foreach (var value in vector)
        {
            sumSquared += value * value;
        }
        double norm = Math.Sqrt(sumSquared);

        var result = new List<double>();
        foreach (var value in vector)
        {
            // L2-norm: Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2).
            // Guard the all-zero vector so we return zeros instead of NaN.
            result.Add(norm != 0.0 ? value / norm : 0);
        }

        return result.ToArray();
    }

    /// <summary>
    /// Saves the TFIDF vocabulary to disk.
    /// </summary>
    /// <param name="filePath">File path</param>
    public static void Save(string filePath = "vocabulary.dat")
    {
        // NOTE(review): BinaryFormatter is insecure on untrusted data and is
        // removed in .NET 9; migrate to System.Text.Json. Kept here so existing
        // vocabulary.dat files remain readable.
        using (FileStream fs = new FileStream(filePath, FileMode.Create))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            formatter.Serialize(fs, _vocabularyIDF);
        }
    }

    /// <summary>
    /// Loads the TFIDF vocabulary from disk.
    /// </summary>
    /// <param name="filePath">File path</param>
    public static void Load(string filePath = "vocabulary.dat")
    {
        // NOTE(review): deserializing with BinaryFormatter from an untrusted
        // file is a known RCE vector — only load files this process wrote.
        using (FileStream fs = new FileStream(filePath, FileMode.Open))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            _vocabularyIDF = (Dictionary<string, double>)formatter.Deserialize(fs);
        }
    }

    #region Private Helpers

    /// <summary>
    /// Parses and tokenizes a list of documents, returning a vocabulary of words.
    /// </summary>
    /// <param name="docs">string[]</param>
    /// <param name="stemmedDocs">List of List of string</param>
    /// <param name="vocabularyThreshold">Minimum total occurrences for a term to be kept.</param>
    /// <returns>Vocabulary (list of strings)</returns>
    private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
    {
        var vocabulary = new List<string>();
        var wordCountList = new Dictionary<string, int>();
        stemmedDocs = new List<List<string>>();

        int docIndex = 0;
        foreach (var doc in docs)
        {
            var stemmedDoc = new List<string>();

            docIndex++;
            if (docIndex % 100 == 0)
            {
                Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
            }

            string[] parts = Tokenize(doc);
            foreach (string part in parts)
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                if (!StopWords.stopWordsList.Contains(stripped.ToLower()))
                {
                    try
                    {
                        var english = new EnglishWord(stripped);
                        string stem = english.Stem;

                        if (stem.Length > 0)
                        {
                            // Build the word count list.
                            // FIX: the first occurrence used to be inserted with
                            // count 0, undercounting every term by one and making
                            // the threshold filter off-by-one.
                            if (wordCountList.ContainsKey(stem))
                            {
                                wordCountList[stem]++;
                            }
                            else
                            {
                                wordCountList.Add(stem, 1);
                            }

                            stemmedDoc.Add(stem);
                        }
                    }
                    catch
                    {
                        // Best-effort: tokens the stemmer cannot handle are skipped.
                    }
                }
            }

            stemmedDocs.Add(stemmedDoc);
        }

        // Keep only terms occurring at least vocabularyThreshold times overall.
        foreach (var item in wordCountList.Where(w => w.Value >= vocabularyThreshold))
        {
            vocabulary.Add(item.Key);
        }

        return vocabulary;
    }

    /// <summary>
    /// Tokenizes a string, returning its list of words.
    /// </summary>
    /// <param name="text">string</param>
    /// <returns>string[]</returns>
    private static string[] Tokenize(string text)
    {
        // Strip all HTML.
        text = Regex.Replace(text, "<[^<>]+>", "");
        // Strip numbers.
        text = Regex.Replace(text, "[0-9]+", "number");
        // Strip urls.
        text = Regex.Replace(text, @"(http|https)://[^\s]*", "httpaddr");
        // Strip email addresses.
        text = Regex.Replace(text, @"[^\s]+@[^\s]+", "emailaddr");
        // Strip dollar sign.
        text = Regex.Replace(text, "[$]+", "dollar");
        // Strip usernames.
        text = Regex.Replace(text, @"@[^\s]+", "username");

        // Tokenize and also get rid of any punctuation.
        return text.Split(" @$/#.-:&*+=[]?!(){},''\">_<;%\\".ToCharArray());
    }

    #endregion
}
Usage example:
double aa=TFIDF.StringCompare("Universal 360 Rotating Vehicle Car Mount Holder For iPhone 4S 4G/iPad1 2 3[Tablet PC Car Headrest Ho", "new Auto GPS Mount Stand Holder for Samsung Galaxy Tab 8.9");
Console.WriteLine(aa);