String similarity comparison — builds TF-IDF / term-presence feature vectors from two strings and scores their similarity by the cosine angle between the vectors.

using EnglishStemmer;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;

public class StringCompare
{
    /// <summary>
    /// Document vocabulary: maps each stemmed term to its IDF (inverse document
    /// frequency) value. Populated lazily by Transform, or restored by Load.
    /// </summary>
    private Dictionary<string, double> _vocabularyIDF = new Dictionary<string, double>();


    public string[] stopWordsList = new string[]
        {
            "a",
            "about",
            "above",
            "across",
            "afore",
            "aforesaid",
            "after",
            "again",
            "against",
            "agin",
            "ago",
            "aint",
            "albeit",
            "all",
            "almost",
            "alone",
            "along",
            "alongside",
            "already",
            "also",
            "although",
            "always",
            "am",
            "american",
            "amid",
            "amidst",
            "among",
            "amongst",
            "an",
            "and",
            "anent",
            "another",
            "any",
            "anybody",
            "anyone",
            "anything",
            "are",
            "aren't",
            "around",
            "as",
            "aslant",
            "astride",
            "at",
            "athwart",
            "away",
            "b",
            "back",
            "bar",
            "barring",
            "be",
            "because",
            "been",
            "before",
            "behind",
            "being",
            "below",
            "beneath",
            "beside",
            "besides",
            "best",
            "better",
            "between",
            "betwixt",
            "beyond",
            "both",
            "but",
            "by",
            "c",
            "can",
            "cannot",
            "can't",
            "certain",
            "circa",
            "close",
            "concerning",
            "considering",
            "cos",
            "could",
            "couldn't",
            "couldst",
            "d",
            "dare",
            "dared",
            "daren't",
            "dares",
            "daring",
            "despite",
            "did",
            "didn't",
            "different",
            "directly",
            "do",
            "does",
            "doesn't",
            "doing",
            "done",
            "don't",
            "dost",
            "doth",
            "down",
            "during",
            "durst",
            "e",
            "each",
            "early",
            "either",
            "em",
            "english",
            "enough",
            "ere",
            "even",
            "ever",
            "every",
            "everybody",
            "everyone",
            "everything",
            "except",
            "excepting",
            "f",
            "failing",
            "far",
            "few",
            "first",
            "five",
            "following",
            "for",
            "four",
            "from",
            "g",
            "gonna",
            "gotta",
            "h",
            "had",
            "hadn't",
            "hard",
            "has",
            "hasn't",
            "hast",
            "hath",
            "have",
            "haven't",
            "having",
            "he",
            "he'd",
            "he'll",
            "her",
            "here",
            "here's",
            "hers",
            "herself",
            "he's",
            "high",
            "him",
            "himself",
            "his",
            "home",
            "how",
            "howbeit",
            "however",
            "how's",
            "i",
            "id",
            "if",
            "ill",
            "i'm",
            "immediately",
            "important",
            "in",
            "inside",
            "instantly",
            "into",
            "is",
            "isn't",
            "it",
            "it'll",
            "it's",
            "its",
            "itself",
            "i've",
            "j",
            "just",
            "k",
            "l",
            "large",
            "last",
            "later",
            "least",
            "left",
            "less",
            "lest",
            "let's",
            "like",
            "likewise",
            "little",
            "living",
            "long",
            "m",
            "many",
            "may",
            "mayn't",
            "me",
            "mid",
            "midst",
            "might",
            "mightn't",
            "mine",
            "minus",
            "more",
            "most",
            "much",
            "must",
            "mustn't",
            "my",
            "myself",
            "n",
            "near",
            "'neath",
            "need",
            "needed",
            "needing",
            "needn't",
            "needs",
            "neither",
            "never",
            "nevertheless",
            "new",
            "next",
            "nigh",
            "nigher",
            "nighest",
            "nisi",
            "no",
            "no-one",
            "nobody",
            "none",
            "nor",
            "not",
            "nothing",
            "notwithstanding",
            "now",
            "o",
            "o'er",
            "of",
            "off",
            "often",
            "on",
            "once",
            "one",
            "oneself",
            "only",
            "onto",
            "open",
            "or",
            "other",
            "otherwise",
            "ought",
            "oughtn't",
            "our",
            "ours",
            "ourselves",
            "out",
            "outside",
            "over",
            "own",
            "p",
            "past",
            "pending",
            "per",
            "perhaps",
            "plus",
            "possible",
            "present",
            "probably",
            "provided",
            "providing",
            "public",
            "q",
            "qua",
            "quite",
            "r",
            "rather",
            "re",
            "real",
            "really",
            "respecting",
            "right",
            "round",
            "s",
            "same",
            "sans",
            "save",
            "saving",
            "second",
            "several",
            "shall",
            "shalt",
            "shan't",
            "she",
            "shed",
            "shell",
            "she's",
            "short",
            "should",
            "shouldn't",
            "since",
            "six",
            "small",
            "so",
            "some",
            "somebody",
            "someone",
            "something",
            "sometimes",
            "soon",
            "special",
            "still",
            "such",
            "summat",
            "supposing",
            "sure",
            "t",
            "than",
            "that",
            "that'd",
            "that'll",
            "that's",
            "the",
            "thee",
            "their",
            "theirs",
            "their's",
            "them",
            "themselves",
            "then",
            "there",
            "there's",
            "these",
            "they",
            "they'd",
            "they'll",
            "they're",
            "they've",
            "thine",
            "this",
            "tho",
            "those",
            "thou",
            "though",
            "three",
            "thro'",
            "through",
            "throughout",
            "thru",
            "thyself",
            "till",
            "to",
            "today",
            "together",
            "too",
            "touching",
            "toward",
            "towards",
            "true",
            "'twas",
            "'tween",
            "'twere",
            "'twill",
            "'twixt",
            "two",
            "'twould",
            "u",
            "under",
            "underneath",
            "unless",
            "unlike",
            "until",
            "unto",
            "up",
            "upon",
            "us",
            "used",
            "usually",
            "v",
            "versus",
            "very",
            "via",
            "vice",
            "vis-a-vis",
            "w",
            "wanna",
            "wanting",
            "was",
            "wasn't",
            "way",
            "we",
            "we'd",
            "well",
            "were",
            "weren't",
            "wert",
            "we've",
            "what",
            "whatever",
            "what'll",
            "what's",
            "when",
            "whencesoever",
            "whenever",
            "when's",
            "whereas",
            "where's",
            "whether",
            "which",
            "whichever",
            "whichsoever",
            "while",
            "whilst",
            "who",
            "who'd",
            "whoever",
            "whole",
            "who'll",
            "whom",
            "whore",
            "who's",
            "whose",
            "whoso",
            "whosoever",
            "will",
            "with",
            "within",
            "without",
            "wont",
            "would",
            "wouldn't",
            "wouldst",
            "x",
            "y",
            "ye",
            "yet",
            "you",
            "you'd",
            "you'll",
            "your",
            "you're",
            "yours",
            "yourself",
            "yourselves",
            "you've",
            "z",
        };

    /// <summary>
    /// Transforms a list of documents into their associated TF*IDF vectors.
    /// If a vocabulary does not yet exist (and none was restored via Load),
    /// one is created from the documents' stemmed words and its IDF values cached.
    /// </summary>
    /// <param name="documents">Raw document texts.</param>
    /// <param name="vocabularyThreshold">Minimum number of occurrences of a term within all documents for it to enter the vocabulary.</param>
    /// <returns>One TF*IDF vector per document, aligned with the vocabulary.</returns>
    public double[][] Transform(string[] documents, int vocabularyThreshold = 3)
    {
        // Get the vocabulary and stem the documents in a single pass.
        List<List<string>> stemmedDocs;
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, vocabularyThreshold);

        // Compute IDF only once; a vocabulary restored via Load() is reused as-is.
        if (_vocabularyIDF.Count == 0)
        {
            foreach (var term in vocabulary)
            {
                // +1 in the denominator smooths the ratio and avoids log(N/0)
                // for terms that appear in no stemmed document.
                double numberOfDocsContainingTerm = stemmedDocs.Count(d => d.Contains(term));
                _vocabularyIDF[term] = Math.Log(stemmedDocs.Count / (1.0 + numberOfDocsContainingTerm));
            }
        }

        // Transform each document into a vector of tfidf values.
        return TransformToTFIDFVectors(stemmedDocs, _vocabularyIDF);
    }

    /// <summary>
    /// Builds a binary presence/absence feature vector for each document:
    /// one entry per vocabulary term, 1 when the term occurs in the document, else 0.
    /// </summary>
    /// <param name="documents">Raw document texts.</param>
    /// <returns>One feature vector (list of 0/1 values) per document, aligned with the vocabulary.</returns>
    public List<List<double>> Preprocessing(string[] documents)
    {
        List<List<string>> stemmedDocs;
        // Threshold 0 keeps every stemmed term in the vocabulary.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, 0);

        var features = new List<List<double>>();
        foreach (List<string> doc in stemmedDocs)
        {
            // 1.0 where the term is present in this document, 0.0 otherwise.
            features.Add(vocabulary.Select(term => doc.Contains(term) ? 1.0 : 0.0).ToList());
        }
        return features;
    }

    /// <summary>
    /// Compares the similarity of two strings via the angle between their
    /// binary term-presence vectors (see Preprocessing): returns a score in
    /// [0, 1], where 1 means identical term sets and 0 means orthogonal ones.
    /// NOTE(review): in C# a member cannot share the name of its enclosing type
    /// (compile error CS0542) — either this method or the class was named
    /// differently in the original project; verify before reuse.
    /// </summary>
    /// <param name="str1">First string.</param>
    /// <param name="str2">Second string.</param>
    /// <returns>Similarity score in [0, 1].</returns>
    public double StringCompare(string str1, string str2)
    {
        // Extract feature vectors (lower-cased so the comparison is case-insensitive).
        List<List<double>> inputs = Preprocessing(new string[] { str1.ToLower(), str2.ToLower() });
        // Angle (radians) between the two vectors.
        double RadianVal = Radian(inputs[0].ToArray(), inputs[1].ToArray());
        // Convert radians to degrees, then map [90°..0°] onto [0..1].
        double Degree = (180 / Math.PI) * RadianVal;
        return (90 - Degree) / 90.0;
    }


    /// <summary>
    /// Normalizes a vector to unit length (L2 norm).
    /// A zero-magnitude input yields an all-zero vector of the same length.
    /// </summary>
    /// <param name="d">Vector to normalize.</param>
    /// <returns>Unit-length copy of the vector (or all zeros for zero input).</returns>
    private double[] guiyi(double[] d)
    {
        double length = Math.Sqrt(d.Sum(x => x * x));

        // Hoisted out of the per-element loop: the zero check is loop-invariant.
        if (length == 0.0)
        {
            return new double[d.Length];
        }

        return d.Select(x => x / length).ToArray();
    }

    /// <summary>
    /// Computes the angle, in radians, between two vectors of equal length.
    /// Both vectors are L2-normalized first, so their dot product is cos(angle).
    /// </summary>
    /// <param name="d1">First vector.</param>
    /// <param name="d2">Second vector; must have the same length as <paramref name="d1"/>.</param>
    /// <returns>Angle in radians, in [0, PI].</returns>
    private double Radian(double[] d1, double[] d2)
    {
        // Normalize to unit length so the dot product equals the cosine of the angle.
        double[] nd1 = guiyi(d1);
        double[] nd2 = guiyi(d2);

        // Dot product of the unit vectors.
        double sum = 0;
        for (int i = 0; i < d1.Length; i++)
        {
            sum += nd1[i] * nd2[i];
        }

        sum = Math.Round(sum, 4);
        // Clamp to Acos's domain: floating-point error can leave the rounded dot
        // product marginally outside [-1, 1], which would make Acos return NaN.
        sum = Math.Max(-1.0, Math.Min(1.0, sum));
        return Math.Acos(sum);
    }

    /// <summary>
    /// Converts stemmed documents (lists of stemmed words) plus a vocabulary/IDF map
    /// into TF*IDF vectors: one entry per vocabulary term, equal to
    /// (occurrences of term in document) * (term IDF). Entries follow the
    /// enumeration order of <paramref name="vocabularyIDF"/>.
    /// </summary>
    /// <param name="stemmedDocs">List of documents, each a list of stemmed words.</param>
    /// <param name="vocabularyIDF">Map of term to IDF value.</param>
    /// <returns>One TF*IDF vector per document.</returns>
    private double[][] TransformToTFIDFVectors(List<List<string>> stemmedDocs, Dictionary<string, double> vocabularyIDF)
    {
        List<List<double>> vectors = new List<List<double>>();
        foreach (var doc in stemmedDocs)
        {
            List<double> vector = new List<double>();

            foreach (var vocab in vocabularyIDF)
            {
                // Term frequency: how many times the term appears in this document.
                double tf = doc.Count(d => d == vocab.Key);
                vector.Add(tf * vocab.Value);
            }

            vectors.Add(vector);
        }

        return vectors.Select(v => v.ToArray()).ToArray();
    }

    /// <summary>
    /// Normalizes a set of TF*IDF vectors using the L2-Norm, applied per vector:
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vectors">Vectors to normalize.</param>
    /// <returns>Normalized vectors, in the same order as the input.</returns>
    public double[][] Normalize(double[][] vectors)
    {
        // Delegate the per-vector work to the single-vector overload.
        return vectors.Select(vector => Normalize(vector)).ToArray();
    }

    /// <summary>
    /// Normalizes a TF*IDF vector using the L2-Norm:
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// A zero-magnitude vector is returned as all zeros instead of dividing by
    /// zero (which previously produced NaN entries), consistent with guiyi.
    /// </summary>
    /// <param name="vector">Vector to normalize.</param>
    /// <returns>Unit-length vector (or all zeros for zero input).</returns>
    public double[] Normalize(double[] vector)
    {
        double sqrtSumSquared = Math.Sqrt(vector.Sum(value => value * value));

        // Guard the zero vector: the original divided 0/0, yielding NaN.
        if (sqrtSumSquared == 0.0)
        {
            return new double[vector.Length];
        }

        // L2-norm: Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
        return vector.Select(value => value / sqrtSumSquared).ToArray();
    }

    /// <summary>
    /// Saves the TFIDF vocabulary (term -> IDF map) to disk, overwriting any existing file.
    /// SECURITY NOTE(review): BinaryFormatter is insecure and removed in .NET 9+;
    /// deserializing attacker-controlled data with it enables code execution.
    /// Migrate Save and Load together (e.g. to System.Text.Json) — kept as-is here
    /// so existing vocabulary files written by this class remain loadable.
    /// </summary>
    /// <param name="filePath">File path to write to.</param>
    public void Save(string filePath = "vocabulary.dat")
    {
        // Save result to disk.
        using (FileStream fs = new FileStream(filePath, FileMode.Create))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            formatter.Serialize(fs, _vocabularyIDF);
        }
    }

    /// <summary>
    /// Loads the TFIDF vocabulary (term -> IDF map) from disk, replacing the in-memory map.
    /// SECURITY NOTE(review): BinaryFormatter deserialization of untrusted files
    /// enables code execution and the type is removed in .NET 9+. Only load files
    /// this application wrote itself; migrate Save/Load together to a safe format.
    /// </summary>
    /// <param name="filePath">File path to read from; throws FileNotFoundException if absent.</param>
    public void Load(string filePath = "vocabulary.dat")
    {
        // Load from disk.
        using (FileStream fs = new FileStream(filePath, FileMode.Open))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            _vocabularyIDF = (Dictionary<string, double>)formatter.Deserialize(fs);
        }
    }

    #region Private Helpers

    /// <summary>
    /// Parses, tokenizes and stems a list of documents, returning the vocabulary of
    /// stemmed words that occur at least <paramref name="vocabularyThreshold"/> times
    /// across all documents. Stop words and empty stems are discarded.
    /// </summary>
    /// <param name="docs">Raw document texts.</param>
    /// <param name="stemmedDocs">Out: one list of stemmed, stop-word-filtered words per document.</param>
    /// <param name="vocabularyThreshold">Minimum total occurrence count for a term to enter the vocabulary.</param>
    /// <returns>Vocabulary (list of stemmed terms).</returns>
    private List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
    {
        List<string> vocabulary = new List<string>();
        Dictionary<string, int> wordCountList = new Dictionary<string, int>();
        stemmedDocs = new List<List<string>>();

        foreach (var doc in docs)
        {
            List<string> stemmedDoc = new List<string>();

            foreach (string part in Tokenize(doc))
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                // Stop-word filter is case-insensitive (the list is lower-case).
                if (stopWordsList.Contains(stripped.ToLower()))
                {
                    continue;
                }

                try
                {
                    // Reduce the word to its stem so inflected forms count together.
                    string stem = new EnglishWord(stripped).Stem;

                    if (stem.Length > 0)
                    {
                        // Build the cross-document occurrence count.
                        // BUGFIX: the first occurrence previously initialized the
                        // count to 0, under-counting every term by one — a term
                        // needed vocabularyThreshold + 1 occurrences to qualify.
                        if (wordCountList.ContainsKey(stem))
                        {
                            wordCountList[stem]++;
                        }
                        else
                        {
                            wordCountList.Add(stem, 1);
                        }

                        stemmedDoc.Add(stem);
                    }
                }
                catch
                {
                    // Best effort: silently skip words the stemmer cannot handle.
                }
            }

            stemmedDocs.Add(stemmedDoc);
        }

        // Keep only the terms that meet the occurrence threshold.
        foreach (var item in wordCountList.Where(w => w.Value >= vocabularyThreshold))
        {
            vocabulary.Add(item.Key);
        }

        return vocabulary;
    }

    /// <summary>
    /// Tokenizes a string, returning its list of words.
    /// Canonicalizes HTML tags, numbers, URLs, e-mail addresses, dollar signs and
    /// @usernames before splitting on whitespace and punctuation.
    /// </summary>
    /// <param name="text">Text to tokenize.</param>
    /// <returns>Array of tokens (may contain empty strings between adjacent separators).</returns>
    private string[] Tokenize(string text)
    {
        // Ordered rewrite passes; order matters (numbers are folded before
        // dollar signs, so e.g. "$5" becomes "dollarnumber").
        var passes = new[]
        {
            new { Pattern = "<[^<>]+>", Replacement = "" },                       // strip HTML tags
            new { Pattern = "[0-9]+", Replacement = "number" },                   // fold numbers
            new { Pattern = @"(http|https)://[^\s]*", Replacement = "httpaddr" }, // fold URLs
            new { Pattern = @"[^\s]+@[^\s]+", Replacement = "emailaddr" },        // fold e-mail addresses
            new { Pattern = "[$]+", Replacement = "dollar" },                     // fold dollar signs
            new { Pattern = @"@[^\s]+", Replacement = "username" },               // fold @usernames
        };

        string canonical = text;
        foreach (var pass in passes)
        {
            canonical = Regex.Replace(canonical, pass.Pattern, pass.Replacement);
        }

        // Tokenize and also get rid of any punctuation.
        return canonical.Split(" @$/#.-:&*+=[]?!(){},''\">_<;%\\".ToCharArray());
    }

    #endregion
}
double aa = new StringCompare().StringCompare(str1,str2);
        return new SqlDouble (aa);


Source: reposted from blog.csdn.net/hutao1101175783/article/details/79727987