个人项目:SimHash实现的论文查重

个人项目:论文查重

这个作业要求在哪里 https://edu.cnblogs.com/campus/gdgy/Networkengineering1834/homework/11146
传送门 https://github.com/asiL-tcefreP/-software-engineering-2/tree/master

一、模块接口的设计与实现过程

1.1 算法来源

文本相似度计算常用于网页去重以及NLP里文本分析等场景。文本相似度,可以分为两种,一种是字面相似度,另一种是语义相似度。本文记录的是文本的字面相似度的计算及实现,语义相似度计算则需要海量数据去计算语义值,较为复杂。
最常用的且最简单的两种文本相似检测方法:局部敏感hash、余弦相似度

在本案例中,用到的是局部敏感hash(LSH)中的simhash。计算出simhash值后,再计算hash值的汉明距离,即可得到文本的相似程度。

汉明距离:
定义:两个长度相同的字符串对应位字符不同的个数
两个关键点:(1)参与比较的两个字符串长度必须相同;(2)只统计对应位置上字符不同的个数。

1.2 项目结构


包含文件读写类以及算法的实现类

方法的接口如下:

package pers.fjl.papercheck.service;
/**
 * @program: PaperCheck
 *
 * @description: ${description}
 *
 * @author: Fang Jiale
 *
 * @create: 2020-10-24 17:05
 **/
import pers.fjl.papercheck.service.impl.SimHashImpl;

import java.math.BigInteger;
import java.util.List;

/**
 * Operations for computing a SimHash fingerprint of a text and comparing fingerprints.
 *
 * <p>The behaviour notes below describe {@link SimHashImpl}, the implementation that
 * accompanies this interface; the interface itself does not enforce them.
 */
public interface SimHash {
    
    
    /**
     * Computes the SimHash fingerprint of the text held by the implementation.
     *
     * @return the fingerprint as an integer; in {@link SimHashImpl} bit {@code i} is set
     *         when the accumulated weight at position {@code i} is not negative
     */
    BigInteger simHash();

    /**
     * Hashes a single token to an integer.
     *
     * @param source the token to hash; {@link SimHashImpl} returns zero for {@code null}
     *               or an empty string
     * @return the hash value of the token
     */
    BigInteger hash(String source);

    /**
     * Computes the Hamming distance between this fingerprint and another one,
     * i.e. the number of bit positions at which the two fingerprints differ.
     *
     * @param other the fingerprint to compare against
     * @return the number of differing bits
     */
    int hammingDistance(SimHashImpl other);

    /**
     * Computes the Hamming distance between two strings by comparing them
     * character by character.
     *
     * @param str1 the first string
     * @param str2 the second string
     * @return the number of positions whose characters differ; {@link SimHashImpl}
     *         returns {@code -1} when the two strings have different lengths
     */
    double getDistance(String str1, String str2);

    /**
     * Splits a fingerprint into consecutive bit segments and returns the value of each
     * segment.
     *
     * @param simHashImpl the fingerprint whose bits are split
     * @param distance    controls the segment size; {@link SimHashImpl} uses segments of
     *                    {@code hashbits / (distance + 1)} bits
     * @return the segment values, in order starting from the lowest bits
     */
    List subByDistance(SimHashImpl simHashImpl, int distance);
}

实现类

package pers.fjl.papercheck.service.impl;
/**
 * @program: PaperCheck
 *
 * @description: ${description}
 *
 * @author: Fang Jiale
 *
 * @create: 2020-10-24 17:05
 **/
import pers.fjl.papercheck.file.FileInput;
import pers.fjl.papercheck.service.SimHash;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class SimHashImpl implements SimHash {
    
    

    private String tokens;

    private BigInteger intSimHash;

    private String strSimHash;

    private int hashbits = 64;

    public SimHashImpl(String tokens, int hashbits) {
    
    
        this.tokens = tokens;
        this.hashbits = hashbits;
        this.intSimHash = this.simHash();
    }

    public BigInteger simHash() {
    
    
        // 定义特征向量/数组
        int[] v = new int[this.hashbits];
        StringTokenizer stringTokens = new StringTokenizer(this.tokens);
        while (stringTokens.hasMoreTokens()) {
    
    
            String temp = stringTokens.nextToken();
            //2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数.
            BigInteger t = this.hash(temp);
            for (int i = 0; i < this.hashbits; i++) {
    
    
                BigInteger bitmask = new BigInteger("1").shiftLeft(i);
                // 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字),
                // 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
                // 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
                if (t.and(bitmask).signum() != 0) {
    
    
                    v[i] += 1;
                } else {
    
    
                    v[i] -= 1;
                }
            }
        }
        BigInteger fingerprint = new BigInteger("0");
        StringBuffer simHashBuffer = new StringBuffer();
        for (int i = 0; i < this.hashbits; i++) {
    
    
            // 4、最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
            if (v[i] >= 0) {
    
    
                fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
                simHashBuffer.append("1");
            }else{
    
    
                simHashBuffer.append("0");
            }
        }
        this.strSimHash = simHashBuffer.toString();
        setStrSimHash(strSimHash);
//        System.out.println(this.strSimHash + " length " + this.strSimHash.length());
        return fingerprint;
    }

    public String getStrSimHash() {
    
    
        return strSimHash;
    }

    public void setStrSimHash(String strSimHash) {
    
    
        this.strSimHash = strSimHash;
    }

    public BigInteger hash(String source) {
    
    
        if (source == null || source.length() == 0) {
    
    
            return new BigInteger("0");
        } else {
    
    
            char[] sourceArray = source.toCharArray();
            BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
            BigInteger m = new BigInteger("1000003");
            BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(
                    new BigInteger("1"));
            for (char item : sourceArray) {
    
    
                BigInteger temp = BigInteger.valueOf((long) item);
                x = x.multiply(m).xor(temp).and(mask);
            }
            x = x.xor(new BigInteger(String.valueOf(source.length())));
            if (x.equals(new BigInteger("-1"))) {
    
    
                x = new BigInteger("-2");
            }
            return x;
        }
    }

    public int hammingDistance(SimHashImpl other) {
    
    

        BigInteger x = this.intSimHash.xor(other.intSimHash);
        int tot = 0;

        //统计x中二进制位数为1的个数
        //我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0,
        //我们看n能做多少次这样的操作就OK了。

        while (x.signum() != 0) {
    
    
            tot += 1;
            x = x.and(x.subtract(new BigInteger("1")));
        }
        return tot;
    }

    public double getDistance(String str1, String str2) {
    
    
        double distance;
        if (str1.length() != str2.length()) {
    
    
            distance = -1;
        } else {
    
    
            distance = 0;
            for (int i = 0; i < str1.length(); i++) {
    
    
                if (str1.charAt(i) != str2.charAt(i)) {
    
    
                    distance++;
                }
            }
        }
        return distance;
    }


    public List subByDistance(SimHashImpl simHashImpl, int distance){
    
    
        // 分成几组来检查
        int numEach = this.hashbits/(distance+1);
        List characters = new ArrayList();

        StringBuffer buffer = new StringBuffer();

        int k = 0;
        for( int i = 0; i < this.intSimHash.bitLength(); i++){
    
    
            // 当且仅当设置了指定的位时,返回 true
            boolean sr = simHashImpl.intSimHash.testBit(i);

            if(sr){
    
    
                buffer.append("1");
            }
            else{
    
    
                buffer.append("0");
            }

            if( (i+1)%numEach == 0 ){
    
    
                // 将二进制转为BigInteger
                BigInteger eachValue = new BigInteger(buffer.toString(),2);
//                System.out.println("----" +eachValue );
                buffer.delete(0, buffer.length());
                characters.add(eachValue);
            }
        }
        return characters;
    }

//    public double distance(String strSimHash1,String strSimHash2){
    
    
//        double distance;
//        return hash1.getDistance(hash1.strSimHash,hash2.strSimHash);
//    }

    public static void main(String[] args) {
    
    
        String origin="G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig.txt";
        String[] s={
    
    
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_add.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_del.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_1.txt",
                "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_10.txt",
                "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_15.txt"};
        FileInput fileInput = new FileInput();
        SimHashImpl hash1 = new SimHashImpl(fileInput.readString(origin), 64);
        hash1.subByDistance(hash1, 3);

        for (String s1 : s) {
    
    
            SimHashImpl hash2 = new SimHashImpl(fileInput.readString(s1), 64);
            hash2.subByDistance(hash2, 3);
            double distance = hash1.getDistance(hash1.strSimHash,hash2.strSimHash);
            System.out.println("该文章与原文相似度为:"+(100-distance*100/128)+"%");
        }
    }
}
package pers.fjl.papercheck.file;
/**
 * @program: PaperCheck
 *
 * @description: ${description}
 *
 * @author: Fang Jiale
 *
 * @create: 2020-10-24 17:05
 **/
import java.io.*;

/**
 * Reads a text file into a single string.
 */
public class FileInput {

    /**
     * Reads the whole file as UTF-8 text.
     *
     * <p>Lines are joined with {@code "\r\n"} regardless of the line terminator used in the
     * file; no terminator is added after the last line. Reading is best effort: when the
     * file cannot be opened or read, the problem is reported on standard error and the
     * text read so far (possibly empty) is returned.
     *
     * @param FI the path of the file to read
     * @return the file content, or whatever was read before an I/O error occurred
     */
    public String readString(String FI){
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even when
        // readLine() fails part-way through the file.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(FI)), java.nio.charset.StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!firstLine) {
                    content.append("\r\n");
                }
                content.append(line);
                firstLine = false;
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a missing file ends up here too.
            System.err.println("Failed to read file '" + FI + "': " + e);
        }
        return content.toString();
    }

    /**
     * Demo: prints the content of a sample file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        FileInput fileInput = new FileInput();
        String s = fileInput.readString("G:\\orig.txt");
        System.out.println(s);
    }

}

二、测试

2.1 单元测试

这次测试只完成了空指针异常的测试,还应包括读写文件错误异常的测试。(后面有时间再commit)

package pers.fjl.test;

import org.junit.Test;
import pers.fjl.papercheck.file.FileInput;
import pers.fjl.papercheck.service.impl.SimHashImpl;

import java.math.BigInteger;

/**
 * Smoke tests that fingerprint the original text and one modified copy each, and print
 * the resulting similarity. They make no assertions; a test fails only when the code
 * under test throws.
 */
public class AllTest {

    /** Path of the original text. */
    String origin="G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig.txt";

    /** Paths of the modified copies, indexed by the tests below. */
    String[] s={
            "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_add.txt",
            "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_del.txt",
            "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_1.txt",
            "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_10.txt",
            "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_15.txt"};

    /** Compares the original with the copy that has text added. */
    @Test
    public void addTest(){
        printSimilarity(s[0]);
    }

    /** Compares the original with the copy that has text deleted. */
    @Test
    public void delTest(){
        printSimilarity(s[1]);
    }

    /** Compares the original with the {@code dis_1} copy. */
    @Test
    public void dis_1Test(){
        printSimilarity(s[2]);
    }

    /** Compares the original with the {@code dis_10} copy. */
    @Test
    public void dis_10Test(){
        printSimilarity(s[3]);
    }

    /** Compares the original with the {@code dis_15} copy. */
    @Test
    public void dis_15Test(){
        printSimilarity(s[4]);
    }

    // TODO: a missing-file case (previously sketched with "G:\\1.txt" as the second
    // input) is not covered yet.

    /**
     * Fingerprints the original and the given copy with 64 bits, exercises
     * {@code subByDistance} on both, and prints the similarity.
     *
     * @param candidatePath path of the text to compare with the original
     */
    private void printSimilarity(String candidatePath) {
        FileInput fileInput = new FileInput();
        SimHashImpl hash1 = new SimHashImpl(fileInput.readString(origin), 64);
        hash1.subByDistance(hash1, 3);
        SimHashImpl hash2 = new SimHashImpl(fileInput.readString(candidatePath), 64);
        hash2.subByDistance(hash2, 3);
        double distance = hash1.getDistance(hash1.getStrSimHash(),hash2.getStrSimHash());
        System.out.println("该文章与原文相似度为:"+(100-distance*100/128)+"%");
    }
}

2.2 覆盖率



三、性能检测




对该性能分析工具的使用还不太熟练,但可以看见的是,使用了GC之后,char与String依旧占据内存的大部分。

猜你喜欢

转载自blog.csdn.net/Dlihctcefrep/article/details/109319944