个人项目:论文查重
这个作业要求在哪里 | https://edu.cnblogs.com/campus/gdgy/Networkengineering1834/homework/11146 |
---|---|
传送门 | https://github.com/asiL-tcefreP/-software-engineering-2/tree/master |
一、模块接口的设计与实现过程
1.1 算法来源
文本相似度计算常用于网页去重以及NLP里文本分析等场景。文本相似度,可以分为两种,一种是字面相似度,另一种是语义相似度。本文记录的是文本的字面相似度的计算及实现,语义相似度计算则需要海量数据去计算语义值,较为复杂。
最常用的且最简单的两种文本相似检测方法:局部敏感hash、余弦相似度
在本案例中,用到的是局部敏感hash(LSH)中的simhash。计算出simhash值后,再计算hash值的汉明距离,即可得到文本的相似程度。
汉明距离:
定义:两个长度相同的字符串对应位字符不同的个数
两个关键点:
- 长度相同
- 对应位字符不同
传送门: SimHash详细介绍.
1.2 项目结构
包含文件读写类以及算法的实现类
方法的接口如下:
package pers.fjl.papercheck.service;
/**
* @program: PaperCheck
*
* @description: ${description}
*
* @author: Fang Jiale
*
* @create: 2020-10-24 17:05
**/
import pers.fjl.papercheck.service.impl.SimHashImpl;
import java.math.BigInteger;
import java.util.List;
/**
 * Contract for a SimHash-based textual similarity fingerprint.
 *
 * <p>An implementation holds one piece of text and its fingerprint; the methods
 * below compute the fingerprint and compare it with other fingerprints.
 */
public interface SimHash {

    /**
     * Computes the SimHash fingerprint of the text held by this object.
     *
     * @return the fingerprint as a non-negative integer
     */
    BigInteger simHash();

    /**
     * Hashes a single token.
     *
     * @param source the token to hash
     * @return the hash value of {@code source}
     */
    BigInteger hash(String source);

    /**
     * Computes the Hamming distance between this fingerprint and another one,
     * i.e. the number of bit positions in which the two fingerprints differ.
     *
     * @param other the fingerprint to compare with
     * @return the number of differing bits
     */
    int hammingDistance(SimHashImpl other);

    /**
     * Computes the Hamming distance between two strings of equal length,
     * i.e. the number of character positions at which they differ.
     *
     * @param str1 the first string
     * @param str2 the second string
     * @return the number of differing positions
     */
    double getDistance(String str1, String str2);

    /**
     * Splits a fingerprint into {@code distance + 1} groups of bits and returns
     * the value of each group.
     *
     * @param simHashImpl the fingerprint to split
     * @param distance the Hamming-distance threshold that determines the number of groups
     * @return the value of each bit group, in order
     */
    List<BigInteger> subByDistance(SimHashImpl simHashImpl, int distance);
}
实现类
package pers.fjl.papercheck.service.impl;
/**
* @program: PaperCheck
*
* @description: ${description}
*
* @author: Fang Jiale
*
* @create: 2020-10-24 17:05
**/
import pers.fjl.papercheck.file.FileInput;
import pers.fjl.papercheck.service.SimHash;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
public class SimHashImpl implements SimHash {
private String tokens;
private BigInteger intSimHash;
private String strSimHash;
private int hashbits = 64;
public SimHashImpl(String tokens, int hashbits) {
this.tokens = tokens;
this.hashbits = hashbits;
this.intSimHash = this.simHash();
}
public BigInteger simHash() {
// 定义特征向量/数组
int[] v = new int[this.hashbits];
StringTokenizer stringTokens = new StringTokenizer(this.tokens);
while (stringTokens.hasMoreTokens()) {
String temp = stringTokens.nextToken();
//2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数.
BigInteger t = this.hash(temp);
for (int i = 0; i < this.hashbits; i++) {
BigInteger bitmask = new BigInteger("1").shiftLeft(i);
// 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字),
// 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
// 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
if (t.and(bitmask).signum() != 0) {
v[i] += 1;
} else {
v[i] -= 1;
}
}
}
BigInteger fingerprint = new BigInteger("0");
StringBuffer simHashBuffer = new StringBuffer();
for (int i = 0; i < this.hashbits; i++) {
// 4、最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
if (v[i] >= 0) {
fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
simHashBuffer.append("1");
}else{
simHashBuffer.append("0");
}
}
this.strSimHash = simHashBuffer.toString();
setStrSimHash(strSimHash);
// System.out.println(this.strSimHash + " length " + this.strSimHash.length());
return fingerprint;
}
public String getStrSimHash() {
return strSimHash;
}
public void setStrSimHash(String strSimHash) {
this.strSimHash = strSimHash;
}
public BigInteger hash(String source) {
if (source == null || source.length() == 0) {
return new BigInteger("0");
} else {
char[] sourceArray = source.toCharArray();
BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
BigInteger m = new BigInteger("1000003");
BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(
new BigInteger("1"));
for (char item : sourceArray) {
BigInteger temp = BigInteger.valueOf((long) item);
x = x.multiply(m).xor(temp).and(mask);
}
x = x.xor(new BigInteger(String.valueOf(source.length())));
if (x.equals(new BigInteger("-1"))) {
x = new BigInteger("-2");
}
return x;
}
}
public int hammingDistance(SimHashImpl other) {
BigInteger x = this.intSimHash.xor(other.intSimHash);
int tot = 0;
//统计x中二进制位数为1的个数
//我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0,
//我们看n能做多少次这样的操作就OK了。
while (x.signum() != 0) {
tot += 1;
x = x.and(x.subtract(new BigInteger("1")));
}
return tot;
}
public double getDistance(String str1, String str2) {
double distance;
if (str1.length() != str2.length()) {
distance = -1;
} else {
distance = 0;
for (int i = 0; i < str1.length(); i++) {
if (str1.charAt(i) != str2.charAt(i)) {
distance++;
}
}
}
return distance;
}
public List subByDistance(SimHashImpl simHashImpl, int distance){
// 分成几组来检查
int numEach = this.hashbits/(distance+1);
List characters = new ArrayList();
StringBuffer buffer = new StringBuffer();
int k = 0;
for( int i = 0; i < this.intSimHash.bitLength(); i++){
// 当且仅当设置了指定的位时,返回 true
boolean sr = simHashImpl.intSimHash.testBit(i);
if(sr){
buffer.append("1");
}
else{
buffer.append("0");
}
if( (i+1)%numEach == 0 ){
// 将二进制转为BigInteger
BigInteger eachValue = new BigInteger(buffer.toString(),2);
// System.out.println("----" +eachValue );
buffer.delete(0, buffer.length());
characters.add(eachValue);
}
}
return characters;
}
// public double distance(String strSimHash1,String strSimHash2){
// double distance;
// return hash1.getDistance(hash1.strSimHash,hash2.strSimHash);
// }
public static void main(String[] args) {
String origin="G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig.txt";
String[] s={
"G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_add.txt",
"G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_del.txt",
"G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_1.txt",
"G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_10.txt",
"G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_15.txt"};
FileInput fileInput = new FileInput();
SimHashImpl hash1 = new SimHashImpl(fileInput.readString(origin), 64);
hash1.subByDistance(hash1, 3);
for (String s1 : s) {
SimHashImpl hash2 = new SimHashImpl(fileInput.readString(s1), 64);
hash2.subByDistance(hash2, 3);
double distance = hash1.getDistance(hash1.strSimHash,hash2.strSimHash);
System.out.println("该文章与原文相似度为:"+(100-distance*100/128)+"%");
}
}
}
package pers.fjl.papercheck.file;
/**
* @program: PaperCheck
*
* @description: ${description}
*
* @author: Fang Jiale
*
* @create: 2020-10-24 17:05
**/
import java.io.*;
/** Reads whole text files into memory. */
public class FileInput {

    /**
     * Reads a text file and returns its lines joined with {@code "\r\n"}.
     *
     * <p>The file is decoded as UTF-8. A trailing line terminator in the file does
     * not produce a trailing separator in the result. I/O failures are reported on
     * standard error and whatever was read up to that point is returned.
     *
     * @param FI the path of the file to read
     * @return the file content, or an empty string when the file cannot be opened
     */
    public String readString(String FI) {
        StringBuilder content = new StringBuilder();
        File file = new File(FI);
        // try-with-resources closes the reader (and the underlying stream) even
        // when readLine() fails part-way through.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!firstLine) {
                    content.append("\r\n");
                }
                content.append(line);
                firstLine = false;
            }
        } catch (IOException e) {
            // Best effort: keep what was read so far. FileNotFoundException is an
            // IOException, so a missing file lands here as well.
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Prints the content of a file.
     *
     * @param args optional: the path of the file to print; defaults to the sample path
     */
    public static void main(String[] args) {
        FileInput fileInput = new FileInput();
        String path = args.length > 0 ? args[0] : "G:\\orig.txt";
        String s = fileInput.readString(path);
        System.out.println(s);
    }
}
二、测试
2.1 单元测试
这次测试只完成了空指针异常的测试,还应包括读写文件错误异常的测试。(后面有时间再commit)
package pers.fjl.test;
import org.junit.Test;
import pers.fjl.papercheck.file.FileInput;
import pers.fjl.papercheck.service.impl.SimHashImpl;
import java.math.BigInteger;
/**
 * Compares the original text with each modified sample and prints the
 * similarity derived from the Hamming distance of their SimHash fingerprints.
 */
public class AllTest {

    /** Width of the fingerprints built by these tests. */
    private static final int HASH_BITS = 64;

    String origin = "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig.txt";
    String[] s = {
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_add.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_del.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_1.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_10.txt",
        "G:\\download\\app\\Git\\gitRepos\\paperpass\\src\\main\\resources\\orig_0.8_dis_15.txt"};

    @org.junit.Test
    public void addTest() {
        checkSimilarity(s[0]);
    }

    @org.junit.Test
    public void delTest() {
        checkSimilarity(s[1]);
    }

    @org.junit.Test
    public void dis_1Test() {
        checkSimilarity(s[2]);
    }

    @org.junit.Test
    public void dis_10Test() {
        checkSimilarity(s[3]);
    }

    @org.junit.Test
    public void dis_15Test() {
        checkSimilarity(s[4]);
    }

    // TODO: add a test for a missing input file (for example "G:\\1.txt").

    /**
     * Fingerprints the original and one candidate, prints their similarity and
     * checks that it is a valid percentage.
     *
     * @param candidatePath the path of the text to compare with the original
     */
    private void checkSimilarity(String candidatePath) {
        FileInput fileInput = new FileInput();
        SimHashImpl hash1 = new SimHashImpl(fileInput.readString(origin), HASH_BITS);
        SimHashImpl hash2 = new SimHashImpl(fileInput.readString(candidatePath), HASH_BITS);
        double distance = hash1.getDistance(hash1.getStrSimHash(), hash2.getStrSimHash());
        // Normalise by the fingerprint width so that the result spans 0..100.
        double similarity = 100 - distance * 100 / HASH_BITS;
        System.out.println("该文章与原文相似度为:" + similarity + "%");
        org.junit.Assert.assertTrue(
                "similarity out of range: " + similarity, similarity >= 0 && similarity <= 100);
    }
}
2.2 覆盖率
三、性能检测
对该性能分析工具的使用还不太熟练,但可以看见的是,使用了GC之后,char与String依旧占据内存的大部分。