PHP 计算海明距离(64 位)以查找相似文档

<?php
/**
 * SimHash fingerprinting and Hamming-distance comparison for finding similar documents.
 *
 * The code calls functions from the gmp, mbstring and bcmath extensions, so all three
 * must be loaded. Fingerprints are handled as decimal strings because they can exceed
 * the range of a signed PHP integer.
 *
 * NOTE: earlier revisions of this class never tested or set bit 0, so fingerprints
 * produced by them always have bit 0 clear. Fingerprints produced now may differ from
 * those in that one bit.
 */
class Simhash {

	/** @var string|null Decimal fingerprint produced by the most recent simhash() call. */
	public $m_hash = null;
	/** @var mixed Not read or written by any method of this class. */
	public $hashbits = null;
	/** @var string|null Character encoding name passed to the mb_* functions. */
	public $code = null;
	/** @var int Width of the fingerprint in bits. */
	public $m_hashbits = 64;

	/**
	 * @param string $code Encoding of the tokens that will be hashed.
	 */
	public function __construct($code='UTF-8'){
		$this->code = $code;
	}

	/**
	 * @return string The last fingerprint as a decimal string ('' before simhash() has run).
	 */
	public function __toString(){
		return strval($this->m_hash);
	}

	/**
	 * Computes the fingerprint of a token list and stores it in $m_hash.
	 *
	 * Every token hash votes +1 on each bit position it has set and -1 on each
	 * position it has clear; the fingerprint has a bit set wherever the total is >= 0.
	 *
	 * @param array $tokens Tokens (strings) extracted from the document.
	 * @return string The fingerprint as a decimal string.
	 */
	public function simhash($tokens){
		$bits = $this->m_hashbits;
		// Start every position at zero so the += below never touches an undefined index.
		$weights = array_fill(0, $bits, 0);

		foreach ($tokens as $token) {
			$hash = $this->string_hash($token);
			for ($i = 0; $i < $bits; $i++) {
				// gmp_testbit inspects bit $i directly, including bit 0.
				$weights[$i] += gmp_testbit($hash, $i) ? 1 : -1;
			}
		}

		$fingerprint = gmp_init(0);
		for ($i = 0; $i < $bits; $i++) {
			if ($weights[$i] >= 0) {
				gmp_setbit($fingerprint, $i);
			}
		}

		$this->m_hash = gmp_strval($fingerprint);
		return $this->m_hash;
	}

	/**
	 * Computes the Hamming distance between two fingerprints.
	 *
	 * @param string|int $hash1 First fingerprint (decimal).
	 * @param string|int $hash2 Second fingerprint (decimal).
	 * @return int Number of differing bits within the lowest $m_hashbits bits.
	 */
	public function hamming_distance($hash1,$hash2){
		$a = gmp_init(strval($hash1));
		$b = gmp_init(strval($hash2));
		// Restrict the comparison to the fingerprint width, then count the set bits.
		$diff = gmp_and(gmp_xor($a, $b), $this->bit_mask());
		return gmp_popcount($diff);
	}

	/**
	 * Hashes one token to a non-negative value of at most $m_hashbits bits.
	 *
	 * Only a truly empty token short-circuits to 0; the token "0" is hashed like
	 * any other string.
	 *
	 * @param string $source Token to hash, encoded in $this->code.
	 * @return int|\GMP Integer 0 for an empty token, otherwise a GMP number.
	 */
	public function string_hash($source){
		$source = (string) $source;
		if ($source === '') {
			return 0;
		}

		$len = mb_strlen($source, $this->code);
		$mask = $this->bit_mask();

		// Seed with the first character, then fold every character in (multiply, xor, truncate).
		$x = $this->utf8_unicode(mb_substr($source, 0, 1, $this->code)) << 7;
		for ($i = 0; $i < $len; $i++) {
			$char = $this->utf8_unicode(mb_substr($source, $i, 1, $this->code));
			$x = gmp_and(gmp_xor(gmp_mul($x, 1000003), $char), $mask);
		}

		// The masked value is never negative, so no "-1" special case is required here.
		return gmp_xor($x, $len);
	}

	/**
	 * Converts the bytes of a single character into an integer.
	 *
	 * The lead-byte masks are one bit wider than strictly needed for UTF-8; for
	 * well-formed UTF-8 that extra bit is always zero, so the result is the code
	 * point. The masks are kept as they are so values computed for other byte
	 * sequences do not change.
	 *
	 * @param string $c The bytes of one character.
	 * @return int Decoded value; 0 when $c is empty or longer than four bytes.
	 */
	public function utf8_unicode($c) {
		switch(strlen($c)) {
			case 1:
				return ord($c);
			case 2:
				$n = (ord($c[0]) & 0x3f) << 6;
				$n += ord($c[1]) & 0x3f;
				return $n;
			case 3:
				$n = (ord($c[0]) & 0x1f) << 12;
				$n += (ord($c[1]) & 0x3f) << 6;
				$n += ord($c[2]) & 0x3f;
				return $n;
			case 4:
				$n = (ord($c[0]) & 0x0f) << 18;
				$n += (ord($c[1]) & 0x3f) << 12;
				$n += (ord($c[2]) & 0x3f) << 6;
				$n += ord($c[3]) & 0x3f;
				return $n;
			default:
				// Always hand the GMP calls in string_hash() an integer.
				return 0;
		}
	}

	/**
	 * Converts a decimal string to a binary string left-padded with "0" to $m_hashbits digits.
	 *
	 * @param string $dec Decimal integer.
	 * @return string Binary digits, most significant first.
	 */
	public function dec_to_bin($dec='') {
		$bin = '';
		if ($this->is_64bit()) {
			$bin = decbin(intval($dec));
		} else {
			$dec = (string) $dec;
			// Scale 0 is passed explicitly so a non-zero bcscale setting cannot turn the
			// quotient into "0.00" and keep this loop running forever.
			while ($dec !== '' && bccomp($dec, '0', 0) !== 0) {
				$remainder = bcmod($dec, '2');
				$bin = (bccomp($remainder, '0', 0) === 0 ? '0' : '1') . $bin;
				$dec = bcdiv($dec, '2', 0);
			}
		}
		// str_pad returns its input unchanged when it is already long enough.
		return str_pad($bin, $this->m_hashbits, "0", STR_PAD_LEFT);
	}

	/**
	 * Converts a binary string to a decimal string.
	 *
	 * @param string $input String consisting only of the characters 0 and 1.
	 * @return string|int|float Decimal string; '0' when $input is not a binary string.
	 *                          (The is_64bit() branch returns bindec()'s native number.)
	 */
	public function bin_to_dec($input='') {
		if ($this->is_64bit()) {
			return bindec($input);
		}

		$input = (string) $input;
		$output = '0';
		// The D modifier keeps "$" from matching before a trailing newline.
		if (preg_match('/^[01]+$/D', $input)) {
			$length = strlen($input);
			for ($i = 0; $i < $length; $i++) {
				// Scale 0 keeps the running total an integer string with no fractional part.
				$output = bcadd(bcmul($output, '2', 0), $input[$i], 0);
			}
		}
		return $output;
	}

	/**
	 * Chooses between the native-integer and the bcmath conversion paths.
	 *
	 * @return bool Always false here, so dec_to_bin()/bin_to_dec() take the bcmath path.
	 */
	public function is_64bit() {
		return false;
	}

	/**
	 * Builds the all-ones mask that is $m_hashbits wide.
	 *
	 * @return \GMP 2^m_hashbits - 1.
	 */
	private function bit_mask() {
		return gmp_sub(gmp_pow("2", $this->m_hashbits), 1);
	}

}

?>

猜你喜欢

转载自strayly.iteye.com/blog/2308378