<?php
/**
 * SimHash fingerprinting for near-duplicate document detection.
 *
 * A list of tokens is reduced to a single fingerprint of $m_hashbits bits
 * (64 by default). Two fingerprints are compared with hamming_distance();
 * a small distance means the token lists are similar.
 *
 * Requires the gmp and mbstring extensions for hashing, and bcmath for the
 * decimal <-> binary string helpers. All fingerprints are handled as decimal
 * strings because they can exceed PHP_INT_MAX.
 */
class Simhash
{
    /** @var string|null Decimal string of the most recently computed fingerprint. */
    public $m_hash = null;

    /** @var mixed Unused by this class; kept so existing callers that touch it keep working. */
    public $hashbits = null;

    /** @var string|null Character encoding passed to the mb_* functions. */
    public $code = null;

    /** @var int Width of the fingerprint in bits. */
    public $m_hashbits = 64;

    /**
     * @param string $code Character encoding of the tokens (used by mb_substr / mb_strlen).
     */
    public function __construct($code = 'UTF-8')
    {
        $this->code = $code;
    }

    /**
     * @return string The last computed fingerprint as a decimal string ('' if none yet).
     */
    public function __toString()
    {
        return strval($this->m_hash);
    }

    /**
     * Compute the SimHash fingerprint of a token list and store it in $m_hash.
     *
     * Every token votes +1 or -1 on each bit position depending on whether that
     * bit is set in the token's hash; the fingerprint has a 1 wherever the vote
     * total is >= 0. An empty token list therefore yields all bits set.
     *
     * @param array $tokens List of string tokens.
     * @return string Fingerprint as a decimal string.
     */
    public function simhash($tokens)
    {
        // Start every bit position at 0 so the accumulation below never reads
        // an undefined variable or array key.
        $votes = array_fill(0, $this->m_hashbits, 0);

        foreach ($tokens as $token) {
            $tokenHash = $this->string_hash($token);
            for ($bit = 0; $bit < $this->m_hashbits; $bit++) {
                // Test the bit directly. The earlier mask construction
                // ((1 | 1 << bit) - 1) evaluated to 0 for bit 0, so the lowest
                // bit was silently ignored.
                if (gmp_testbit($tokenHash, $bit)) {
                    $votes[$bit] += 1;
                } else {
                    $votes[$bit] -= 1;
                }
            }
        }

        $fingerprint = gmp_init(0);
        for ($bit = 0; $bit < $this->m_hashbits; $bit++) {
            if ($votes[$bit] >= 0) {
                gmp_setbit($fingerprint, $bit);
            }
        }

        $this->m_hash = gmp_strval($fingerprint);
        return $this->m_hash;
    }

    /**
     * Hamming distance between two fingerprints.
     *
     * @param string|int $hash1 Fingerprint as a decimal string (or integer).
     * @param string|int $hash2 Fingerprint as a decimal string (or integer).
     * @return int Number of differing bits within the low $m_hashbits bits.
     */
    public function hamming_distance($hash1, $hash2)
    {
        $a = gmp_init(strval($hash1));
        $b = gmp_init(strval($hash2));

        // Only the low m_hashbits bits take part in the comparison.
        $mask = gmp_sub(gmp_pow(2, $this->m_hashbits), 1);

        return gmp_popcount(gmp_and(gmp_xor($a, $b), $mask));
    }

    /**
     * Hash a single token to an m_hashbits-wide non-negative integer.
     *
     * The hash is seeded with the first character's code shifted left by 7,
     * then for each character: multiply by 1000003, XOR the character code,
     * and mask to m_hashbits. Finally the character count is XORed in.
     *
     * @param string $source Token to hash.
     * @return GMP|int GMP number for a non-empty token; integer 0 for an empty one.
     */
    public function string_hash($source)
    {
        // empty() also treats "0" / 0 as empty, which would make the token "0"
        // indistinguishable from a missing token. Hash those like any other token.
        $isZeroToken = ($source === '0' || $source === 0 || $source === 0.0);
        if (empty($source) && !$isZeroToken) {
            return 0;
        }
        $source = (string) $source;

        $multiplier = 1000003;
        $mask = gmp_sub(gmp_pow(2, $this->m_hashbits), 1);
        $len = mb_strlen($source, $this->code);

        $x = gmp_init($this->utf8_unicode(mb_substr($source, 0, 1, $this->code)) << 7);
        for ($i = 0; $i < $len; $i++) {
            $codePoint = $this->utf8_unicode(mb_substr($source, $i, 1, $this->code));
            $x = gmp_and(gmp_xor(gmp_mul($x, $multiplier), $codePoint), $mask);
        }

        // $x is masked to a non-negative range, so it can never equal -1 and
        // needs no special-casing here.
        return gmp_xor($x, $len);
    }

    /**
     * Decode one UTF-8 encoded character into its code point.
     *
     * @param string $c The bytes of a single character (1 to 4 bytes).
     * @return int Code point, or 0 when $c is not 1 to 4 bytes long.
     */
    public function utf8_unicode($c)
    {
        switch (strlen($c)) {
            case 1:
                return ord($c);
            case 2:
                $n = (ord($c[0]) & 0x3f) << 6;
                $n += ord($c[1]) & 0x3f;
                return $n;
            case 3:
                $n = (ord($c[0]) & 0x1f) << 12;
                $n += (ord($c[1]) & 0x3f) << 6;
                $n += ord($c[2]) & 0x3f;
                return $n;
            case 4:
                $n = (ord($c[0]) & 0x0f) << 18;
                $n += (ord($c[1]) & 0x3f) << 12;
                $n += (ord($c[2]) & 0x3f) << 6;
                $n += ord($c[3]) & 0x3f;
                return $n;
            default:
                // Not a 1-4 byte sequence: return a neutral value instead of
                // null so callers can keep doing integer arithmetic on it.
                return 0;
        }
    }

    /**
     * Convert a decimal string to a binary string, left-padded with '0' to
     * at least m_hashbits characters.
     *
     * @param string $dec Non-negative decimal number as a string.
     * @return string Binary representation, most significant bit first.
     */
    public function dec_to_bin($dec = '')
    {
        if ($this->is_64bit()) {
            $bin = decbin(intval($dec));
        } else {
            $bin = '';
            $dec = (string) $dec;
            while ($dec !== '' && $dec !== '0') {
                $bin .= abs((int) bcmod($dec, '2'));
                // Force scale 0: with a non-zero default bcscale the quotient
                // would become "0.00...", which never compares equal to "0"
                // and would loop forever.
                $dec = bcdiv($dec, '2', 0);
            }
            // Digits were collected least significant first.
            $bin = strrev($bin);
        }

        return str_pad($bin, $this->m_hashbits, '0', STR_PAD_LEFT);
    }

    /**
     * Convert a binary string to a decimal string.
     *
     * @param string $input String consisting only of '0' and '1'.
     * @return string|int|float Decimal value as a string; '0' when $input is
     *                          not a valid binary string. (The native path,
     *                          when enabled, returns bindec()'s number.)
     */
    public function bin_to_dec($input = '')
    {
        if ($this->is_64bit()) {
            return bindec($input);
        }

        $output = '0';
        // The D modifier stops '$' from matching before a trailing newline.
        if (preg_match('/^[01]+$/D', $input)) {
            $length = strlen($input);
            for ($i = 0; $i < $length; $i++) {
                // Explicit scale 0 keeps the result an integer string
                // regardless of the default bcscale.
                $output = bcadd(bcmul($output, '2', 0), $input[$i], 0);
            }
        }

        return $output;
    }

    /**
     * Whether the native-integer conversion path may be used.
     *
     * Hard-wired to false so the arbitrary-precision (bcmath) path is always
     * taken: intval() saturates at PHP_INT_MAX, so the native path cannot
     * represent fingerprints that use the top bit.
     *
     * @return bool Always false.
     */
    public function is_64bit()
    {
        return false;
    }
}
?>
php计算海明距离(64位)查找相似文档
猜你喜欢
转载自strayly.iteye.com/blog/2308378
今日推荐
周排行