一、自己写lsh代码:
def train_lsh(data, num_vector=16, seed=None): dim = data.shape[1] if seed is not None: np.random.seed(seed) random_vectors = generate_random_vectors(num_vector, dim) powers_of_two = 1 << np.arange(num_vector-1, -1, -1) table = {} # Partition data points into bins bin_index_bits = (data.dot(random_vectors) >= 0) # Encode bin index bits into integers bin_indices = bin_index_bits.dot(powers_of_two) # Update `table` so that `table[i]` is the list of document ids with bin index equal to i. for data_index, bin_index in enumerate(bin_indices): if bin_index not in table: # If no list yet exists for this bin, assign the bin an empty list. table[bin_index] = [] # Fetch the list of document ids associated with the bin and add the document id to the end. table[bin_index].append(data_index) model = {'data': data, 'bin_index_bits': bin_index_bits, 'bin_indices': bin_indices, 'table': table, 'random_vectors': random_vectors, 'num_vector': num_vector} return model
model = train_lsh(corpus, num_vector=16, seed=143) table = model['table'] if 0 in table and table[0] == [39583] and \ 143 in table and table[143] == [19693, 28277, 29776, 30399]: print 'Passed!' else: print 'Check your code.'
二、学了一个好玩的
doc_obama = corpus[35817, :] # first document index_bits_obama = (doc_obama.dot(random_vectors) >= 0) powers_of_two = (1 << np.arange(15, -1, -1)) print index_bits_obama print powers_of_two print index_bits_obama.dot(powers_of_two)
三、自己写lsh最近邻代码:
def search_nearby_bins(query_bin_bits, table, search_radius=2, initial_candidates=set()): """ For a given query vector and trained LSH model, return all candidate neighbors for the query among all bins within the given search radius. Example usage ------------- >>> model = train_lsh(corpus, num_vector=16, seed=143) >>> q = model['bin_index_bits'][0] # vector for the first document >>> candidates = search_nearby_bins(q, model['table']) """ num_vector = len(query_bin_bits) powers_of_two = 1 << np.arange(num_vector-1, -1, -1) # Allow the user to provide an initial set of candidates. candidate_set = copy(initial_candidates) for different_bits in combinations(range(num_vector), search_radius): # Flip the bits (n_1,n_2,...,n_r) of the query bin to produce a new bit vector. ## Hint: you can iterate over a tuple like a list alternate_bits = copy(query_bin_bits) for i in different_bits: alternate_bits[i] = not query_bin_bits[i] # Convert the new bit vector to an integer index nearby_bin = alternate_bits.dot(powers_of_two) # Fetch the list of documents belonging to the bin indexed by the new bit vector. # Then add those documents to candidate_set # Make sure that the bin exists in the table! # Hint: update() method for sets lets you add an entire list to the set if nearby_bin in table: nearby_docs = model['table'][nearby_bin] candidate_set.update(nearby_docs) return candidate_set
obama_bin_index = model['bin_index_bits'][35817] # bin index of Barack Obama candidate_set = search_nearby_bins(obama_bin_index, model['table'], search_radius=0) if candidate_set == set([35817, 21426, 53937, 39426, 50261]): print 'Passed test' else: print 'Check your code' print 'List of documents in the same bin as Obama: 35817, 21426, 53937, 39426, 50261'