import os
import ujson
import torch
import numpy as np
import tqdm

from colbert.search.index_loader import IndexLoader
from colbert.indexing.index_saver import IndexSaver
from colbert.indexing.collection_encoder import CollectionEncoder

from colbert.utils.utils import lengths2offsets, print_message, dotdict, flatten, batch
from colbert.indexing.codecs.residual import ResidualCodec
from colbert.indexing.utils import optimize_ivf
from colbert.search.strided_tensor import StridedTensor
from colbert.modeling.checkpoint import Checkpoint

from colbert.data import Collection
from colbert.indexing.codecs.residual_embeddings import ResidualEmbeddings
from colbert.indexing.codecs.residual_embeddings_strided import (
    ResidualEmbeddingsStrided,
)

# To test writing into new chunks, DEFAULT_CHUNKSIZE can be set smaller (e.g. 1 or 2)
DEFAULT_CHUNKSIZE = 25000


class IndexUpdater:
    """
    IndexUpdater takes in a searcher and adds/removes passages from that searcher.
    A checkpoint for passage encoding must be provided for adding passages.
    IndexUpdater can also persist the passage changes to the index on disk.

    Sample usage:

        index_updater = IndexUpdater(config, searcher, checkpoint)

        added_pids = index_updater.add(passages)  # all passages are added to the searcher and their pids returned
        index_updater.remove(pids)  # all pids in `pids` are removed from the searcher

        searcher.search()  # the search now reflects the added & removed passages

        index_updater.persist_to_disk()  # added & removed passages are persisted to the index on disk

        searcher = Searcher(index, config)  # if we now reload the searcher from the on-disk index, the changes persist
    """

    def __init__(self, config, searcher, checkpoint=None):
        self.config = config
        self.searcher = searcher
        self.index_path = searcher.index

        self.has_checkpoint = False
        if checkpoint:
            self.has_checkpoint = True
            self.checkpoint = Checkpoint(checkpoint, config)
            self.encoder = CollectionEncoder(config, self.checkpoint)

        self._load_disk_ivf()

        # Variables to track removal / appending of passages
        self.removed_pids = []
        self.first_new_emb = torch.sum(self.searcher.ranker.doclens).item()
        self.first_new_pid = len(self.searcher.ranker.doclens)

    def remove(self, pids):
        """
        Input:
            pids: list(int)
        Return: None

        Removes a list of pids from the searcher; these pids will no longer appear in
        future searches with this searcher. To erase the passage data from the index
        on disk, call persist_to_disk() after calling remove().
        """
        invalid_pids = self._check_pids(pids)
        if invalid_pids:
            raise ValueError("Invalid PIDs", invalid_pids)

        print_message(f"#> Removing pids: {pids}...")
        self._remove_pid_from_ivf(pids)
        self.removed_pids.extend(pids)

    def create_embs_and_doclens(
        self, passages, embs_path="embs.pt", doclens_path="doclens.pt", persist=False
    ):
        # Extend doclens and embs of self.searcher.ranker
        embs, doclens = self.encoder.encode_passages(passages)
        compressed_embs = self.searcher.ranker.codec.compress(embs)

        if persist:
            torch.save(compressed_embs, embs_path)
            torch.save(doclens, doclens_path)
        return compressed_embs, doclens
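    # Note on the values produced above (added for documentation; the shapes are
    # inferred from how update_searcher() consumes them, not from a separate spec):
    # `compressed_embs` is a ResidualEmbeddings pair whose `.codes` holds one centroid
    # id per token embedding and whose `.residuals` holds the corresponding compressed
    # residuals, while `doclens` lists the number of token embeddings per passage, so
    # sum(doclens) equals the number of newly added entries in `.codes` / `.residuals`.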
    def update_searcher(self, compressed_embs, doclens, curr_pid):
        # Update the searcher's in-memory embeddings.
        # NOTE: For codes and residuals, the tensors end with a padding of length 512,
        # hence we concatenate the new appendage in front of the padding.
        self.searcher.ranker.embeddings.codes = torch.cat(
            (
                self.searcher.ranker.embeddings.codes[:-512],
                compressed_embs.codes,
                self.searcher.ranker.embeddings.codes[-512:],
            )
        )
        self.searcher.ranker.embeddings.residuals = torch.cat(
            (
                self.searcher.ranker.embeddings.residuals[:-512],
                compressed_embs.residuals,
                self.searcher.ranker.embeddings.residuals[-512:],
            ),
            dim=0,
        )

        self.searcher.ranker.doclens = torch.cat(
            (self.searcher.ranker.doclens, torch.tensor(doclens))
        )

        # Build partitions for each pid and update IndexUpdater's current ivf
        start = 0
        ivf = self.curr_ivf.tolist()
        ivf_lengths = self.curr_ivf_lengths.tolist()
        for doclen in doclens:
            end = start + doclen
            codes = compressed_embs.codes[start:end]
            partitions, _ = self._build_passage_partitions(codes)
            ivf, ivf_lengths = self._add_pid_to_ivf(
                partitions, curr_pid, ivf, ivf_lengths
            )

            start = end
            curr_pid += 1

        assert start == sum(doclens)

        # Replace the current ivf with the new ivf
        self.curr_ivf = torch.tensor(ivf, dtype=self.curr_ivf.dtype)
        self.curr_ivf_lengths = torch.tensor(
            ivf_lengths, dtype=self.curr_ivf_lengths.dtype
        )

        # Update the ivf in the searcher
        new_ivf_tensor = StridedTensor(
            self.curr_ivf, self.curr_ivf_lengths, use_gpu=False
        )
        assert new_ivf_tensor != self.searcher.ranker.ivf
        self.searcher.ranker.ivf = new_ivf_tensor

        # Rebuild the StridedTensor within the searcher
        self.searcher.ranker.set_embeddings_strided()

    def add(self, passages):
        """
        Input:
            passages: list(string)
        Output:
            passage_ids: list(int)

        Adds new passages to the searcher. To add the passages to the index on disk,
        call persist_to_disk() after calling add().
        """
        if not self.has_checkpoint:
            raise ValueError(
                "No checkpoint was provided at IndexUpdater initialization."
            )

        # Find the pid for the first added passage
        start_pid = len(self.searcher.ranker.doclens)
        curr_pid = start_pid

        compressed_embs, doclens = self.create_embs_and_doclens(passages)
        self.update_searcher(compressed_embs, doclens, curr_pid)

        print_message(f"#> Added {len(passages)} passages from pid {start_pid}.")
        new_pids = list(range(start_pid, start_pid + len(passages)))
        return new_pids
    def persist_to_disk(self):
        """
        Persist all previously stored changes in IndexUpdater to the index on disk.
        These changes include all calls to IndexUpdater.remove() and IndexUpdater.add()
        made before persist_to_disk() is called.
        """
        print_message("#> Persisting index changes to disk")

        # Propagate all removed passages to disk
        self._load_metadata()
        for pid in self.removed_pids:
            self._remove_passage_from_disk(pid)

        # Propagate all added passages to disk
        # Rationale: keep a record of all added passages in IndexUpdater.searcher,
        # divide the passages into chunks and create / write the chunks here
        self._load_metadata()  # Reload after removal

        # Calculate the average number of passages per chunk
        curr_num_chunks = self.metadata["num_chunks"]
        last_chunk_metadata = self._load_chunk_metadata(curr_num_chunks - 1)
        if curr_num_chunks == 1:
            avg_chunksize = DEFAULT_CHUNKSIZE
        else:
            avg_chunksize = last_chunk_metadata["passage_offset"] / (
                curr_num_chunks - 1
            )
        print_message(f"#> Current average chunksize is: {avg_chunksize}.")

        # Calculate the number of additional passages we can write to the last chunk
        last_chunk_capacity = max(
            0, avg_chunksize - last_chunk_metadata["num_passages"]
        )
        print_message(
            f"#> The last chunk can hold {last_chunk_capacity} additional passages."
        )

        # Find the first and last passages to be persisted
        pid_start = self.first_new_pid
        emb_start = self.first_new_emb
        pid_last = len(self.searcher.ranker.doclens)
        emb_last = (
            emb_start + torch.sum(self.searcher.ranker.doclens[pid_start:]).item()
        )

        # First populate the last chunk
        if last_chunk_capacity > 0:
            pid_end = min(pid_last, pid_start + last_chunk_capacity)
            emb_end = (
                emb_start
                + torch.sum(self.searcher.ranker.doclens[pid_start:pid_end]).item()
            )

            # Write to the last chunk
            self._write_to_last_chunk(pid_start, pid_end, emb_start, emb_end)
            pid_start = pid_end
            emb_start = emb_end

        # Then create new chunks to hold the remaining added passages
        while pid_start < pid_last:
            pid_end = min(pid_last, pid_start + avg_chunksize)
            emb_end = (
                emb_start
                + torch.sum(self.searcher.ranker.doclens[pid_start:pid_end]).item()
            )

            # Write a new chunk with id = curr_num_chunks
            self._write_to_new_chunk(
                curr_num_chunks, pid_start, pid_end, emb_start, emb_end
            )

            curr_num_chunks += 1
            pid_start = pid_end
            emb_start = emb_end

        assert pid_start == pid_last
        assert emb_start == emb_last

        # Update metadata
        print_message("#> Updating metadata for added passages...")
        self.metadata["num_chunks"] = curr_num_chunks
        self.metadata["num_embeddings"] += torch.sum(
            self.searcher.ranker.doclens
        ).item()
        metadata_path = os.path.join(self.index_path, "metadata.json")
        with open(metadata_path, "w") as output_metadata:
            ujson.dump(self.metadata, output_metadata)

        # Save the current IVF to disk
        optimized_ivf_path = os.path.join(self.index_path, "ivf.pid.pt")
        torch.save((self.curr_ivf, self.curr_ivf_lengths), optimized_ivf_path)
        print_message(f"#> Persisted updated IVF to {optimized_ivf_path}")

        self.removed_pids = []
        self.first_new_emb = torch.sum(self.searcher.ranker.doclens).item()
        self.first_new_pid = len(self.searcher.ranker.doclens)

    # HELPER FUNCTIONS BELOW

    def _load_disk_ivf(self):
        print_message(f"#> Loading IVF...")

        if os.path.exists(os.path.join(self.index_path, "ivf.pid.pt")):
            ivf, ivf_lengths = torch.load(
                os.path.join(self.index_path, "ivf.pid.pt"), map_location="cpu"
            )
        else:
            assert os.path.exists(os.path.join(self.index_path, "ivf.pt"))
            ivf, ivf_lengths = torch.load(
                os.path.join(self.index_path, "ivf.pt"), map_location="cpu"
            )
            ivf, ivf_lengths = optimize_ivf(ivf, ivf_lengths, self.index_path)

        self.curr_ivf = ivf
        self.curr_ivf_lengths = ivf_lengths

    def _load_metadata(self):
        with open(os.path.join(self.index_path, "metadata.json")) as f:
            self.metadata = ujson.load(f)

    def _load_chunk_doclens(self, chunk_idx):
        doclens = []

        print_message("#> Loading doclens...")

        with open(os.path.join(self.index_path, f"doclens.{chunk_idx}.json")) as f:
            chunk_doclens = ujson.load(f)
            doclens.extend(chunk_doclens)

        doclens = torch.tensor(doclens)
        return doclens

    def _load_chunk_codes(self, chunk_idx):
        codes_path = os.path.join(self.index_path, f"{chunk_idx}.codes.pt")
        return torch.load(codes_path, map_location="cpu")

    def _load_chunk_residuals(self, chunk_idx):
        residuals_path = os.path.join(self.index_path, f"{chunk_idx}.residuals.pt")
        return torch.load(residuals_path, map_location="cpu")

    def _load_chunk_metadata(self, chunk_idx):
        with open(os.path.join(self.index_path, f"{chunk_idx}.metadata.json")) as f:
            chunk_metadata = ujson.load(f)
        return chunk_metadata

    def _get_chunk_idx(self, pid):
        for i in range(self.metadata["num_chunks"]):
            chunk_metadata = self._load_chunk_metadata(i)
            if (
                chunk_metadata["passage_offset"] <= pid
                and chunk_metadata["passage_offset"] + chunk_metadata["num_passages"]
                > pid
            ):
                return i
        raise ValueError("Passage ID out of range")
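    # For reference, the per-chunk files read and written by the helpers above and
    # below (field names follow _write_to_new_chunk(); the numbers are made-up
    # examples, not values from a real index):
    #
    #   {chunk_idx}.metadata.json   {"passage_offset": 25000, "num_passages": 25000,
    #                                "embedding_offset": 1650000, "num_embeddings": 1640000}
    #   doclens.{chunk_idx}.json    [64, 71, 58, ...]  # token count per passage in the chunk
    #   {chunk_idx}.codes.pt        centroid ids, loaded by _load_chunk_codes()
    #   {chunk_idx}.residuals.pt    compressed residuals, loaded by _load_chunk_residuals()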
    def _check_pids(self, pids):
        invalid_pids = []
        for pid in pids:
            if pid < 0 or pid >= len(self.searcher.ranker.doclens):
                invalid_pids.append(pid)
        return invalid_pids

    def _remove_pid_from_ivf(self, pids):
        # Helper function for IndexUpdater.remove()
        new_ivf = []
        new_ivf_lengths = []
        runner = 0
        pids = set(pids)

        # Construct a mask of where the pids to be removed appear in the ivf
        mask = torch.isin(self.curr_ivf, torch.tensor(list(pids)))
        indices = mask.nonzero()

        # Calculate the end-indices of each centroid section in the ivf
        section_end_indices = []
        c = 0
        for length in self.curr_ivf_lengths.tolist():
            c += length
            section_end_indices.append(c)

        # Record the number of pids removed from each centroid section
        removed_len = [0 for _ in range(len(section_end_indices))]
        j = 0
        for ind in indices:
            while ind >= section_end_indices[j]:
                j += 1
            removed_len[j] += 1

        # Apply the changes
        new_ivf = torch.masked_select(self.curr_ivf, ~mask)
        new_ivf_lengths = self.curr_ivf_lengths - torch.tensor(removed_len)

        new_ivf_tensor = StridedTensor(new_ivf, new_ivf_lengths, use_gpu=False)
        assert new_ivf_tensor != self.searcher.ranker.ivf
        self.searcher.ranker.ivf = new_ivf_tensor

        self.curr_ivf = new_ivf
        self.curr_ivf_lengths = new_ivf_lengths

    def _build_passage_partitions(self, codes):
        # Helper function for IndexUpdater.add()
        # Returns the ordered, unique centroid ids from the codes of a passage
        codes = codes.sort()
        ivf, values = codes.indices, codes.values
        partitions, ivf_lengths = values.unique_consecutive(return_counts=True)
        return partitions, ivf_lengths

    def _add_pid_to_ivf(self, partitions, pid, old_ivf, old_ivf_lengths):
        """
        Helper function for IndexUpdater.add()

        Input:
            partitions: list(int), centroid ids of the passage
            pid: int, passage id
            old_ivf: list(int), the current flat ivf
            old_ivf_lengths: list(int), per-centroid lengths of old_ivf
        Output:
            new_ivf, new_ivf_lengths

        Adds the pid of the new passage into each of its centroid sections of the ivf.
        """
        new_ivf = []
        new_ivf_lengths = []

        partitions_runner = 0
        ivf_runner = 0
        for i in range(len(old_ivf_lengths)):
            # First copy the existing partition pids to the new ivf
            new_ivf.extend(old_ivf[ivf_runner : ivf_runner + old_ivf_lengths[i]])
            new_ivf_lengths.append(old_ivf_lengths[i])
            ivf_runner += old_ivf_lengths[i]

            # Add the pid if partition index i is in the passage's partitions
            if (
                partitions_runner < len(partitions)
                and i == partitions[partitions_runner]
            ):
                new_ivf.append(pid)
                new_ivf_lengths[-1] += 1
                partitions_runner += 1

        assert ivf_runner == len(old_ivf)
        assert sum(new_ivf_lengths) == len(new_ivf)

        return new_ivf, new_ivf_lengths
chunk_metadata["num_embeddings"] += emb_end - emb_start chunk_metadata_path = os.path.join( self.index_path, f"{num_chunks - 1}.metadata.json" ) with open(chunk_metadata_path, "w") as output_chunk_metadata: ujson.dump(chunk_metadata, output_chunk_metadata) def _write_to_new_chunk(self, chunk_idx, pid_start, pid_end, emb_start, emb_end): # Helper function for IndexUpdater.persist_to_disk() # Save embeddings to new chunk curr_embs = ResidualEmbeddings( self.searcher.ranker.embeddings.codes[emb_start:emb_end], self.searcher.ranker.embeddings.residuals[emb_start:emb_end], ) path_prefix = os.path.join(self.index_path, f"{chunk_idx}") curr_embs.save(path_prefix) # Create doclen json file for new chunk curr_doclens = self.searcher.ranker.doclens.tolist()[pid_start:pid_end] doclens_path = os.path.join(self.index_path, f"doclens.{chunk_idx}.json") with open(doclens_path, "w+") as output_doclens: ujson.dump(curr_doclens, output_doclens) # Create metadata json file for new chunk chunk_metadata = { "passage_offset": pid_start, "num_passages": pid_end - pid_start, "embedding_offset": emb_start, "num_embeddings": emb_end - emb_start, } chunk_metadata_path = os.path.join( self.index_path, f"{chunk_idx}.metadata.json" ) with open(chunk_metadata_path, "w+") as output_chunk_metadata: ujson.dump(chunk_metadata, output_chunk_metadata) def _remove_passage_from_disk(self, pid): # Helper function for IndexUpdater.persist_to_disk() chunk_idx = self._get_chunk_idx(pid) chunk_metadata = self._load_chunk_metadata(chunk_idx) i = pid - chunk_metadata["passage_offset"] doclens = self._load_chunk_doclens(chunk_idx) codes, residuals = ( self._load_chunk_codes(chunk_idx), self._load_chunk_residuals(chunk_idx), ) # Remove embeddings from codes and residuals start = sum(doclens[:i]) end = start + doclens[i] codes = torch.cat((codes[:start], codes[end:])) residuals = torch.cat((residuals[:start], residuals[end:])) codes_path = os.path.join(self.index_path, f"{chunk_idx}.codes.pt") residuals_path = os.path.join(self.index_path, f"{chunk_idx}.residuals.pt") torch.save(codes, codes_path) torch.save(residuals, residuals_path) # Change doclen for passage to 0 doclens = doclens.tolist() doclen_to_remove = doclens[i] doclens[i] = 0 doclens_path = os.path.join(self.index_path, f"doclens.{chunk_idx}.json") with open(doclens_path, "w") as output_doclens: ujson.dump(doclens, output_doclens) # Modify chunk_metadata['num_embeddings'] for chunk_idx chunk_metadata["num_embeddings"] -= doclen_to_remove chunk_metadata_path = os.path.join( self.index_path, f"{chunk_idx}.metadata.json" ) with open(chunk_metadata_path, "w") as output_chunk_metadata: ujson.dump(chunk_metadata, output_chunk_metadata) # Modify chunk_metadata['embedding_offset'] for all later chunks (minus num_embs_removed) for idx in range(chunk_idx + 1, self.metadata["num_chunks"]): metadata = self._load_chunk_metadata(idx) metadata["embedding_offset"] -= doclen_to_remove metadata_path = os.path.join(self.index_path, f"{idx}.metadata.json") with open(metadata_path, "w") as output_chunk_metadata: ujson.dump(metadata, output_chunk_metadata) # Modify num_embeddings in overall metadata (minus num_embs_removed) self.metadata["num_embeddings"] -= doclen_to_remove metadata_path = os.path.join(self.index_path, "metadata.json") with open(metadata_path, "w") as output_metadata: ujson.dump(self.metadata, output_metadata)