# Source code for spec2vec.vector_operations

import logging
from typing import Union
import numba
import numpy as np
from gensim.models.basemodel import BaseTopicModel
from spec2vec.Document import Document


# Package-wide logger registered under the explicit name "spec2vec"
# (not __name__), so all log records from this module appear under
# the "spec2vec" namespace.
logger = logging.getLogger("spec2vec")


def calc_vector(model: BaseTopicModel, document: Document,
                intensity_weighting_power: Union[float, int] = 0,
                allowed_missing_percentage: Union[float, int] = 10) -> np.ndarray:
    """Compute document vector as a (weighted) sum of individual word vectors.

    Parameters
    ----------
    model
        Pretrained word2vec model to convert words into vectors.
    document
        Document containing document.words and document.weights.
    intensity_weighting_power
        Specify to what power weights should be raised. The default is 0, which
        means that no weighing will be done.
    allowed_missing_percentage:
        Set the maximum allowed percentage of the document that may be missing
        from the input model. This is measured as percentage of the weighted,
        missing words compared to all word vectors of the document. Default is
        10, which means up to 10% missing words are allowed. If more words are
        missing from the model, an all-zero vector will be returned (leading to
        similarities of 0) and a warning is raised.

    Returns
    -------
    vector
        Vector representing the input document in latent space. Will return an
        all-zero vector if the missing percentage of the document in the model
        is > allowed_missing_percentage.
    """
    assert max(document.weights) <= 1.0, "Weights are not normalized to unity as expected."
    assert 0 <= allowed_missing_percentage <= 100.0, "allowed_missing_percentage must be within [0,100]"

    def _check_model_coverage():
        """Return True if model covers enough of the document words."""
        if len(idx_not_in_model) > 0:
            weights_missing = np.array([document.weights[i] for i in idx_not_in_model])
            weights_missing_raised = np.power(weights_missing, intensity_weighting_power)
            missing_percentage = 100 * weights_missing_raised.sum() / (weights_raised.sum()
                                                                       + weights_missing_raised.sum())
            # Bug fix: messages were previously built as tuples of strings
            # (comma-separated), so logging emitted a tuple repr instead of a
            # readable message. Use proper (lazy) log formatting instead.
            logger.info("Found %s word(s) missing in the model. "
                        "Weighted missing percentage not covered by the given model is %.2f%%.",
                        len(idx_not_in_model), missing_percentage)
            if missing_percentage > allowed_missing_percentage:
                logger.warning("Missing percentage (%.2f%%) is above set maximum. "
                               "An empty vector will be returned. "
                               "Consider retraining the used model or change the `allowed_missing_percentage`.",
                               missing_percentage)
                return False
        return True

    idx_not_in_model = [i for i, x in enumerate(document.words) if x not in model.wv.key_to_index]
    if len(idx_not_in_model) == len(document.words):
        logger.warning("Spectrum without peaks known by the used model. "
                       "An empty vector will be returned. "
                       "Consider retraining the used model or make sure decimal rounding is correct.")
        return np.zeros(model.wv.vector_size)

    # Use a set for O(1) membership tests in the comprehensions below
    # (idx_not_in_model is a list, so `in` would be O(n) per element).
    missing_indices = set(idx_not_in_model)
    words_in_model = [x for i, x in enumerate(document.words) if i not in missing_indices]
    # Column vector of weights so it can broadcast against the word vectors.
    weights_in_model = np.asarray([x for i, x in enumerate(document.weights)
                                   if i not in missing_indices]).reshape(len(words_in_model), 1)

    word_vectors = model.wv[words_in_model]
    weights_raised = np.power(weights_in_model, intensity_weighting_power)

    if _check_model_coverage() is True:
        weights_raised_tiled = np.tile(weights_raised, (1, model.wv.vector_size))
        return np.sum(word_vectors * weights_raised_tiled, 0)
    return np.zeros(model.wv.vector_size)
@numba.njit
def cosine_similarity_matrix(vectors_1: np.ndarray, vectors_2: np.ndarray) -> np.ndarray:
    """Fast implementation of cosine similarity between two arrays of vectors.

    For example:

    .. code-block:: python

        import numpy as np
        from spec2vec.vector_operations import cosine_similarity_matrix

        vectors_1 = np.array([[1, 1, 0, 0],
                              [1, 0, 1, 1]])
        vectors_2 = np.array([[0, 1, 1, 0],
                              [0, 0, 1, 1]])
        similarity_matrix = cosine_similarity_matrix(vectors_1, vectors_2)

    All-zero vectors yield a similarity of 0 (not NaN), consistent with
    ``cosine_similarity``.

    Parameters
    ----------
    vectors_1
        Numpy array of vectors. vectors_1.shape[0] is number of vectors,
        vectors_1.shape[1] is vector dimension.
    vectors_2
        Numpy array of vectors. vectors_2.shape[0] is number of vectors,
        vectors_2.shape[1] is vector dimension.
    """
    assert vectors_1.shape[1] == vectors_2.shape[1], "Input vectors must have same shape."
    vectors_1 = vectors_1.astype(np.float64)  # Numba dot only accepts float or complex arrays
    vectors_2 = vectors_2.astype(np.float64)
    norm_1 = np.sqrt(np.sum(vectors_1**2, axis=1))
    norm_2 = np.sqrt(np.sum(vectors_2**2, axis=1))
    # Normalize each row in place (astype above already made copies).
    # Bug fix: skip zero-norm rows instead of dividing by zero, which
    # produced NaN similarities; zero rows now give similarity 0, matching
    # the scalar cosine_similarity function.
    for i in range(vectors_1.shape[0]):
        if norm_1[i] > 0:
            vectors_1[i] = vectors_1[i] / norm_1[i]
    for i in range(vectors_2.shape[0]):
        if norm_2[i] > 0:
            vectors_2[i] = vectors_2[i] / norm_2[i]
    return np.dot(vectors_1, vectors_2.T)
@numba.njit
def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> np.float64:
    """Calculate cosine similarity between two input vectors.

    For example:

    .. testcode::

        import numpy as np
        from spec2vec.vector_operations import cosine_similarity

        vector1 = np.array([1, 1, 0, 0])
        vector2 = np.array([1, 1, 1, 1])
        print("Cosine similarity: {:.3f}".format(cosine_similarity(vector1, vector2)))

    Should output

    .. testoutput::

        Cosine similarity: 0.707

    Parameters
    ----------
    vector1
        Input vector. Can be array of integers or floats.
    vector2
        Input vector. Can be array of integers or floats.
    """
    assert vector1.shape[0] == vector2.shape[0], "Input vector must have same shape."
    # Accumulate dot product and squared norms in a single explicit loop so
    # that numba can compile this for integer as well as float inputs.
    dot_product = 0
    squared_norm1 = 0
    squared_norm2 = 0
    for idx in range(vector1.shape[0]):
        value1 = vector1[idx]
        value2 = vector2[idx]
        dot_product += value1 * value2
        squared_norm1 += value1 * value1
        squared_norm2 += value2 * value2
    # Zero-length (all-zero) vectors have undefined cosine; return 0 instead.
    if squared_norm1 == 0 or squared_norm2 == 0:
        return np.float64(0)
    return np.float64(dot_product / np.sqrt(squared_norm1 * squared_norm2))