# Source code for spec2vec.vector_operations

import logging
from typing import Union
import numba
import numpy as np
from gensim.models.basemodel import BaseTopicModel
from spec2vec.Document import Document


logger = logging.getLogger("spec2vec")


def calc_vector(model: BaseTopicModel, document: Document,
                intensity_weighting_power: Union[float, int] = 0,
                allowed_missing_percentage: Union[float, int] = 10) -> np.ndarray:
    """Compute document vector as a (weighted) sum of individual word vectors.

    Parameters
    ----------
    model
        Pretrained word2vec model to convert words into vectors.
    document
        Document containing document.words and document.weights.
    intensity_weighting_power
        Specify to what power weights should be raised. The default is 0, which
        means that no weighting will be done.
    allowed_missing_percentage:
        Set the maximum allowed percentage of the document that may be missing
        from the input model. This is measured as percentage of the weighted, missing
        words compared to all word vectors of the document. Default is 10, which
        means up to 10% missing words are allowed. If more words are missing from
        the model, an empty embedding will be returned (leading to similarities of 0)
        and a warning is logged.

    Returns
    -------
    vector
        Vector representing the input document in latent space. An all-zero
        vector of length ``model.wv.vector_size`` is returned if none of the
        document's words are known to the model, or if the weighted missing
        percentage is > allowed_missing_percentage.

    Raises
    ------
    AssertionError
        If any document weight is larger than 1.0 or if
        allowed_missing_percentage lies outside [0, 100].
    """
    assert max(document.weights) <= 1.0, "Weights are not normalized to unity as expected."
    assert 0 <= allowed_missing_percentage <= 100.0, "allowed_missing_percentage must be within [0,100]"

    known_keys = model.wv.key_to_index
    idx_not_in_model = [i for i, word in enumerate(document.words) if word not in known_keys]
    if len(idx_not_in_model) == len(document.words):
        logger.warning("Spectrum without peaks known by the used model. An empty vector will be returned. "
                       "Consider retraining the used model or make sure decimal rounding is correct.")
        return np.zeros(model.wv.vector_size)

    # Set lookup keeps the two filters below linear in the document length.
    missing_positions = set(idx_not_in_model)
    words_in_model = [word for i, word in enumerate(document.words) if i not in missing_positions]
    weights_in_model = np.asarray([weight for i, weight in enumerate(document.weights)
                                   if i not in missing_positions]).reshape(len(words_in_model), 1)

    word_vectors = model.wv[words_in_model]
    weights_raised = np.power(weights_in_model, intensity_weighting_power)

    if idx_not_in_model:
        # Weighted share of the document that the model cannot represent.
        weights_missing = np.array([document.weights[i] for i in idx_not_in_model])
        weights_missing_raised = np.power(weights_missing, intensity_weighting_power)
        missing_sum = weights_missing_raised.sum()
        missing_percentage = 100 * missing_sum / (weights_raised.sum() + missing_sum)
        logger.info("Found %d word(s) missing in the model. "
                    "Weighted missing percentage not covered by the given model is %.2f%%.",
                    len(idx_not_in_model), missing_percentage)

        if missing_percentage > allowed_missing_percentage:
            logger.warning("Missing percentage (%.2f%%) is above set maximum. An empty vector will be returned. "
                           "Consider retraining the used model or change the `allowed_missing_percentage`.",
                           missing_percentage)
            return np.zeros(model.wv.vector_size)

    # weights_raised has shape (n_words, 1) and broadcasts across the vector dimension.
    return np.sum(word_vectors * weights_raised, 0)


@numba.njit
def cosine_similarity_matrix(vectors_1: np.ndarray, vectors_2: np.ndarray) -> np.ndarray:
    """Fast implementation of cosine similarity between two arrays of vectors.

    For example:

    .. code-block:: python

        import numpy as np
        from spec2vec.vector_operations import cosine_similarity_matrix

        vectors_1 = np.array([[1, 1, 0, 0],
                              [1, 0, 1, 1]])
        vectors_2 = np.array([[0, 1, 1, 0],
                              [0, 0, 1, 1]])
        similarity_matrix = cosine_similarity_matrix(vectors_1, vectors_2)


    Parameters
    ----------
    vectors_1
        Numpy array of vectors. vectors_1.shape[0] is number of vectors, vectors_1.shape[1]
        is vector dimension.
    vectors_2
        Numpy array of vectors. vectors_2.shape[0] is number of vectors, vectors_2.shape[1]
        is vector dimension.

    Returns
    -------
    similarity_matrix
        Array of shape (vectors_1.shape[0], vectors_2.shape[0]) where entry [i, j]
        is the cosine similarity between vectors_1[i] and vectors_2[j]. Entries
        involving an all-zero vector are 0.
    """
    assert vectors_1.shape[1] == vectors_2.shape[1], "Input vectors must have same shape."
    # astype returns copies, so the normalisation below never touches the caller's arrays.
    vectors_1 = vectors_1.astype(np.float64)  # Numba dot only accepts float or complex arrays
    vectors_2 = vectors_2.astype(np.float64)
    norm_1 = np.sqrt(np.sum(vectors_1**2, axis=1))
    norm_2 = np.sqrt(np.sum(vectors_2**2, axis=1))
    for i in range(vectors_1.shape[0]):
        # Leave all-zero rows untouched: dividing by a zero norm would yield NaN.
        if norm_1[i] != 0:
            vectors_1[i] = vectors_1[i] / norm_1[i]
    for i in range(vectors_2.shape[0]):
        if norm_2[i] != 0:
            vectors_2[i] = vectors_2[i] / norm_2[i]
    return np.dot(vectors_1, vectors_2.T)


@numba.njit
def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> np.float64:
    """Return the cosine similarity of two equally long input vectors.

    For example:

    .. testcode::

        import numpy as np
        from spec2vec.vector_operations import cosine_similarity

        vector1 = np.array([1, 1, 0, 0])
        vector2 = np.array([1, 1, 1, 1])
        print("Cosine similarity: {:.3f}".format(cosine_similarity(vector1, vector2)))

    Should output

    .. testoutput::

        Cosine similarity: 0.707

    Parameters
    ----------
    vector1
        Input vector. Can be array of integers or floats.
    vector2
        Input vector. Can be array of integers or floats.

    Returns
    -------
    Cosine similarity as np.float64. The result is 0 when either vector has
    a zero squared norm.
    """
    length = vector1.shape[0]
    assert length == vector2.shape[0], "Input vector must have same shape."

    # Accumulate the dot product and both squared norms in a single pass.
    dot_12 = 0
    sq_norm_1 = 0
    sq_norm_2 = 0
    for k in range(length):
        a = vector1[k]
        b = vector2[k]
        dot_12 += a * b
        sq_norm_1 += a * a
        sq_norm_2 += b * b

    # A zero vector has no direction; report no similarity instead of dividing by zero.
    if sq_norm_1 == 0 or sq_norm_2 == 0:
        return np.float64(0)
    return np.float64(dot_12 / np.sqrt(sq_norm_1 * sq_norm_2))
# Source code for spec2vec.vector_operations
#
# spec2vec
#
# Navigation
#
# Related Topics