Module ktrain.text.kw.core

Source code
import warnings
from collections import Counter

from ... import imports as I
from .. import textutils as TU

try:
    import textblob

    TEXTBLOB_INSTALLED = True
except ImportError:
    TEXTBLOB_INSTALLED = False

SUPPORTED_LANGS = {
    "en": "english",
    "ar": "arabic",
    "az": "azerbaijani",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "kk": "kazakh",
    "ne": "nepali",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tg": "tajik",
    "tr": "turkish",
    "zh": "chinese",
}


class KeywordExtractor:
    """
    Keyphrase Extraction
    """

    def __init__(
        self, lang="en", custom_stopwords=["et al", "et", "al", "n't", "did", "does"]
    ):
        """
        ```
        Keyphrase Extraction

        Args:
          lang(str):  2-character ISO 639-1 language code (e.g., "en")
          custom_stopwords(list): list of custom stopwords to ignore
        ```
        """
        # error checks
        if not TEXTBLOB_INSTALLED:
            raise Exception(
                "The textblob package is required for keyphrase extraction: pip install textblob; python -m textblob.download_corpora"
            )
        if lang not in SUPPORTED_LANGS:
            raise ValueError(
                f'lang="{lang}" is not supported. Supported 2-character ISO 639-1 language codes are: {SUPPORTED_LANGS}'
            )
        self.lang = lang

        # build blacklist
        from nltk.corpus import stopwords as nltk_stopwords
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        if lang == "en":
            stopwords = list(ENGLISH_STOP_WORDS)
        elif lang == "zh":
            stopwords = TU.chinese_stopwords()
        else:
            # lang was already validated against SUPPORTED_LANGS above
            stopwords = nltk_stopwords.words(SUPPORTED_LANGS[lang])
        # add user-supplied stopwords exactly once
        self.blacklist = stopwords + custom_stopwords

    def extract_keywords(
        self,
        text,
        ngram_range=(1, 3),
        top_n=10,
        n_candidates=50,
        omit_scores=False,
        candidate_generator="ngrams",
        constrain_unigram_case=True,
        maxlen=64,
    ):
        """
        ```
        simple keyword extraction

        This is a simplified TextBlob implementation of the KERA algorithm from:
          https://arxiv.org/pdf/1308.2359.pdf

        Args:
          text(str): the text as a unicode string
          ngram_range(tuple): the ngram range.  Example: (1,3) considers unigrams, bigrams, and trigrams as candidates
          top_n(int): number of keyphrases to return
          n_candidates(int): number of candidates considered when ranking
          omit_scores(bool):  If True, no scores are returned.
          candidate_generator(str):  Either 'noun_phrases' or 'ngrams'.
                                     The default 'ngrams' method will be faster.
          constrain_unigram_case(bool): Only applies if candidate_generator=='ngrams'.
                                        If True, only unigrams in uppercase are returned (e.g., LDA, SVM, NASA).
                                        True is recommended.
          maxlen(int): maximum number of characters in keyphrase. Default: 64

        Returns:
          list
        ```

        """
        if candidate_generator not in ["noun_phrases", "ngrams"]:
            raise ValueError(
                'candidate_generator must be one of {"noun_phrases", "ngrams"}'
            )
        if self.lang == "zh":
            text = " ".join(I.jieba.cut(text, HMM=False))
        if candidate_generator == "noun_phrases" and self.lang != "en":
            warnings.warn(
                f'lang={self.lang} but candidate_generator="noun_phrases" is not supported. '
                + 'Falling back to candidate_generator="ngrams"'
            )
            candidate_generator = "ngrams"

        blob = textblob.TextBlob(text)
        candidates = []
        min_n, max_n = ngram_range
        ngram_lens = list(range(min_n, max_n + 1))

        # generate ngrams or noun phrases
        ngrams = {}
        if candidate_generator == "ngrams":
            for n in ngram_lens:
                ngrams[n] = blob.ngrams(n=n)
        else:
            noun_phrases = blob.noun_phrases
            for np in noun_phrases:
                words = np.split()
                n = len(words)
                if n not in ngram_lens:
                    continue
                lst = ngrams.get(n, [])
                lst.append(words)
                ngrams[n] = lst

        # generate candidates
        for n in range(min_n, max_n + 1):
            if n == 1:
                # constrain_unigram_case applies only to the 'ngrams' generator:
                # when set, keep only all-uppercase unigrams (e.g., LDA)
                grams = [
                    k[0].lower()
                    for k in ngrams.get(n, [])
                    if not any(w.lower() in self.blacklist for w in k)
                    and (
                        not constrain_unigram_case
                        or candidate_generator != "ngrams"
                        or k[0].isupper()
                    )
                ]
            else:
                grams = [
                    " ".join(k).lower()
                    for k in ngrams.get(n, [])
                    if not any(w.lower() in self.blacklist for w in k)
                    and len(set(k)) != 1
                    and len(k[0]) > 1
                    and len(k[1]) > 1
                ]
            # keep candidates that start with a letter, end with a letter or
            # digit, contain no "@", and have at least 3 characters that are
            # not spaces, hyphens, or periods
            candidates.extend(
                [
                    kw
                    for kw in grams
                    if kw[0].isalpha()
                    and len([ch for ch in kw if not ch.isspace() and ch not in ["-", "."]])
                    >= 3
                    and (kw[-1].isalpha() or kw[-1].isdigit())
                    and "@" not in kw
                ]
            )
        cnt = Counter(candidates)
        tups = cnt.most_common(n_candidates)

        # drop overly long keyphrases, then normalize counts so scores sum to 1
        tups = [tup for tup in tups if len(tup[0]) <= maxlen]
        total = sum(count for _, count in tups)
        result = [(kw, count / total) for kw, count in tups]
        return [r[0] for r in result[:top_n]] if omit_scores else result[:top_n]
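
To make the candidate-generation step concrete, here is a standalone sketch of the same idea using TextBlob directly (the sample sentence and toy blacklist are illustrative, not part of this module):

```
import textblob

blob = textblob.TextBlob("Support vector machines are supervised learning models.")
blacklist = {"are", "the", "of"}  # toy stopword set for illustration

# bigram candidates, skipping any n-gram that contains a blacklisted word
bigrams = [
    " ".join(g).lower()
    for g in blob.ngrams(n=2)
    if not any(w.lower() in blacklist for w in g)
]
print(bigrams)
# expected: ['support vector', 'vector machines', 'supervised learning', 'learning models']
```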

Classes

class KeywordExtractor (lang='en', custom_stopwords=['et al', 'et', 'al', "n't", 'did', 'does'])

Keyphrase Extraction

Args:
  lang(str):  2-character ISO 639-1 language code (e.g., "en")
  custom_stopwords(list): list of custom stopwords to ignore
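
A minimal construction sketch (the import path and the custom stopword are assumptions for illustration):

```
from ktrain.text.kw import KeywordExtractor  # import path assumed

# requires: pip install textblob; python -m textblob.download_corpora
# note: passing custom_stopwords replaces the default list shown above
kwe = KeywordExtractor(lang="en", custom_stopwords=["preprint"])
```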

Methods

def extract_keywords(self, text, ngram_range=(1, 3), top_n=10, n_candidates=50, omit_scores=False, candidate_generator='ngrams', constrain_unigram_case=True, maxlen=64)
simple keyword extraction

This is a simplified TextBlob implementation of the KERA algorithm from:
  <https://arxiv.org/pdf/1308.2359.pdf>

Args:
  text(str): the text as a unicode string
  ngram_range(tuple): the ngram range.  Example: (1,3) considers unigrams, bigrams, and trigrams as candidates
  top_n(int): number of keyphrases to return
  n_candidates(int): number of candidates considered when ranking
  omit_scores(bool):  If True, no scores are returned.
  candidate_generator(str):  Either 'noun_phrases' or 'ngrams'.
                             The default 'ngrams' method will be faster.
  constrain_unigram_case(bool): Only applies if candidate_generator=='ngrams'.
                                If True, only unigrams in uppercase are returned (e.g., LDA, SVM, NASA).
                                True is recommended.
  maxlen(int): maximum number of characters in keyphrase. Default: 64

Returns:
  list
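
For reference, a short usage sketch of this method (the sample text is illustrative; the import path is assumed):

```
from ktrain.text.kw import KeywordExtractor  # import path assumed

kwe = KeywordExtractor(lang="en")
text = (
    "Latent Dirichlet Allocation (LDA) is a generative probabilistic "
    "model for collections of discrete data such as text corpora."
)

# ranked (keyphrase, normalized_score) tuples
print(kwe.extract_keywords(text, top_n=5))

# keyphrases only
print(kwe.extract_keywords(text, top_n=5, omit_scores=True))
```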