Module hashformers.segmenter
Expand source code
from hashformers.beamsearch.algorithm import Beamsearch
from hashformers.beamsearch.reranker import Reranker
from hashformers.beamsearch.data_structures import enforce_prob_dict
from hashformers.ensemble.top2_fusion import top2_ensemble
from typing import List, Union, Any
class WordSegmenter(object):
    """Hashtag segmentation API combining a beam-search segmenter with an optional reranker."""

    def __init__(
        self,
        segmenter_model_name_or_path = "gpt2",
        segmenter_model_type = "gpt2",
        segmenter_device = "cuda",
        segmenter_gpu_batch_size = 1,
        reranker_gpu_batch_size = 2000,
        reranker_model_name_or_path = "bert-base-uncased",
        reranker_model_type = "bert"
    ):
        """Word segmentation API initialization.

        A GPT-2 model must be passed to `segmenter_model_name_or_path`, and optionally a BERT model to `reranker_model_name_or_path`.
        If `reranker_model_name_or_path` is set to `False` or `None`, the word segmenter object will work without a reranker.

        Args:
            segmenter_model_name_or_path (str, optional): GPT-2 that will be fetched from the Hugging Face Model Hub. Defaults to "gpt2".
            segmenter_model_type (str, optional): Transformer decoder model type. Defaults to "gpt2".
            segmenter_device (str, optional): Device. Defaults to "cuda".
            segmenter_gpu_batch_size (int, optional): Segmenter GPU batch size. Defaults to 1.
            reranker_gpu_batch_size (int, optional): Reranker GPU split size. Defaults to 2000.
            reranker_model_name_or_path (str, optional): BERT model that will be fetched from the Hugging Face Model Hub. It is possible to turn off the reranker by passing a None or False value to this argument. Defaults to "bert-base-uncased".
            reranker_model_type (str, optional): Transformer encoder model type. Defaults to "bert".
        """
        self.segmenter_model = Beamsearch(
            model_name_or_path=segmenter_model_name_or_path,
            model_type=segmenter_model_type,
            device=segmenter_device,
            gpu_batch_size=segmenter_gpu_batch_size
        )

        # Any falsy value (None, False, "") disables the reranker; `segment`
        # checks for the resulting None before trying to rerank.
        if reranker_model_name_or_path:
            self.reranker_model = Reranker(
                model_name_or_path=reranker_model_name_or_path,
                model_type=reranker_model_type,
                gpu_batch_size=reranker_gpu_batch_size
            )
        else:
            self.reranker_model = None

    def segment(
        self,
        word_list: List[str],
        topk: int = 20,
        steps: int = 13,
        alpha: float = 0.222,
        beta: float = 0.111,
        use_reranker: bool = True,
        return_ranks: bool = False,
        trim_hashtags: bool = True) -> Any :
        """Segment a list of hashtags.

        Args:
            word_list (List[str]): A list of hashtag strings.
            topk (int, optional):
                top-k parameter for the Beamsearch algorithm.
                A lower top-k value will speed up the algorithm.
                However, this will decrease the amount of candidate segmentations in a rank.
                Defaults to 20.
            steps (int, optional):
                steps parameter for the Beamsearch algorithm.
                A lower amount of steps will speed up the algorithm.
                However, the algorithm will never detect a number of words larger than amount of steps.
                Defaults to 13.
            alpha (float, optional):
                alpha parameter for the top-2 ensemble.
                It controls the weight given to the segmenter candidates.
                Reasonable values range from 0 to 1.
                Defaults to 0.222.
            beta (float, optional):
                beta parameter for the top-2 ensemble.
                It controls the weight given to the reranker candidates.
                Reasonable values range from 0 to 1.
                Defaults to 0.111.
            use_reranker (bool, optional):
                Whether or not to run the reranker.
                Ignored (treated as False) when the object was created without a reranker model.
                Defaults to True.
            return_ranks (bool, optional):
                Return not just the segmented hashtags but also a dictionary of the ranks.
                Defaults to False.
            trim_hashtags (bool, optional):
                Automatically remove "#" characters from the beginning of the hashtags.
                Defaults to True.

        Returns:
            Any: A list of segmented hashtags if return_ranks == False. A dictionary of the ranks and the segmented hashtags if return_ranks == True.
            The dictionary has the keys "segmenter", "reranker", "ensemble" and "segmentations"; "reranker" and "ensemble" are None when the reranker did not run.
        """
        if trim_hashtags:
            word_list = [x.lstrip("#") for x in word_list]

        segmenter_run = self.segmenter_model.run(
            word_list,
            topk=topk,
            steps=steps
        )

        # The constructor leaves `reranker_model` as None when the reranker is
        # turned off. Since `use_reranker` defaults to True, calling `.rerank`
        # unconditionally would raise AttributeError on None; fall back to the
        # segmenter-only path instead.
        use_reranker = use_reranker and self.reranker_model is not None

        ensemble = None
        reranker_run = None
        if use_reranker:
            reranker_run = self.reranker_model.rerank(segmenter_run)
            ensemble = top2_ensemble(
                segmenter_run,
                reranker_run,
                alpha=alpha,
                beta=beta
            )
            prob_dict = enforce_prob_dict(
                ensemble,
                score_field="ensemble_rank"
            )
        else:
            prob_dict = enforce_prob_dict(
                segmenter_run,
                score_field="score"
            )

        segs = prob_dict.get_segmentations(
            astype="list",
            gold_array=word_list
        )

        if not return_ranks:
            return segs

        segmenter_df = segmenter_run.to_dataframe().reset_index(drop=True)
        if use_reranker:
            reranker_df = reranker_run.to_dataframe().reset_index(drop=True)
        else:
            reranker_df = None

        return {
            "segmenter": segmenter_df,
            "reranker": reranker_df,
            "ensemble": ensemble,
            "segmentations": segs
        }
Classes
class WordSegmenter (segmenter_model_name_or_path='gpt2', segmenter_model_type='gpt2', segmenter_device='cuda', segmenter_gpu_batch_size=1, reranker_gpu_batch_size=2000, reranker_model_name_or_path='bert-base-uncased', reranker_model_type='bert')
-
Word segmentation API initialization. A GPT-2 model must be passed to
segmenter_model_name_or_path
, and optionally a BERT model to reranker_model_name_or_path
. If reranker_model_name_or_path
is set to False
or None
, the word segmenter object will work without a reranker.
Args
segmenter_model_name_or_path
:str
, optional- GPT-2 that will be fetched from the Hugging Face Model Hub. Defaults to "gpt2".
segmenter_model_type
:str
, optional- Transformer decoder model type. Defaults to "gpt2".
segmenter_device
:str
, optional- Device. Defaults to "cuda".
segmenter_gpu_batch_size
:int
, optional- Segmenter GPU batch size. Defaults to 1.
reranker_gpu_batch_size
:int
, optional- Reranker GPU split size. Defaults to 2000.
reranker_model_name_or_path
:str
, optional- BERT model that will be fetched from the Hugging Face Model Hub. It is possible to turn off the reranker by passing a None or False value to this argument. Defaults to "bert-base-uncased".
reranker_model_type
:str
, optional- Transformer encoder model type. Defaults to "bert".
Expand source code
class WordSegmenter(object):
    """Hashtag segmentation API combining a beam-search segmenter with an optional reranker."""

    def __init__(
        self,
        segmenter_model_name_or_path = "gpt2",
        segmenter_model_type = "gpt2",
        segmenter_device = "cuda",
        segmenter_gpu_batch_size = 1,
        reranker_gpu_batch_size = 2000,
        reranker_model_name_or_path = "bert-base-uncased",
        reranker_model_type = "bert"
    ):
        """Word segmentation API initialization.

        A GPT-2 model must be passed to `segmenter_model_name_or_path`, and optionally a BERT model to `reranker_model_name_or_path`.
        If `reranker_model_name_or_path` is set to `False` or `None`, the word segmenter object will work without a reranker.

        Args:
            segmenter_model_name_or_path (str, optional): GPT-2 that will be fetched from the Hugging Face Model Hub. Defaults to "gpt2".
            segmenter_model_type (str, optional): Transformer decoder model type. Defaults to "gpt2".
            segmenter_device (str, optional): Device. Defaults to "cuda".
            segmenter_gpu_batch_size (int, optional): Segmenter GPU batch size. Defaults to 1.
            reranker_gpu_batch_size (int, optional): Reranker GPU split size. Defaults to 2000.
            reranker_model_name_or_path (str, optional): BERT model that will be fetched from the Hugging Face Model Hub. It is possible to turn off the reranker by passing a None or False value to this argument. Defaults to "bert-base-uncased".
            reranker_model_type (str, optional): Transformer encoder model type. Defaults to "bert".
        """
        self.segmenter_model = Beamsearch(
            model_name_or_path=segmenter_model_name_or_path,
            model_type=segmenter_model_type,
            device=segmenter_device,
            gpu_batch_size=segmenter_gpu_batch_size
        )

        # A falsy name/path switches the reranker off; `segment` must then
        # cope with `reranker_model` being None.
        if reranker_model_name_or_path:
            self.reranker_model = Reranker(
                model_name_or_path=reranker_model_name_or_path,
                model_type=reranker_model_type,
                gpu_batch_size=reranker_gpu_batch_size
            )
        else:
            self.reranker_model = None

    def segment(
        self,
        word_list: List[str],
        topk: int = 20,
        steps: int = 13,
        alpha: float = 0.222,
        beta: float = 0.111,
        use_reranker: bool = True,
        return_ranks: bool = False,
        trim_hashtags: bool = True) -> Any :
        """Segment a list of hashtags.

        Args:
            word_list (List[str]): A list of hashtag strings.
            topk (int, optional):
                top-k parameter for the Beamsearch algorithm.
                A lower top-k value will speed up the algorithm.
                However, this will decrease the amount of candidate segmentations in a rank.
                Defaults to 20.
            steps (int, optional):
                steps parameter for the Beamsearch algorithm.
                A lower amount of steps will speed up the algorithm.
                However, the algorithm will never detect a number of words larger than amount of steps.
                Defaults to 13.
            alpha (float, optional):
                alpha parameter for the top-2 ensemble.
                It controls the weight given to the segmenter candidates.
                Reasonable values range from 0 to 1.
                Defaults to 0.222.
            beta (float, optional):
                beta parameter for the top-2 ensemble.
                It controls the weight given to the reranker candidates.
                Reasonable values range from 0 to 1.
                Defaults to 0.111.
            use_reranker (bool, optional):
                Whether or not to run the reranker.
                Ignored (treated as False) when the object was created without a reranker model.
                Defaults to True.
            return_ranks (bool, optional):
                Return not just the segmented hashtags but also a dictionary of the ranks.
                Defaults to False.
            trim_hashtags (bool, optional):
                Automatically remove "#" characters from the beginning of the hashtags.
                Defaults to True.

        Returns:
            Any: A list of segmented hashtags if return_ranks == False. A dictionary of the ranks and the segmented hashtags if return_ranks == True.
        """
        if trim_hashtags:
            word_list = [x.lstrip("#") for x in word_list]

        segmenter_run = self.segmenter_model.run(
            word_list,
            topk=topk,
            steps=steps
        )

        # Guard against a disabled reranker: `use_reranker` defaults to True,
        # so without this check `.rerank` would be called on None.
        run_reranker = use_reranker and self.reranker_model is not None

        ensemble = None
        reranker_run = None
        if run_reranker:
            reranker_run = self.reranker_model.rerank(segmenter_run)
            ensemble = top2_ensemble(
                segmenter_run,
                reranker_run,
                alpha=alpha,
                beta=beta
            )
            prob_dict = enforce_prob_dict(
                ensemble,
                score_field="ensemble_rank"
            )
        else:
            prob_dict = enforce_prob_dict(
                segmenter_run,
                score_field="score"
            )

        segs = prob_dict.get_segmentations(
            astype="list",
            gold_array=word_list
        )

        if not return_ranks:
            return segs

        segmenter_df = segmenter_run.to_dataframe().reset_index(drop=True)
        if run_reranker:
            reranker_df = reranker_run.to_dataframe().reset_index(drop=True)
        else:
            reranker_df = None

        return {
            "segmenter": segmenter_df,
            "reranker": reranker_df,
            "ensemble": ensemble,
            "segmentations": segs
        }
Methods
def segment(self, word_list: List[str], topk: int = 20, steps: int = 13, alpha: float = 0.222, beta: float = 0.111, use_reranker: bool = True, return_ranks: bool = False, trim_hashtags: bool = True) ‑> Any
-
Segment a list of hashtags.
Args
word_list
:List[str]
- A list of hashtag strings.
topk
:int
, optional- top-k parameter for the Beamsearch algorithm. A lower top-k value will speed up the algorithm. However, this will decrease the amount of candidate segmentations in a rank. Defaults to 20.
steps
:int
, optional- steps parameter for the Beamsearch algorithm. A lower amount of steps will speed up the algorithm. However, the algorithm will never detect a number of words larger than amount of steps. Defaults to 13.
alpha
:float
, optional- alpha parameter for the top-2 ensemble. It controls the weight given to the segmenter candidates. Reasonable values range from 0 to 1. Defaults to 0.222.
beta
:float
, optional- beta parameter for the top-2 ensemble. It controls the weight given to the reranker candidates. Reasonable values range from 0 to 1. Defaults to 0.111.
use_reranker
:bool
, optional- Whether or not to run the reranker. Defaults to True.
return_ranks
:bool
, optional- Return not just the segmented hashtags but also a dictionary of the ranks. Defaults to False.
trim_hashtags
:bool
, optional- Automatically remove "#" characters from the beginning of the hashtags. Defaults to True.
Returns
Any
- A list of segmented hashtags if return_ranks == False. A dictionary of the ranks and the segmented hashtags if return_ranks == True.
Expand source code
def segment(
    self,
    word_list: List[str],
    topk: int = 20,
    steps: int = 13,
    alpha: float = 0.222,
    beta: float = 0.111,
    use_reranker: bool = True,
    return_ranks: bool = False,
    trim_hashtags: bool = True) -> Any :
    """Segment a list of hashtags.

    Args:
        word_list (List[str]): A list of hashtag strings.
        topk (int, optional):
            top-k parameter for the Beamsearch algorithm.
            A lower top-k value will speed up the algorithm.
            However, this will decrease the amount of candidate segmentations in a rank.
            Defaults to 20.
        steps (int, optional):
            steps parameter for the Beamsearch algorithm.
            A lower amount of steps will speed up the algorithm.
            However, the algorithm will never detect a number of words larger than amount of steps.
            Defaults to 13.
        alpha (float, optional):
            alpha parameter for the top-2 ensemble.
            It controls the weight given to the segmenter candidates.
            Reasonable values range from 0 to 1.
            Defaults to 0.222.
        beta (float, optional):
            beta parameter for the top-2 ensemble.
            It controls the weight given to the reranker candidates.
            Reasonable values range from 0 to 1.
            Defaults to 0.111.
        use_reranker (bool, optional):
            Whether or not to run the reranker.
            Ignored (treated as False) when `self.reranker_model` is None.
            Defaults to True.
        return_ranks (bool, optional):
            Return not just the segmented hashtags but also a dictionary of the ranks.
            Defaults to False.
        trim_hashtags (bool, optional):
            Automatically remove "#" characters from the beginning of the hashtags.
            Defaults to True.

    Returns:
        Any: A list of segmented hashtags if return_ranks == False. A dictionary of the ranks and the segmented hashtags if return_ranks == True.
    """
    if trim_hashtags:
        word_list = [x.lstrip("#") for x in word_list]

    segmenter_run = self.segmenter_model.run(
        word_list,
        topk=topk,
        steps=steps
    )

    # `use_reranker` defaults to True, so a None `reranker_model` must be
    # detected here; otherwise `.rerank` would raise AttributeError on None.
    # In that case only the segmenter scores are used.
    reranker_available = self.reranker_model is not None
    use_reranker = use_reranker and reranker_available

    ensemble = None
    reranker_run = None
    if use_reranker:
        reranker_run = self.reranker_model.rerank(segmenter_run)
        ensemble = top2_ensemble(
            segmenter_run,
            reranker_run,
            alpha=alpha,
            beta=beta
        )
        prob_dict = enforce_prob_dict(
            ensemble,
            score_field="ensemble_rank"
        )
    else:
        prob_dict = enforce_prob_dict(
            segmenter_run,
            score_field="score"
        )

    segs = prob_dict.get_segmentations(
        astype="list",
        gold_array=word_list
    )

    if not return_ranks:
        return segs

    segmenter_df = segmenter_run.to_dataframe().reset_index(drop=True)
    if use_reranker:
        reranker_df = reranker_run.to_dataframe().reset_index(drop=True)
    else:
        reranker_df = None

    return {
        "segmenter": segmenter_df,
        "reranker": reranker_df,
        "ensemble": ensemble,
        "segmentations": segs
    }