Module hashformers.experiments.utils
Expand source code
import pandas as pd
import copy
import numpy as np
from hashformers.experiments.evaluation import (
filter_top_k
)
def project_scores(
        a,
        b,
        segmentation_field="segmentation",
        score_field="score"):
    """Re-score the rows of ``a`` with the scores ``b`` assigns to them.

    Rows are matched on ``segmentation_field`` with a left join, so every
    row of ``a`` is kept; segmentations that do not occur in ``b`` end up
    with a missing (NaN) score.

    Args:
        a: DataFrame whose rows are kept. Any ``score_field`` column it
            already carries is discarded.
        b: DataFrame providing the ``segmentation_field`` and
            ``score_field`` columns used for the lookup.
        segmentation_field: Name of the join column.
        score_field: Name of the score column.

    Returns:
        A new DataFrame with the columns of ``a`` (minus its own score)
        followed by ``score_field`` taken from ``b``, sorted by that score
        in ascending order.
    """
    # Keep one score per segmentation (first occurrence in ``b`` wins) so
    # the left join cannot multiply the rows of ``a``.
    b_view = b[[segmentation_field, score_field]]\
        .drop_duplicates(subset=[segmentation_field])
    # Discard a's own score before merging instead of dropping the
    # suffixed '<score>_x' column afterwards: that column only exists when
    # ``a`` has a score, so the old approach raised KeyError otherwise.
    a_view = a.drop(columns=[score_field], errors='ignore')
    df = pd.merge(a_view, b_view, on=segmentation_field, how='left')
    df = df.sort_values(by=score_field, ascending=True)
    return df
def filter_and_project_scores(
        a,
        b,
        characters_field="hashtag",
        segmentation_field="segmentation"):
    """Restrict model ``a`` to its top candidates and re-score them with ``b``.

    Args:
        a: DataFrame of candidates for the first model.
        b: DataFrame of candidates for the second model.
        characters_field: Column used as the primary sort key.
        segmentation_field: Column used as the secondary sort key and as
            the join key when projecting the scores of ``b``.

    Returns:
        A two-element list ``[filtered_a, projected_b]``. Both frames are
        sorted by ``(characters_field, segmentation_field)`` and carry a
        fresh 0..n-1 index.
    """
    sort_keys = [characters_field, segmentation_field]
    # sort_values returns new frames, so the caller's inputs are never
    # modified and no explicit deep copy is required.
    models = [model.sort_values(by=sort_keys) for model in (a, b)]
    # NOTE(review): presumably keeps the 2 best candidates per hashtag and
    # pads groups that have fewer -- confirm against
    # hashformers.experiments.evaluation.filter_top_k.
    models[0] = filter_top_k(models[0], 2, fill=True)
    # Forward the join column: previously a non-default
    # ``segmentation_field`` was silently ignored at this step.
    models[1] = project_scores(
        models[0], models[1], segmentation_field=segmentation_field)
    # project_scores returns rows ordered by score; restore the shared
    # ordering and index on both frames.
    return [
        model.sort_values(by=sort_keys).reset_index(drop=True)
        for model in models
    ]
def calculate_diff_scores(
        a,
        b,
        characters_field="hashtag",
        score_field="score"):
    """Add per-pair ``rank`` and ``diff`` columns to copies of both models.

    Each frame is sorted by ``(characters_field, score_field)`` and its
    scores are then consumed two rows at a time, so the row count must be
    even. It is assumed, not checked, that every ``characters_field``
    value owns exactly two rows.

    Args:
        a: First model DataFrame.
        b: Second model DataFrame.
        characters_field: Primary sort column.
        score_field: Score column; also the secondary sort column.

    Returns:
        A list with two new DataFrames (the inputs are deep-copied). Each
        has a ``rank`` column holding the argsort position inside its
        pair, and a ``diff`` column holding the first score of the pair
        minus the second, repeated on both rows, with missing values
        replaced by 0.0.
    """
    annotated = []
    for frame in copy.deepcopy([a, b]):
        ordered = frame.sort_values(by=[characters_field, score_field])
        # One row per pair of consecutive candidates.
        pairs = ordered[score_field].values.reshape(-1, 2)
        ordered['rank'] = pairs.argsort().ravel()
        gaps = np.subtract.reduce(pairs, axis=1).flatten()
        # Both members of a pair share the same difference.
        ordered['diff'] = np.repeat(gaps, 2)
        # A missing score yields a NaN difference; treat it as no difference.
        ordered['diff'] = ordered['diff'].fillna(0.0)
        annotated.append(ordered)
    return annotated
def build_ensemble_df(
        a,
        b
        ):
    """Build one frame combining rank/diff features of both models.

    Args:
        a: DataFrame of candidates for the first model.
        b: DataFrame of candidates for the second model.

    Returns:
        The processed frame of the first model. Its ``rank`` and ``diff``
        columns describe model ``a`` and its ``rank_2`` and ``diff_2``
        columns describe model ``b``; both diff columns hold absolute
        values.
    """
    filtered = filter_and_project_scores(a, b)
    primary, secondary = calculate_diff_scores(filtered[0], filtered[1])
    for frame in (primary, secondary):
        frame['diff'] = np.abs(frame['diff'].values)
    # Assigning Series (not raw arrays) makes pandas align on the index, so
    # each row of the first model receives the values of its own row in
    # the second model even though the two frames are ordered differently.
    primary['diff_2'] = secondary['diff']
    primary['rank_2'] = secondary['rank']
    return primary
Functions
def build_ensemble_df(a, b)
-
Expand source code
def build_ensemble_df(
        a,
        b
        ):
    """Build one frame combining rank/diff features of both models.

    Args:
        a: DataFrame of candidates for the first model.
        b: DataFrame of candidates for the second model.

    Returns:
        The processed frame of the first model. Its ``rank`` and ``diff``
        columns describe model ``a`` and its ``rank_2`` and ``diff_2``
        columns describe model ``b``; both diff columns hold absolute
        values.
    """
    filtered = filter_and_project_scores(a, b)
    primary, secondary = calculate_diff_scores(filtered[0], filtered[1])
    for frame in (primary, secondary):
        frame['diff'] = np.abs(frame['diff'].values)
    # Assigning Series (not raw arrays) makes pandas align on the index, so
    # each row of the first model receives the values of its own row in
    # the second model even though the two frames are ordered differently.
    primary['diff_2'] = secondary['diff']
    primary['rank_2'] = secondary['rank']
    return primary
def calculate_diff_scores(a, b, characters_field='hashtag', score_field='score')
-
Expand source code
def calculate_diff_scores(
        a,
        b,
        characters_field="hashtag",
        score_field="score"):
    """Add per-pair ``rank`` and ``diff`` columns to copies of both models.

    Each frame is sorted by ``(characters_field, score_field)`` and its
    scores are then consumed two rows at a time, so the row count must be
    even. It is assumed, not checked, that every ``characters_field``
    value owns exactly two rows.

    Args:
        a: First model DataFrame.
        b: Second model DataFrame.
        characters_field: Primary sort column.
        score_field: Score column; also the secondary sort column.

    Returns:
        A list with two new DataFrames (the inputs are deep-copied). Each
        has a ``rank`` column holding the argsort position inside its
        pair, and a ``diff`` column holding the first score of the pair
        minus the second, repeated on both rows, with missing values
        replaced by 0.0.
    """
    annotated = []
    for frame in copy.deepcopy([a, b]):
        ordered = frame.sort_values(by=[characters_field, score_field])
        # One row per pair of consecutive candidates.
        pairs = ordered[score_field].values.reshape(-1, 2)
        ordered['rank'] = pairs.argsort().ravel()
        gaps = np.subtract.reduce(pairs, axis=1).flatten()
        # Both members of a pair share the same difference.
        ordered['diff'] = np.repeat(gaps, 2)
        # A missing score yields a NaN difference; treat it as no difference.
        ordered['diff'] = ordered['diff'].fillna(0.0)
        annotated.append(ordered)
    return annotated
def filter_and_project_scores(a, b, characters_field='hashtag', segmentation_field='segmentation')
-
Expand source code
def filter_and_project_scores(
        a,
        b,
        characters_field="hashtag",
        segmentation_field="segmentation"):
    """Restrict model ``a`` to its top candidates and re-score them with ``b``.

    Args:
        a: DataFrame of candidates for the first model.
        b: DataFrame of candidates for the second model.
        characters_field: Column used as the primary sort key.
        segmentation_field: Column used as the secondary sort key and as
            the join key when projecting the scores of ``b``.

    Returns:
        A two-element list ``[filtered_a, projected_b]``. Both frames are
        sorted by ``(characters_field, segmentation_field)`` and carry a
        fresh 0..n-1 index.
    """
    sort_keys = [characters_field, segmentation_field]
    # sort_values returns new frames, so the caller's inputs are never
    # modified and no explicit deep copy is required.
    models = [model.sort_values(by=sort_keys) for model in (a, b)]
    # NOTE(review): presumably keeps the 2 best candidates per hashtag and
    # pads groups that have fewer -- confirm against
    # hashformers.experiments.evaluation.filter_top_k.
    models[0] = filter_top_k(models[0], 2, fill=True)
    # Forward the join column: previously a non-default
    # ``segmentation_field`` was silently ignored at this step.
    models[1] = project_scores(
        models[0], models[1], segmentation_field=segmentation_field)
    # project_scores returns rows ordered by score; restore the shared
    # ordering and index on both frames.
    return [
        model.sort_values(by=sort_keys).reset_index(drop=True)
        for model in models
    ]
def project_scores(a, b, segmentation_field='segmentation', score_field='score')
-
Expand source code
def project_scores(
        a,
        b,
        segmentation_field="segmentation",
        score_field="score"):
    """Re-score the rows of ``a`` with the scores ``b`` assigns to them.

    Rows are matched on ``segmentation_field`` with a left join, so every
    row of ``a`` is kept; segmentations that do not occur in ``b`` end up
    with a missing (NaN) score.

    Args:
        a: DataFrame whose rows are kept. Any ``score_field`` column it
            already carries is discarded.
        b: DataFrame providing the ``segmentation_field`` and
            ``score_field`` columns used for the lookup.
        segmentation_field: Name of the join column.
        score_field: Name of the score column.

    Returns:
        A new DataFrame with the columns of ``a`` (minus its own score)
        followed by ``score_field`` taken from ``b``, sorted by that score
        in ascending order.
    """
    # Keep one score per segmentation (first occurrence in ``b`` wins) so
    # the left join cannot multiply the rows of ``a``.
    b_view = b[[segmentation_field, score_field]]\
        .drop_duplicates(subset=[segmentation_field])
    # Discard a's own score before merging instead of dropping the
    # suffixed '<score>_x' column afterwards: that column only exists when
    # ``a`` has a score, so the old approach raised KeyError otherwise.
    a_view = a.drop(columns=[score_field], errors='ignore')
    df = pd.merge(a_view, b_view, on=segmentation_field, how='left')
    df = df.sort_values(by=score_field, ascending=True)
    return df