"""Snip statistics functions"""
from collections import defaultdict, Counter
from dataclasses import dataclass
from functools import reduce
from typing import Iterable, Tuple, Union
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from py2store.util import groupby
from slang.util import lazyprop, running_mean_gen, row_euclidean_distance
from slang.util import snips_to_str as dflt_snips_to_str
# Identifier of the group (class) an observation belongs to; used to index the per-class arrays.
GroupId = int
# A feature value: a single float, or an iterable of floats.
FV = Union[float, Iterable[float]]
# A (group id, feature value) pair.
GroupedFV = Tuple[GroupId, FV]
# Note: ClassifiedMoments and ClassifiedMomentsFitter untested and paused for now
# @dataclass
# TODO: Review-w-mp
class ClassifiedMoments:
    """Per-class count, sum and sum of squares of feature values, plus derived statistics.

    The statistics are held in three arrays of length ``n_classes_`` that are
    indexed by group id: ``count_``, ``sum_`` and ``sum2_``.
    The ``global_*`` properties pool all classes together.
    """

    # Floor for the global variance, so that dividing by the global std never divides by zero.
    _zero_var = 33e-33

    def __init__(self, n_classes_: int, initial_count: int = 1):
        """
        :param n_classes_: Number of classes (length of the per-class arrays)
        :param initial_count: Value every per-class count starts at
        """
        self.n_classes_ = n_classes_
        self.initial_count = initial_count
        self.initialize_stats()

    def initialize_stats(self):
        """(Re)set counts to ``initial_count`` and sums to zero. Returns self."""
        self.count_ = np.ones(self.n_classes_) * self.initial_count
        self.sum_ = np.zeros(self.n_classes_)
        self.sum2_ = np.zeros(self.n_classes_)
        return self

    @property
    def global_count_(self):
        """Total count over all classes."""
        # The pooled count is the SUM of the per-class counts;
        # len(self.count_) would only be the number of classes.
        return sum(self.count_)

    @property
    def global_sum_(self):
        """Sum of the feature values over all classes."""
        return sum(self.sum_)

    @property
    def global_sum2_(self):
        """Sum of the squared feature values over all classes."""
        return sum(self.sum2_)

    @property
    def global_mean_(self):
        """Mean of the feature values, all classes pooled."""
        return self.global_sum_ / self.global_count_

    @property
    def global_var_(self):
        """Pooled variance E[X**2] - E[X]**2, floored at ``_zero_var``."""
        return max(
            self._zero_var,
            (self.global_sum2_ / self.global_count_) - self.global_mean_ ** 2,
        )

    @property
    def global_std_(self):
        """Square root of ``global_var_``."""
        return np.sqrt(self.global_var_)

    @property
    def inverse_frequency_(self):
        """Array of 1 / count, one entry per class."""
        return 1 / self.count_

    @property
    def mean_(self):
        """Array of per-class means."""
        return self.sum_ / (self.count_)

    @property
    def var_(self):
        """Variance: E[X**2] - E[X]**2 (https://en.wikipedia.org/wiki/Variance)"""
        return (self.sum2_ / self.count_) - self.mean_ ** 2

    def inverse_frequency_for_groups(self, groups):
        """1 / count for each of the given group ids."""
        return 1 / self.count_[groups]

    def var_for_groups(self, groups):
        """Per-class variance for each of the given group ids (not floored)."""
        count = self.count_[groups]
        return (self.sum2_[groups] / count) - (self.sum_[groups] / count) ** 2

    def predict_within_group(self, gfvs: Iterable[GroupedFV]):
        """Array of (inverse frequency, fv / std of the fv's own group) rows, one per input pair.

        :param gfvs: Iterable of (group id, feature value) pairs
        """
        groups, fvs = list(map(np.array, zip(*gfvs)))
        return np.array(
            list(
                zip(
                    self.inverse_frequency_for_groups(groups),
                    fvs / np.sqrt(self.var_for_groups(groups)),
                )
            )
        )

    def predict_global(self, gfvs: Iterable[GroupedFV]):
        """Array of (inverse frequency, fv / global std) rows, one per input pair.

        :param gfvs: Iterable of (group id, feature value) pairs
        """
        groups, fvs = list(map(np.array, zip(*gfvs)))
        return np.array(
            list(zip(self.inverse_frequency_for_groups(groups), fvs / self.global_std_))
        )

    predict = predict_global

    def __call__(self, gfv: GroupedFV):
        """Predict for a single (group id, feature value) pair."""
        return self.predict([gfv])[0]
# Make ABC or mother class
class ClassifiedMomentsFitter(ClassifiedMoments, BaseEstimator):
    """ClassifiedMoments whose per-class statistics can be accumulated from data."""

    def fit_partial_single(self, gfv: GroupedFV):
        """Add one (group id, feature value) observation to the statistics. Returns self."""
        group, fv = gfv
        self.count_[group] += 1
        self.sum_[group] += fv
        self.sum2_[group] += fv ** 2
        return self

    # # TODO: Make this work
    # def __iadd__(self, gfv: GroupedFV):
    #     return self.fit_partial_single(gfv)

    def _group_x_by_y(self, X, y):
        """Return a {label: [x, ...]} mapping built from the aligned iterables X and y."""
        d = defaultdict(list)
        # TODO: Better than a for loop? (d.update(self.snip_and_distance(fvs)) doesn't work, but want something like))
        for label, x in zip(y, X):
            d[label].append(x)
        return d

    def fit_partial(self, gfvs: Iterable[GroupedFV], y=None):
        """Accumulate every (group id, feature value) pair of gfvs. Returns self.

        :param gfvs: Iterable of (group id, feature value) pairs
        :param y: Unused
        """
        for gfv in gfvs:
            self.fit_partial_single(gfv)
        return self

    def fit(self, gfvs: Iterable[GroupedFV], y=None):
        """Reset the statistics, then accumulate gfvs. Returns self.

        If ``n_classes_`` is not set (falsy), it is inferred from the labels:
        those of ``y`` when given, otherwise the group ids found in ``gfvs``.

        :param gfvs: Iterable of (group id, feature value) pairs
        :param y: Optional labels, only used to infer the number of classes
        :raises ValueError: if the number of classes must be inferred but there are no labels
        """
        if not self.n_classes_:
            # gfvs may be a one-shot iterator: materialize it before reading it twice
            gfvs = list(gfvs)
            labels = y if y is not None else [group for group, _ in gfvs]
            # group ids index arrays of length n_classes_, so the largest id needs max + 1 slots
            self.n_classes_ = int(np.max(labels)) + 1
        self.initialize_stats()
        return self.fit_partial(gfvs)
def _is_a_tuple_of_aligned_iterables(x):
return hasattr(x, '__len__') and len(x) == 2
def iterate_over_pairs(pairs):
    """Yield pairs from either two aligned iterables or an iterable of pairs.

    A sized input of length 2 is taken to be (firsts, seconds) and is zipped;
    anything else is assumed to already be an iterable of pairs.
    """
    looks_like_two_aligned_iterables = hasattr(pairs, '__len__') and len(pairs) == 2
    if not looks_like_two_aligned_iterables:
        yield from pairs  # assume pairs is an iterable of pairs
        return
    yield from zip(*pairs)
def mk_snips_of_tag_dict(tags, snips):
    """Group the snips by their tag.

    :param tags: Iterable of tags, aligned with snips
    :param snips: Iterable of snips
    :return: The result of py2store's ``groupby`` over the (tag, snip) pairs, keyed by tag and
        keeping only the snip of each pair (presumably a {tag: [snip, ...]} dict -- confirm
        against ``py2store.util.groupby``)
    """

    def tag_of(tag_and_snip):
        return tag_and_snip[0]

    def snip_of(tag_and_snip):
        return tag_and_snip[1]

    tag_snip_pairs = zip(tags, snips)
    return groupby(tag_snip_pairs, tag_of, snip_of)
# TODO: `fillna` only fills the counts of unobserved (snip, tag) pairs, which is not additive smoothing. Make it so?
class TagSnipStats:
    """Statistics (counts, log bayes factors) relating snips to tags, with plotting helpers.

    The expensive tables are computed lazily (``lazyprop``) from the aligned
    ``snips`` and ``tags`` given at construction.
    """

    def __init__(self, snips, tags, snips_to_str=None, fillna=0, tag_order=None):
        """
        :param snips: The snips to analyze
        :param tags: The corresponding tags (one per snip)
        :param snips_to_str: A function that outputs a string of characters corresponding to the snip sequence
        :param fillna: The pseudocount that will be used in the probability estimator.
            That is, the value to assign to non-observed (snip, tag) pairs count.
            See https://en.wikipedia.org/wiki/Additive_smoothing
        :param tag_order: Order (and possibly subset) of the tags used as columns of the count table.
            When None, it is set from the columns of the count table the first time
            ``snip_tag_counts`` is computed.
        """
        self.snips = snips
        self.tags = tags
        self.snips_to_str = snips_to_str
        self.fillna = fillna
        self.tag_order = tag_order

    @lazyprop
    def snip_tag_counts(self):
        """Snip (rows) by tag (columns) count table, rows sorted by decreasing total count."""
        df = df_of_snip_count_for_tag(
            (self.tags, self.snips), self.snips_to_str, self.fillna, self.tag_order
        )
        if self.tag_order is None:
            self.tag_order = df.columns.values
        total_count = df.sum(axis=1)
        snip_order = (total_count.sort_values(ascending=False)).index.values
        df = df.loc[snip_order]
        return df

    @lazyprop
    def snip_count_for_tag(self):
        """Nested {tag: {snip: count}} dict of the observed (tag, snip) pairs."""
        return tag_snip_count_dict_from_tags_and_snips(self.tags, self.snips)

    @lazyprop
    def snip_order(self):
        """Index values of ``snip_tag_counts`` (snips by decreasing total count)."""
        return self.snip_tag_counts.index.values

    @lazyprop
    def log_bayes_factor(self):
        """log2 of the bayes factor table computed from ``snip_tag_counts``."""
        return np.log2(bayes_factor_df_from_snip_count_df(self.snip_tag_counts))

    @lazyprop
    def snips_of_tag(self):
        """The snips grouped by tag (see ``mk_snips_of_tag_dict``)."""
        return mk_snips_of_tag_dict(self.tags, self.snips)

    def scores_for_tag(self, tag, snips=None):
        """Array of the log bayes factor of ``tag`` for each snip (default: ``self.snips``)."""
        if snips is None:
            snips = self.snips
        return np.array(list(map(self.log_bayes_factor[tag].loc.__getitem__, snips)))

    def plot_snip_count_for_tag(
        self,
        snips_to_str=None,
        figsize=(14, 10),
        tag_order=None,
        output_fig=False,
        ylabel_rotation=90,
    ):
        """Bar plots of the snip counts of each tag; see the module-level ``plot_snip_count_for_tag``."""
        return plot_snip_count_for_tag(
            self.snip_count_for_tag,
            snips_to_str=snips_to_str,
            figsize=figsize,
            tag_order=tag_order,
            output_fig=output_fig,
            ylabel_rotation=ylabel_rotation,
        )

    # TODO: Make a decorator that takes care of the `x = x or getattr(self, x, None)` pattern
    def plot_bars_of_tag_snip_stats(
        self,
        figsize=(24, 18),
        output_fig=False,
        ylabel_fontsize=None,
        ylabel_rotation=90,
        tag_order=None,
        snip_order=None,
        snips_to_str=None,
    ):
        """Bar plots of the log bayes factors of each tag; see ``bar_plot_of_tag_snip_stats``.

        ``tag_order``, ``snip_order`` and ``snips_to_str`` default to the instance's own values.

        :return: The figure if ``output_fig`` is True, else None
        """
        # `x or default` raises on numpy arrays (ambiguous truth value), so test explicitly
        # while keeping the "empty means default" behavior.
        if tag_order is None or len(tag_order) == 0:
            tag_order = self.tag_order
        if snip_order is None or len(snip_order) == 0:
            snip_order = self.snip_order
        snips_to_str = snips_to_str or self.snips_to_str
        # Return the result: without it the figure is lost and output_fig has no effect.
        return bar_plot_of_tag_snip_stats(
            self.log_bayes_factor,
            snips_to_str=snips_to_str,
            figsize=figsize,
            snip_order=snip_order,
            tag_order=tag_order,
            output_fig=output_fig,
            ylabel_fontsize=ylabel_fontsize,
            ylabel_rotation=ylabel_rotation,
        )

    def plot_tag_scores(
        self,
        tags=None,
        snips=None,
        chk_size=1,
        chk_step=1,
        figsize=(24, 18),
        ylabel_fontsize=20,
        ylabel_rotation=0,
        predict_thresh=0,
        normal_style='-k',
        over_predict_style='ob',
    ):
        """Plot, one subplot per tag, the scores of the tag over a snip sequence.

        :param tags: Tag or tags to plot (default: all the columns of ``log_bayes_factor``)
        :param snips: Snips to score (default: the snips of all tags, concatenated tag by tag)
        :param chk_size: Passed to ``tag_scores_gen``
        :param chk_step: Passed to ``tag_scores_gen``
        :param figsize: Size of the figure
        :param ylabel_fontsize: Font size of the tag labels
        :param ylabel_rotation: Rotation of the tag labels
        :param predict_thresh: Scores at or above this value are highlighted
        :param normal_style: Line style of the scores
        :param over_predict_style: Marker style of the highlighted scores
        """
        if tags is None:
            # self.tag_order may still be None before the count table is computed,
            # so take the tags from the (lazily computed) table itself.
            tags = list(self.log_bayes_factor.columns)
        elif isinstance(tags, (str, int, float)):
            # tag_scores_gen accepts a single tag; len() of it would give a wrong subplot count
            tags = [tags]
        if snips is None:
            snips = np.array(reduce(lambda x, y: x + y, self.snips_of_tag.values(), []))
        gen = tag_scores_gen(self.scores_for_tag, tags, snips, chk_size, chk_step)
        n_tags = len(tags)
        plt.figure(figsize=figsize)
        for i, (tag, scores) in enumerate(gen, 1):
            scores = np.array(scores)
            x_vals = np.arange(len(scores))
            plt.subplot(n_tags, 1, i)
            plt.plot(scores, normal_style)
            # horizontal line at the threshold
            plt.plot(
                [0, len(scores)],
                [predict_thresh, predict_thresh],
                ':',
                color='k',
                alpha=0.8,
            )
            over_predict_thresh = scores >= predict_thresh
            plt.plot(
                x_vals[over_predict_thresh],
                scores[over_predict_thresh],
                over_predict_style,
            )
            plt.axis('tight')
            plt.ylabel(tag, fontsize=ylabel_fontsize, rotation=ylabel_rotation)

    def plot_smoothed_log_bayes_factors(
        self, chk_size=1, chk_step=None, tag_order=None
    ):
        """Plot the smoothed log bayes factors; see the module-level function of the same name."""
        return plot_smoothed_log_bayes_factors(
            self.snips_of_tag,
            self.log_bayes_factor,
            chk_size=chk_size,
            chk_step=chk_step,
            tags=tag_order,
        )

    def tags_and_snips_str_gen(self, tags=None, snips=None, snips_to_str=None):
        """Yield (tag, string representation of the snips of that tag) pairs.

        :param tags: Tags to use (default: ``self.tags``)
        :param snips: Snips to use (default: ``self.snips``)
        :param snips_to_str: Snips-to-string function (default: the instance's, else the package default)
        """
        snips_to_str = snips_to_str or self.snips_to_str or dflt_snips_to_str
        if tags is None and snips is None:
            snips_of_tag = self.snips_of_tag
        else:
            # only one of the two may have been given: complete with the instance's own
            tags = self.tags if tags is None else tags
            snips = self.snips if snips is None else snips
            snips_of_tag = mk_snips_of_tag_dict(tags, snips)
        for tag, snips in snips_of_tag.items():
            yield tag, snips_to_str(snips)

    def mk_tags_and_snips_str_string(
        self,
        tags=None,
        snips=None,
        snips_to_str=None,
        tag_snips_format_str='{:<21}: {}\n\n',
    ):
        """Return one string made of a formatted entry per (tag, snips string) pair.

        :param tag_snips_format_str: Format string taking the tag and the snips string, in that order
        """
        return ''.join(
            tag_snips_format_str.format(tag, snips_str)
            for tag, snips_str in self.tags_and_snips_str_gen(tags, snips, snips_to_str)
        )

    def print_tags_and_snips_str(
        self,
        tags=None,
        snips=None,
        snips_to_str=None,
        tag_snips_format_str='{:<21}: {}\n\n',
    ):
        """Print the string made by ``mk_tags_and_snips_str_string``."""
        print(
            self.mk_tags_and_snips_str_string(
                tags, snips, snips_to_str, tag_snips_format_str
            )
        )
# class SnipStats:
# """Compute frequency and centroid distance statistics on snips
# """
#
# def __init__(self, pseudocount=0, tag_order=None, alphabet_size=None):
# self.pseudocount = pseudocount
# self.tag_order = tag_order
# self._alphabet_size = alphabet_size # TODO: Use to tell TagSnipStats it should fill until there
#
# @lazyprop
# def alphabet_size(self):
# return self._alphabet_size or self.log_bayes_factor_.index.max()
#
# def fit(self, snips, tags=None):
# self.tag_snip_stats = TagSnipStats(snips, tags, fillna=self.pseudocount, tag_order=self.tag_order)
# self.log_bayes_factor_ = self.tag_snip_stats.log_bayes_factor.sort_index()
# self.classes_ = self.tag_snip_stats.tag_order
# return self
class BayesFactors:
    """BayesFactors classifier with sklearn-like interface. predict_probas are the log2 of the bayes factor.

    It is sklearn-like, but to avoid depending on sklearn, it isn't a subclass of BaseEstimator and ClassifierMixin.
    It just has the essentials of the classifier: a `fit`, a `predict_proba` and a derived `predict` method, and
    a `classes_` attribute that indexes the columns of the `predict_proba` matrix.
    """

    def __init__(self, pseudocount=0, tag_order=None, alphabet_size=None):
        """
        :param pseudocount: Passed as ``fillna`` to ``TagSnipStats`` when fitting
        :param tag_order: Passed as ``tag_order`` to ``TagSnipStats`` when fitting
        :param alphabet_size: Explicit alphabet size (see the ``alphabet_size`` property)
        """
        self.pseudocount = pseudocount
        self.tag_order = tag_order
        self._alphabet_size = (
            alphabet_size  # TODO: Use to tell TagSnipStats it should fill until there
        )

    @lazyprop
    def alphabet_size(self):
        """The alphabet size given at construction, else the largest snip of the fitted table."""
        return self._alphabet_size or self.log_bayes_factor_.index.max()

    def fit(self, snips, tags):
        """Compute the log bayes factor table from aligned snips and tags. Returns self."""
        self.tag_snip_stats = TagSnipStats(
            snips, tags, fillna=self.pseudocount, tag_order=self.tag_order
        )
        self.log_bayes_factor_ = self.tag_snip_stats.log_bayes_factor.sort_index()
        # read after log_bayes_factor: computing it is what fills in a missing tag_order
        self.classes_ = self.tag_snip_stats.tag_order
        return self

    @classmethod
    def from_params(cls, log_bayes_factor_, classes_=None):
        """Make an instance directly from a precomputed log bayes factor table.

        :param log_bayes_factor_: DataFrame of log bayes factors, snips as index and tags as columns
        :param classes_: Tags to use, in order (default: all the columns of the table)
        :return: The new instance
        """
        instance = cls()
        if classes_ is None:
            classes_ = log_bayes_factor_.columns.values
        instance.log_bayes_factor_ = log_bayes_factor_[
            classes_
        ]  # To assert columns contents and ensure order
        instance.classes_ = classes_
        # an alternate constructor must return what it built
        return instance

    def scores_for_tag(self, tag, snips):
        """Array of the log bayes factor of ``tag`` for each snip."""
        return np.array(list(map(self.log_bayes_factor_[tag].loc.__getitem__, snips)))

    def predict_proba(self, snips):
        """(n_snips, n_classes) array of log bayes factors, columns ordered as ``classes_``."""
        scores_for_tag = self.scores_for_tag
        return np.vstack(list(scores_for_tag(tag, snips) for tag in self.classes_)).T

    def predict(self, snips):
        """
        Predict class labels for each snip of snips sequence.
        """
        indices = self.predict_proba(snips).argmax(axis=1)
        # classes_ may be a plain list (e.g. a user-given tag_order), which can't be indexed by an array
        return np.asarray(self.classes_)[indices]

    def _assert_sanity(self):
        """Assert that the snips of the table are consecutive integers."""
        assert set(np.diff(sorted(self.log_bayes_factor_.index))) == {
            1
        }, 'some snips are missing!'

    _diagnosis = _assert_sanity  # but deprecating _diagnosis

    # TODO: self.snip_to_score([0]) blows up, but self.snip_to_score([0, 1, 2]) gives me a score (should blow up)
    def __call__(self, snip):
        """Row of ``predict_proba`` for a single snip."""
        return self.predict_proba([snip])[0]
def mk_model_caller(kind='asis'):
    """Make a ``model_caller(self, snip)`` function that applies a model to a single snip.

    :param kind: What the returned function computes for the snip:
        'predict_proba' (the row of ``self.predict_proba``),
        'tag_probs' (a {tag: value} dict pairing ``self.classes_`` with that row), or
        'predict' (the item of ``self.predict``).
    :return: The function, meant to be used as a method (``self`` is its first argument)
    :raises ValueError: for any other kind
        (NOTE(review): this includes the default 'asis' -- confirm whether that is intended)
    """
    if kind == 'predict_proba':

        def model_caller(self, snip):
            probas = self.predict_proba([snip])
            return probas[0]

        return model_caller

    if kind == 'tag_probs':

        def model_caller(self, snip):
            probas_of_snip = self.predict_proba([snip])[0]
            return dict(zip(self.classes_, probas_of_snip))

        return model_caller

    if kind == 'predict':

        def model_caller(self, snip):
            predictions = self.predict([snip])
            return predictions[0]

        return model_caller

    raise ValueError(f'Unknown kind: {kind}')
class PredictProbaBF(BayesFactors):
    """BayesFactors that, when called on a snip, returns the snip's row of ``predict_proba``."""

    def __call__(self, snip):
        probas = self.predict_proba([snip])
        return probas[0]
class TagProbsBF(BayesFactors):
    """BayesFactors that, when called on a snip, returns a {tag: predict_proba value} dict."""

    def __call__(self, snip):
        probas_of_snip = self.predict_proba([snip])[0]
        return dict(zip(self.classes_, probas_of_snip))
class PredictBF(BayesFactors):
    """BayesFactors that, when called on a snip, returns the predicted tag of the snip."""

    def __call__(self, snip):
        predictions = self.predict([snip])
        return predictions[0]
def tag_slice_iter_from_slices_of_tag_dict(slices_of_tag):
    """
    Flatten a {tag: slices} dict into an iterator of (tag, slice) pairs.

    :param slices_of_tag: a {tag: [(bt, tt),...], ...} dict listing slices annotated by tags
    :return: an iterator of (tag, (bt, tt)) pairs, in the order of the dict and of each list
    """
    for tag in slices_of_tag:
        yield from ((tag, tag_slice) for tag_slice in slices_of_tag[tag])
# def snip_count_for_tag_from_wf_and_tag_slice_iter(wf, tag_slice_iter, snips_of_wf, sr=None):
# """
#
# :param wf: waveform of oto.sound.audio.Sound object
# :param tag_slice_iter:
# :param snips_of_wf:
# :param sr:
# :return:
# """
#
# # from slang.utils.audio_core import Sound
# snip_count_for_tag = defaultdict(Counter)
#
# for tag, sl in tag_slice_iter:
# chk = sound[slice(*sl)].wf # wf chunk for sl slice
# chk_snips = snips_of_wf(chk)
# snip_count_for_tag[tag].update(chk_snips)
#
# return dict(snip_count_for_tag)
def tag_snip_count_dict_from_tags_and_snips(tags, snips):
    """Count the (tag, snip) co-occurrences of two aligned iterables.

    :param tags: Iterable of tags, aligned with snips
    :param snips: Iterable of snips
    :return: A {tag: {snip: count, ...}, ...} nested dict of the observed pairs
    """
    pair_counts = Counter(zip(tags, snips))
    snip_count_for_tag = {}
    for tag_and_snip, n_occurrences in pair_counts.items():
        tag, snip = tag_and_snip
        snip_count_for_tag.setdefault(tag, {})[snip] = n_occurrences
    return snip_count_for_tag
def df_of_snip_count_for_tag(
    snip_count_for_tag, snips_to_str=None, fillna=0, tag_order=None
):
    """
    A df representation of snip_count_for_tag

    :param snip_count_for_tag: {tag: {snip: count, ...},...} dict, or a (tags, snips) tuple of
        aligned iterables from which that dict is computed
    :param snips_to_str: A function that transforms snip lists into strings (mapping each snip to a character).
        When given, the snips of the index are replaced by their characters.
    :param fillna: What to fill missing values with
    :param tag_order: Serves both to specify an order of the tags, and to specify a subset of tags if we don't want all
    :return: A dataframe of snip (in rows) counts for each tag (in columns)
    """
    if isinstance(snip_count_for_tag, tuple) and len(snip_count_for_tag) == 2:
        snip_count_for_tag = tag_snip_count_dict_from_tags_and_snips(
            *snip_count_for_tag
        )
    df = pd.DataFrame(snip_count_for_tag).fillna(fillna)
    if tag_order is not None:
        df = df[tag_order]
    df.index.names = ['snip']
    if snips_to_str is not None:
        # Convert the snips themselves. Reading df.index after a reset_index() would
        # convert the row positions 0..n-1 instead of the snips.
        df.index = pd.Index(list(snips_to_str(df.index.values)), name='snip')
    return df
def snip_order_from_snip_count_df(snip_count_df):
    """Return the snips (index values) sorted by decreasing total count over all columns.

    :param snip_count_df: DataFrame of counts, snips as index
    :return: Array of index values
    """
    count_of_snip = snip_count_df.sum(axis=1)
    by_decreasing_count = count_of_snip.sort_values(ascending=False)
    return by_decreasing_count.index.values
def bayes_factor_df_from_snip_count_df(snip_count_df):
    """Bayes factor of each (snip, tag): P(snip | tag) / P(snip | not tag).

    Both probabilities are estimated from the counts after adding one to every cell.

    :param snip_count_df: DataFrame of counts, snips as index and tags as columns
    :return: DataFrame of the same shape holding the bayes factors
    """
    counts = snip_count_df + 1  # add-one smoothing
    count_of_snip = counts.sum(axis=1)

    # P(snip | tag): each column normalized to sum to 1
    prob_given_tag = counts / counts.sum(axis=0)

    # count of the snip in all the OTHER tags: (total of the snip) - (count in the tag)
    count_in_other_tags = -counts.sub(count_of_snip, axis=0)
    prob_given_not_tag = count_in_other_tags / count_in_other_tags.sum(axis=0)

    return prob_given_tag / prob_given_not_tag
def log_bayes_factor_bayes_factor_df_from_snip_count_df(snip_count_df):
    """Return the log2 of ``bayes_factor_df_from_snip_count_df(snip_count_df)``."""
    bayes_factor_df = bayes_factor_df_from_snip_count_df(snip_count_df)
    return np.log2(bayes_factor_df)
def bar_plot_of_tag_snip_stats(
    snip_stats_for_tag,
    snips_to_str=None,
    figsize=(14, 10),
    snip_order=None,
    tag_order=None,
    output_fig=False,
    ylabel_fontsize=None,
    ylabel_rotation=90,
):
    """
    Multiplot of the bars of a snip statistic, one row per tag.
    Positive values are drawn in blue, the others in light gray.

    :param snip_stats_for_tag: DataFrame (or anything a DataFrame can be made from) of the statistic,
        snips as index and tags as columns
    :param snips_to_str: A function that transforms snip lists into strings (mapping each snip to a character)
    :param figsize: Size of the new figure. If None, the current figure is used.
    :param snip_order: If given, the snips (rows) to show, in order
    :param tag_order: Serves both to specify an order of the tags, and to specify a subset of tags if we don't want all
    :param output_fig: Whether to return the figure
    :param ylabel_fontsize: If given, the font size of the tag labels
    :param ylabel_rotation: Will be applied to the ylabel
    :return: The figure if output_fig is True, else None
    """
    fig = plt.gcf() if figsize is None else plt.figure(figsize=figsize)

    if snips_to_str is None:

        def snips_to_str(snips):
            return list(map(str, snips))

    stats = snip_stats_for_tag
    if not isinstance(stats, pd.DataFrame):
        stats = pd.DataFrame(stats)
    if snip_order is not None:
        stats = stats.loc[snip_order, :]
    if tag_order is not None:
        stats = stats[tag_order]

    # NOTE(review): one more row than there are tags is allocated, leaving the last row empty --
    # presumably carried over from plot_snip_count_for_tag (which has an extra 'ALL' row); confirm.
    n_rows = len(stats.columns) + 1
    n_snips = len(stats)
    tick_positions = list(range(n_snips))

    axes = []
    for row, tag in enumerate(stats.columns, 1):
        values = stats[tag]
        ax = plt.subplot(n_rows, 1, row)
        axes.append(ax)

        bar_colors = (values > 0).map({True: 'b', False: '#D3D3D3'})
        values.plot(kind='bar', color=bar_colors)

        label = plt.ylabel(tag)
        label.set_rotation(ylabel_rotation)
        if ylabel_fontsize is not None:
            label.set_fontsize(ylabel_fontsize)
        plt.xlabel('')

        if row != 1:
            # blank tick labels on all rows but the first
            plt.xticks(tick_positions, ' ' * n_snips)
        else:
            # snip labels on top of the first row
            plt.xticks(tick_positions, snips_to_str(stats.index.values))
            ax.xaxis.tick_top()

    # snip labels at the bottom of the last drawn row
    plt.xticks(tick_positions, snips_to_str(stats.index.values))

    if n_snips < 50:
        for ax in axes:
            ax.grid(True)

    if output_fig:
        return fig
def plot_snip_count_for_tag(
    snip_count_for_tag,
    snips_to_str=None,
    figsize=(14, 10),
    tag_order=None,
    output_fig=False,
    ylabel_fontsize=None,
    ylabel_rotation=90,
):
    """
    Multiplot of snip count bars for each tag (in a different row), snips sorted by decreasing total count.
    The last row ('ALL') is the total count for each snip.

    :param snip_count_for_tag: {tag: {snip: count, ...},...} nested dict
    :param snips_to_str: A function that transforms snip lists into strings (mapping each snip to a character)
    :param figsize: Size of the new figure. If None, the current figure is used.
    :param tag_order: Serves both to specify an order of the tags, and to specify a subset of tags if we don't want all
    :param output_fig: Whether to return the figure
    :param ylabel_fontsize: If given, the font size of the tag labels
    :param ylabel_rotation: Will be applied to the ylabel
    :return: The figure if output_fig is True, else None
    """
    fig = plt.gcf() if figsize is None else plt.figure(figsize=figsize)

    if snips_to_str is None:

        def snips_to_str(snips):
            return list(map(str, snips))

    counts = df_of_snip_count_for_tag(snip_count_for_tag, fillna=0, tag_order=tag_order)
    # one row per tag, plus one for the 'ALL' column added just below
    n_rows = len(counts.columns) + 1
    counts['ALL'] = counts.sum(axis=1)
    counts = counts.sort_values('ALL', ascending=False)
    n_snips = len(counts)
    tick_positions = list(range(n_snips))

    axes = []
    for row, tag in enumerate(counts.columns, 1):
        ax = plt.subplot(n_rows, 1, row)
        axes.append(ax)
        counts[tag].plot(kind='bar')

        label = plt.ylabel(tag)
        label.set_rotation(ylabel_rotation)
        if ylabel_fontsize is not None:
            label.set_fontsize(ylabel_fontsize)
        plt.xlabel('')

        if row != 1:
            # blank tick labels on all rows but the first
            plt.xticks(tick_positions, ' ' * n_snips)
        else:
            # snip labels on top of the first row
            plt.xticks(tick_positions, snips_to_str(counts.index.values))
            ax.xaxis.tick_top()

    # snip labels at the bottom of the last row
    plt.xticks(tick_positions, snips_to_str(counts.index.values))

    for ax in axes:
        ax.grid(True, axis='x')

    if output_fig:
        return fig
def plot_tag_scores_for_snips(
    snips_of_tag,
    snip_tag_score_df,
    tag_order=None,
    smoothing_window_size=1,
    figsize=(24, 18),
    ylabel_fontsize=15,
    ylabel_rotation=0,
):
    """
    Plot, one subplot per tag, the smoothed score of the tag over the snips of all tags concatenated.
    In each subplot, the stretch at the position of the tag's own snips is redrawn in black.

    :param snips_of_tag: {tag: [snip, ...], ...} dict (the values are concatenated with ``+``)
    :param snip_tag_score_df: DataFrame of scores, snips as index and tags as columns
    :param tag_order: Tags to plot, in order (default: the keys of snips_of_tag)
    :param smoothing_window_size: Passed to ``running_mean_gen`` (presumably the window size of
        the running mean -- confirm against ``slang.util``)
    :param figsize: Size of the figure
    :param ylabel_fontsize: Font size of the tag labels
    :param ylabel_rotation: Rotation of the tag labels
    """
    assert isinstance(
        snip_tag_score_df, pd.DataFrame
    ), 'isinstance(snip_tag_score_df, pd.DataFrame)'

    if tag_order is None:
        tag_order = list(snips_of_tag.keys())
    n_tags = len(tag_order)

    all_snips = []
    for snips in snips_of_tag.values():
        all_snips = all_snips + snips

    plt.figure(figsize=figsize)

    cursor = 0  # position, in the smoothed scores, where the stretch of the current tag starts
    for row, tag in enumerate(tag_order, 1):
        plt.subplot(n_tags, 1, row)

        score_of_snip = snip_tag_score_df[tag].loc
        raw_scores = [score_of_snip[snip] for snip in all_snips]
        snip_scores = list(running_mean_gen(raw_scores, smoothing_window_size))

        plt.plot(snip_scores, '-')
        plt.plot([0, len(snip_scores)], [0, 0], ':k')  # zero line

        n_tag_snips = len(snips_of_tag[tag])
        own_scores = snip_scores[cursor : cursor + n_tag_snips]
        own_positions = list(range(cursor, cursor + len(own_scores)))
        plt.plot(own_positions, own_scores, 'k-')
        cursor += n_tag_snips

        plt.axis('tight')
        plt.ylabel(tag, fontsize=ylabel_fontsize, rotation=ylabel_rotation)
def tags_and_snips_to_snip_of_tag(tags, snips):
    """Group the snips by their tag.

    :param tags: Iterable of tags, aligned with snips
    :param snips: Iterable of snips
    :return: A {tag: [snip, ...], ...} dict, the snips of each tag in order of appearance
    """
    grouped = {}
    for tag, snip in zip(tags, snips):
        grouped.setdefault(tag, []).append(snip)
    return grouped
# def _tag_order_from_df(df_with_tags_as_columns_and_snips_as_indices):
# snip_order = df_with_tags_as_columns_and_snips_as_indices.index.values
# tag_order = df_with_tags_as_columns_and_snips_as_indices.columns.values
# return snip_order, tag_order
# def scores_of_snips(tag, snips, snip_log_bayes_factor_of_tag):
# lbf_for_tag = snip_log_bayes_factor_of_tag[tag]
# if isinstance(snip_log_bayes_factor_of_tag, pd.DataFrame):
# log_bayes_factor_for_snip = lbf_for_tag.loc.__getitem__
# elif hasattr(lbf_for_tag, '__getitem__'):
# log_bayes_factor_for_snip = lbf_for_tag.__getitem__
# else:
# assert callable(lbf_for_tag), "At this point lbf_for_tag can only be callable"
# log_bayes_factor_for_snip = lbf_for_tag
#
# return list(map(log_bayes_factor_for_snip, snips))
def tag_scores_gen(scores_for_tag, tags, snips, chk_size=1, chk_step=1):
    """Yield, for each tag, a (tag, list of smoothed scores) pair.

    :param scores_for_tag: Function called as ``scores_for_tag(tag, snips)`` to get the scores of the snips
    :param tags: Iterable of tags, or a single tag (str, int or float)
    :param snips: The snips to score
    :param chk_size: Passed to ``running_mean_gen`` (presumably the window size -- confirm against ``slang.util``)
    :param chk_step: Passed to ``running_mean_gen`` (presumably the window step -- confirm against ``slang.util``)
    """
    single_tag_types = (str, int, float)
    tag_list = [tags] if isinstance(tags, single_tag_types) else tags
    for tag in tag_list:
        smoothed_scores = running_mean_gen(scores_for_tag(tag, snips), chk_size, chk_step)
        yield tag, list(smoothed_scores)
def plot_tag_scores(
    scores_for_tag,
    tags,
    snips,
    chk_size=1,
    chk_step=1,
    figsize=(24, 18),
    ylabel_fontsize=20,
    ylabel_rotation=0,
    predict_thresh=0,
    normal_style='-k',
    over_predict_style='ob',
):
    """
    Plot, one subplot per tag, the scores of the tag over a snip sequence.
    A dotted horizontal line marks the threshold, and the scores at or above it are highlighted.

    :param scores_for_tag: Function called as ``scores_for_tag(tag, snips)`` to get the scores of the snips
    :param tags: Tags to plot, or a single tag (str, int or float)
    :param snips: The snips to score
    :param chk_size: Passed to ``tag_scores_gen``
    :param chk_step: Passed to ``tag_scores_gen``
    :param figsize: Size of the figure
    :param ylabel_fontsize: Font size of the tag labels
    :param ylabel_rotation: Rotation of the tag labels
    :param predict_thresh: Scores at or above this value are highlighted
    :param normal_style: Line style of the scores
    :param over_predict_style: Marker style of the highlighted scores
    """
    if isinstance(tags, (str, int, float)):
        # tag_scores_gen accepts a single tag; len() of it would give a wrong subplot count
        tags = [tags]
    gen = tag_scores_gen(scores_for_tag, tags, snips, chk_size, chk_step)
    n_tags = len(tags)
    plt.figure(figsize=figsize)
    for i, (tag, scores) in enumerate(gen, 1):
        scores = np.array(scores)
        x_vals = np.arange(len(scores))
        plt.subplot(n_tags, 1, i)
        plt.plot(scores, normal_style)
        # horizontal line at the threshold
        plt.plot(
            [0, len(scores)],
            [predict_thresh, predict_thresh],
            ':',
            color='k',
            alpha=0.8,
        )
        over_predict_thresh = scores >= predict_thresh
        plt.plot(
            x_vals[over_predict_thresh], scores[over_predict_thresh], over_predict_style
        )
        plt.axis('tight')
        plt.ylabel(tag, fontsize=ylabel_fontsize, rotation=ylabel_rotation)
def scores_of_snips(tag, snips, snip_log_bayes_factor_of_tag):
    """Look up the score of ``tag`` for each snip.

    :param tag: Column of the table to read
    :param snips: Iterable of snips (labels of the table's index)
    :param snip_log_bayes_factor_of_tag: Table of scores, snips as index and tags as columns
    :return: List of scores, one per snip
    """
    score_of_snip = snip_log_bayes_factor_of_tag[tag].loc
    return [score_of_snip[snip] for snip in snips]
def plot_smoothed_log_bayes_factors(
    snips_of_tag,
    snip_log_bayes_factor_of_tag,
    chk_size=1,
    chk_step=1,
    tags=None,
    ylabel_fontsize=15,
    ylabel_rotation=0,
    figsize=(24, 18),
):
    """
    Plot, one subplot per tag, the smoothed log bayes factor of the tag over the snips of all tags concatenated.
    In each subplot, the stretch at the position of the tag's own snips is redrawn in black.

    :param snips_of_tag: {tag: [snip, ...], ...} dict (the values are concatenated with ``+``)
    :param snip_log_bayes_factor_of_tag: DataFrame of log bayes factors, snips as index and tags as columns
    :param chk_size: Passed to ``running_mean_gen`` (presumably the window size -- confirm against ``slang.util``)
    :param chk_step: Passed to ``running_mean_gen`` (presumably the window step -- confirm against ``slang.util``)
    :param tags: Tags to plot, in order (default, or if empty: all the columns of the table)
    :param ylabel_fontsize: Font size of the tag labels
    :param ylabel_rotation: Rotation of the tag labels
    :param figsize: Size of the figure
    """
    # `if tags:` raises on numpy arrays (ambiguous truth value), so test explicitly
    if tags is not None and len(tags) > 0:
        snip_log_bayes_factor_of_tag = snip_log_bayes_factor_of_tag[tags]
    else:
        tags = list(snip_log_bayes_factor_of_tag.columns)
    n_tags = len(tags)
    all_snips = reduce(lambda x, y: x + y, snips_of_tag.values(), [])
    plt.figure(figsize=figsize)
    tag_snips_cursor = 0  # position, in the smoothed scores, where the stretch of the current tag starts
    for i, tag in enumerate(tags, 1):
        plt.subplot(n_tags, 1, i)
        snip_scores = list(
            running_mean_gen(
                scores_of_snips(tag, all_snips, snip_log_bayes_factor_of_tag),
                chk_size,
                chk_step,
            )
        )
        plt.plot(snip_scores, '-')
        plt.plot([0, len(snip_scores)], [0, 0], ':k')  # zero line
        n_tag_snips = len(snips_of_tag[tag])
        these_snip_scores = snip_scores[
            tag_snips_cursor : (tag_snips_cursor + n_tag_snips)
        ]
        tag_snips_idx = list(
            range(tag_snips_cursor, tag_snips_cursor + len(these_snip_scores))
        )
        plt.plot(tag_snips_idx, these_snip_scores, 'k-')
        tag_snips_cursor += n_tag_snips
        plt.axis('tight')
        plt.ylabel(tag, fontsize=ylabel_fontsize, rotation=ylabel_rotation)
def snip_scores_from_lookup(snips, snip_to_score):
    """Map each snip to its score(s), lazily.

    :param snips: Iterable of snips
    :param snip_to_score: How to get the score of a snip:
        a dict or Series (indexed with the snip),
        a DataFrame whose index values are snips and whose columns are the different kinds of score
        (each snip then maps to a {column: score} dict),
        or any callable taking a snip.
    :return: A ``map`` iterator of the scores, one item per snip
    """
    if isinstance(snip_to_score, (pd.Series, dict)):
        snip_to_score = snip_to_score.__getitem__
    elif isinstance(snip_to_score, pd.DataFrame):
        lookup_of_column = {
            column: snip_to_score[column].loc.__getitem__
            for column in list(snip_to_score.columns)
        }

        def score_all_columns(snip):
            # .items() is needed to get the (column, lookup) pairs; iterating the dict yields keys only
            return {column: lookup(snip) for column, lookup in lookup_of_column.items()}

        snip_to_score = score_all_columns
    return map(snip_to_score, snips)
# assert isinstance(snip_to_score, pd.DataFrame), \
# "snip_to_score needs to be DataFrame whose index values are snips and columns are the different score kinds " \
# "you want to compute"