"""Snipping: Feature vector quantization"""
from typing import Callable
import itertools
from collections import Counter, defaultdict
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans
from slang.chunkers import fixed_step_chunker
from slang.snip_stats import BayesFactors
from slang.core import Snipper
from slang.util import row_euclidean_distance, mk_callable
class DfltWfToChk:
def __init__(self, chk_size=2048, chk_step=None):
if chk_step is None:
chk_step = chk_size
self.chk_size = chk_size
self.chk_step = chk_step
def fit(self, *args, **kwargs):
return self # no fitting required
def __call__(self, wf):
yield from fixed_step_chunker(
wf, chk_size=self.chk_size, chk_step=self.chk_step
)
[docs]@mk_callable('single_transform')
class PcaChkToFv(PCA):
def __init__(self, n_components=5, **kwargs):
super().__init__(n_components=n_components, **kwargs)
# def __call__(self, fv):
# return self.transform([fv])[0]
[docs]@mk_callable('single_transform')
class LdaChkToFv(LinearDiscriminantAnalysis):
def __init__(self, n_components=5, **kwargs):
super().__init__(n_components=n_components, **kwargs)
DfltChkToFv = PcaChkToFv
class FvToSnip: # TODO: Mixin? ABC?
# fit: optional
# __call__: required
# fvs_to_snips: required?
# fvs_to_snip_distance_pairs: required?
@property
def fv_of_snip(self):
"""array providing representative fv for each snip"""
raise NotImplemented(
'Not implemented -- should be implemented in subclass of FvToSnipMixin'
)
# return None
from slang.stypes import FV, FVs
# TODO: choose n_clusters from len(X) at fit time
# TODO: post_fit_proc shouldn't be here, but external and optional
[docs]@mk_callable('single_predict')
class KMeansFvToSnip(KMeans, FvToSnip):
stats_of_snip = None # will be filled when instance is fit
# cluster_centers_ = None # will be filled when instance is fit
# n_clusters = None
#
# fv_of_snip = cluster_centers_ # Snipper's api expectation
# alphabet_size = n_clusters # Snipper's api expectation
@property
def fv_of_snip(self):
return self.cluster_centers_
@property
def alphabet_size(self):
return self.n_clusters
def __init__(self, n_clusters=47, **kwargs):
super().__init__(n_clusters=n_clusters, **kwargs)
# def __call__(self, fv: FV):
# return self.predict([fv])[0]
[docs] def fit(self, fvs: FVs, y=None, sample_weight=None):
super().fit(fvs, y, sample_weight)
# self.post_fit_proc(fvs, y, sample_weight=sample_weight)
return self
# fvs_to_snips = KMeans.predict # alias for predict
# ... replaced (because need to handle iterator of fvs) by:
def fvs_to_snips(self, fvs, sample_weight=None):
# return self.predict(list(fvs), sample_weight=sample_weight)
if isinstance(fvs, (list, tuple, np.ndarray)):
return self.predict(fvs, sample_weight=sample_weight)
else:
return map(lambda x: self.predict([x]), fvs)
[docs] def fvs_to_snip_distance_pairs(self, fvs: FVs):
"""iterator of (snip, distance_to_snip_centroid) pairs"""
fvs = np.array(list(fvs))
snips = self.fvs_to_snips(fvs)
return zip(snips, row_euclidean_distance(fvs, self.fv_of_snip[snips]))
# def post_fit_proc(self, fvs, y=None, **kwargs):
# d = defaultdict(list)
# # TODO: Better than a for loop? (d.update(self.snip_and_distance(fvs)) doesn't work, but want something like))
# for snip, distance in self.fvs_to_snip_distance_pairs(fvs):
# d[snip].append(distance)
#
# # TODO: A numpy array might be more useful here
# # self.stats_of_snip = np.empty(self.alphabet_size) * np.nan
#
# self.stats_of_snip = {snip:
# dict(count=len(distances),
# dist_sum=sum(distances),
# dist_sum2=sum(np.array(distances) ** 2))
# for snip, distances in d.items()}
#
# return self
[docs]class KMeansFvToSnipDist(KMeansFvToSnip):
def __call__(self, fv: FV):
return list(self.fvs_to_snip_distance_pairs([fv]))[0]
DfltFvToSnip = KMeansFvToSnip
from collections.abc import Iterable
[docs]def is_iterable(x):
"""Similar in nature to :func:`callable`, ``is_iterable`` returns
``True`` if an object is `iterable`_, ``False`` if not.
>>> is_iterable([])
True
>>> is_iterable(1)
False"""
return isinstance(x, Iterable)
def _assure_pair(iterables):
""" """
if hasattr(iterables, '__len__'):
if len(iterables) == 2:
return iterables
elif len(iterables) == 1 and is_iterable(iterables[0]):
return list(zip(*iterables[0]))
# TODO: else?... raise? If None on purpose, say it and why!
else:
return list(zip(*iterables))
_get_pairs = _assure_pair # but deprecating
def _assure_zipped(iterables):
""" """
if hasattr(iterables, '__len__'):
if len(iterables) == 1:
return iterables
else:
return zip(*iterables)
else:
return zip(*iterables)
def _is_a_tuple_of_aligned_iterables(x):
return hasattr(x, '__len__') and len(x) == 2
def iterate_over_pairs(pairs):
if _is_a_tuple_of_aligned_iterables(pairs):
yield from zip(*pairs)
else:
yield from pairs # assume pairs is an iterable of pairs
[docs]class FittableSnipper(Snipper):
wf_to_chks: Callable
chk_to_fv: Callable # TODO: Make a "Callable and Fittable" type
fv_to_snip: FvToSnip # TODO: Make a "Callable and Fittable" type
dflt_cls_of_name: dict = {
'wf_to_chks': DfltWfToChk,
'chk_to_fv': PcaChkToFv,
'fv_to_snip': DfltFvToSnip,
}
@classmethod
def mk_component(cls, obj, role=None, assert_func=None):
if isinstance(obj, type):
obj = obj()
elif obj is None:
obj = cls.dflt_cls_of_name.get(role)()
elif not callable(obj):
assert isinstance(obj, dict)
obj = cls.dflt_cls_of_name.get(role)(**obj)
else:
pass # as is
if assert_func is not None:
assert assert_func(obj)
return obj
def __init__(self, wf_to_chks=None, chk_to_fv=None, fv_to_snip=None):
wf_to_chks = self.mk_component(wf_to_chks, 'wf_to_chks', assert_func=callable)
chk_to_fv = self.mk_component(chk_to_fv, 'chk_to_fv', assert_func=callable)
fv_to_snip = self.mk_component(fv_to_snip, 'fv_to_snip', assert_func=callable)
super().__init__(wf_to_chks, chk_to_fv, fv_to_snip)
# TODO: Find a more concise way to take care of block above. Like... (but not working)
# _locals = locals()
# for obj_role in ['wf_to_chks', 'chk_to_fv', 'fv_to_snip', 'snip_to_score']:
# _locals[obj_role] = self.mk_component(_locals[obj_role], obj_role, assert_func=callable)
# TODO: Make the next three methods more DRY (see also ClassificationSnipper.fit_snip_to_score)
def fit_wf_to_chks(self, *wfs_tags):
if hasattr(self.wf_to_chks, 'fit'):
chks, tags = _assure_pair(
wfs_tags
) # need to generalize to situations with no tags
self.wf_to_chks.fit(chks, tags)
return self
def fit_chk_to_fv(self, *chks_tags):
if hasattr(self.chk_to_fv, 'fit'):
chks, tags = _assure_pair(chks_tags)
self.chk_to_fv.fit(chks, tags)
return self
def fit_fv_to_snip(self, *fvs_tags):
if hasattr(self.fv_to_snip, 'fit'):
fvs, tags = _assure_pair(fvs_tags)
self.fv_to_snip.fit(fvs, tags)
return self
# TODO: Dagify all this. Too hardcoded. No caching. No flexibility
def chk_tag_gen(self, wf_tag_gen):
for wf, tag in wf_tag_gen():
for chk in self.wf_to_chks(wf):
yield chk, tag
def fv_tag_gen(self, wf_tag_gen):
for chk, tag in self.chk_tag_gen(wf_tag_gen):
yield self.chk_to_fv(chk), tag
def snip_tag_gen(self, wf_tag_gen):
for fv, tag in self.fv_tag_gen(wf_tag_gen):
yield self.fv_to_snip(fv), tag
def fit_pipeline(self, wf_tags_gen):
self.fit_wf_to_chks(wf_tags_gen())
self.fit_chk_to_fv(self.chk_tag_gen(wf_tags_gen))
self.fit_fv_to_snip(self.fv_tag_gen(wf_tags_gen))
def fit(self, X, y=None):
if y is None:
wf_tags_gen = lambda: itertools.product(X, [y])
else:
wf_tags_gen = lambda: zip(X, y)
return self.fit_pipeline(wf_tags_gen)
[docs]class ClassificationSnipper(FittableSnipper):
snip_to_score: Callable # TODO: Make a "Callable and Fittable" type
dflt_cls_of_name: dict = dict(
FittableSnipper.dflt_cls_of_name, snip_to_score=BayesFactors
)
def __init__(
self,
wf_to_chks=DfltWfToChk(),
chk_to_fv=DfltChkToFv,
fv_to_snip=DfltFvToSnip,
snip_to_score=BayesFactors,
):
super().__init__(wf_to_chks, chk_to_fv, fv_to_snip)
snip_to_score = self.mk_component(
snip_to_score, 'snip_to_score', assert_func=callable
)
self.snip_to_score = snip_to_score
def fit_snip_to_score(self, *snips_tags):
if hasattr(self.snip_to_score, 'fit'):
snips, tags = _assure_pair(snips_tags)
self.snip_to_score.fit(snips, tags)
return self
# TODO: Dagify all this. Too hardcoded. No caching. No flexibility
def fit_pipeline(self, wf_tags_gen):
super().fit_pipeline(wf_tags_gen)
self.fit_snip_to_score(self.snip_tag_gen(wf_tags_gen))
# from slang.snip_stats import ClassifiedMomentsFitter
# from itertools import chain
#
#
# class OutlierSnipper(FittableSnipper):
# snip_to_score: Callable # TODO: Make a "Callable and Fittable" type
#
# dflt_cls_of_name: dict = dict(FittableSnipper.dflt_cls_of_name, snip_to_score=ClassifiedMomentsFitter)
#
# def __init__(self, wf_to_chks=DfltWfToChk(),
# chk_to_fv=DfltChkToFv,
# fv_to_snip=DfltFvToSnip,
# snip_to_score=ClassifiedMomentsFitter):
# super().__init__(wf_to_chks, chk_to_fv, fv_to_snip)
#
# snip_to_score = self.mk_component(snip_to_score, 'snip_to_score', assert_func=callable)
# self.snip_to_score = snip_to_score
#
# def fit_snip_to_score(self, *dist_snips):
# if hasattr(self.snip_to_score, 'fit'):
# dist, snips = _assure_pair(dist_snips)
# self.snip_to_score.fit(dist, snips)
# return self
#
# # TODO: Dagify all this. Too hardcoded. No caching. No flexibility
#
# def fit_pipeline(self, wf_gen):
# # super().fit_pipeline(wf_tags_gen)
# self.fit_wf_to_chks(wf_gen())
# self.fit_chk_to_fv(self.chk_tag_gen(wf_tags_gen))
# fvs = map(self.chk_to_fv, chain.from_iterable(map(self.wf_to_chks,
# self.fit_fv_to_snip(self.fv_tag_gen(wf_tags_gen))
#
# self.fit_snip_to_score(self.snip_tag_gen(wf_tags_gen))
# def snips_of_wf(self, wf: Waveform) -> Snips:
# warn("The name 'snips_of_wf' be replaced by 'wf_to_snips' soon.")
# for chk in self.wf_to_chks(wf):
# fv = self.chk_to_fv(chk)
# yield self.fv_to_snip(fv)
#
# def wf_to_fvs(self, wf: Waveform) -> FVs:
# for chk in self.wf_to_chks(wf):
# yield self.chk_to_fv(chk)
#
# def chk_to_snip(self, chk: Chunk) -> Snip:
# return self.fv_to_snip(self.chk_to_fv(chk))
#
# def wf_to_snips(self, wf: Waveform) -> Snips:
# for chk in self.wf_to_chks(wf):
# fv = self.chk_to_fv(chk)
# yield self.fv_to_snip(fv)
SlangClassifier = ClassificationSnipper # alias for back compatibility