Commit 7ecf02da authored by Sascha Herzinger

Added and fixed tests

parent 82f097e9
@@ -8,7 +8,7 @@ import numpy as np
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared.common import \
apply_subsets, apply_categories
apply_subsets, apply_categories, apply_id_filter
T = TypeVar('T')
@@ -23,15 +23,22 @@ class BoxplotTask(AnalyticTask):
def main(self,
variables: List[pd.DataFrame],
categories: List[pd.DataFrame],
id_filter: List[T],
subsets: List[List[T]]) -> dict:
""" Compute boxplot statistics for the given parameters.
:param variables: List of numerical variables
:param categories: List of categorical variables used to group numerical
variables.
:param id_filter: List of ids that will be considered for analysis. If
empty, all ids will be used.
:param subsets: List of subsets used as another way to group the
numerical variables.
"""
if not len(variables):
raise ValueError("Must at least specify one "
"non empty numerical variable.")
df = reduce(lambda l, r: l.merge(r, on='id', how='outer'), variables)
df = apply_id_filter(df=df, id_filter=id_filter)
df = apply_subsets(df=df, subsets=subsets)
variable_names = df.columns.tolist()
variable_names.remove('id')
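For orientation, the numerical variables are combined into a single frame with an outer merge on 'id', so ids missing from any one frame survive as NaN cells before the filters are applied. A minimal sketch of that reduce step, using two small hypothetical frames:

from functools import reduce
import pandas as pd

# Hypothetical input frames; each carries one numerical variable keyed by 'id'.
df_a = pd.DataFrame([[1, 10], [2, 20]], columns=['id', 'A'])
df_b = pd.DataFrame([[2, 5], [3, 7]], columns=['id', 'B'])

df = reduce(lambda l, r: l.merge(r, on='id', how='outer'), [df_a, df_b])
assert list(df.columns) == ['id', 'A', 'B']
assert df.shape[0] == 3  # union of ids 1, 2, 3; absent cells are NaN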
@@ -43,8 +50,15 @@ class BoxplotTask(AnalyticTask):
for variable in variable_names:
for subset in list(set(df['subset'].tolist())):
for category in list(set(df['category'].tolist())):
values = df[(df['subset'] == subset) & (df['category'] == category)][variable]
values = df[(df['subset'] == subset) & (df['category'] == category)][variable].tolist()
values = [value for value in values if not np.isnan(value)]
if not values:
continue
stats = self.boxplot_statistics(values)
if not results['statistics'].get(variable):
results['statistics'][variable] = {}
if not results['statistics'][variable].get(category):
results['statistics'][variable][category] = {}
results['statistics'][variable][category][subset] = stats
return results
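The loop above nests the statistics by variable, then category, then subset, which is the shape the new boxplot test reads back via results['statistics']['A'][''][0]. A minimal sketch of that nesting, using hypothetical keys and placeholder numbers:

# Hypothetical illustration of results['statistics'][variable][category][subset]
results = {'statistics': {}}
variable, category, subset = 'A', '', 0                   # assumed example keys
stats = {'median': 42.0, 'l_qrt': 30.0, 'u_qrt': 55.0}    # placeholder values
results['statistics'].setdefault(variable, {}).setdefault(category, {})[subset] = stats
assert results['statistics']['A'][''][0] is stats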
@@ -58,10 +72,16 @@ class BoxplotTask(AnalyticTask):
median = np.percentile(values, 50)
u_qrt = np.percentile(values, 75)
iqr = u_qrt - l_qrt
values.sort()
# lower whisker as defined by John W. Tukey
l_wsk = next(value for value in values if value >= l_qrt - 1.5 * iqr)
values.sort(reverse=True)
# upper whisker as defined by John W. Tukey
u_wsk = next(value for value in values if value <= u_qrt + 1.5 * iqr)
return {
'l_qrt': l_qrt,
'median': median,
'u_qrt': u_qrt,
'l_wsk': l_qrt - 1.5 * iqr,
'u_wsk': u_qrt + 1.5 * iqr
'l_wsk': l_wsk,
'u_wsk': u_wsk
}
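To illustrate the whisker change: the whiskers are now the most extreme observations that still lie within 1.5·IQR of the quartiles (Tukey's rule), rather than the fence values themselves. A minimal sketch with made-up sample values:

import numpy as np

values = [1.0, 2.0, 3.0, 4.0, 100.0]  # hypothetical sample with one outlier
l_qrt, u_qrt = np.percentile(values, 25), np.percentile(values, 75)
iqr = u_qrt - l_qrt
# Most extreme values inside the Tukey fences, not the fences themselves.
l_wsk = min(v for v in values if v >= l_qrt - 1.5 * iqr)
u_wsk = max(v for v in values if v <= u_qrt + 1.5 * iqr)
assert (l_wsk, u_wsk) == (1.0, 4.0)  # 100.0 lies above the upper fence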
"""Module containing the Celery Task for the Correlation Analysis."""
from typing import List, TypeVar, Tuple
from functools import reduce
import pandas as pd
import numpy as np
@@ -8,7 +8,7 @@ from scipy import stats
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared.common import \
apply_subsets, apply_categories
apply_subsets, apply_categories, apply_id_filter
T = TypeVar('T')
@@ -45,7 +45,7 @@ class CorrelationTask(AnalyticTask):
df = self.merge_x_y(x, y)
(x_label, y_label) = self.get_axis_labels(df)
df = self.apply_id_filter(df, id_filter)
df = apply_id_filter(df=df, id_filter=id_filter)
df = apply_subsets(df=df, subsets=subsets)
df = apply_categories(df=df, categories=annotations)
global_stats = self.compute_stats(df, method, x_label, y_label)
@@ -87,19 +87,6 @@ class CorrelationTask(AnalyticTask):
y_label = colnames[1]
return (x_label, y_label)
@staticmethod
def apply_id_filter(df: pd.DataFrame, id_filter: list) -> pd.DataFrame:
"""Throw away all rows whose id is not in id_filter.
:param df: The DataFrame to filter.
:param id_filter: The filter.
:return: The filtered DataFrame.
"""
if id_filter:
df = df[df['id'].isin(id_filter)]
if df.shape[0] == 0:
raise ValueError("The current selection does not match any data.")
return df
@staticmethod
def compute_stats(df: pd.DataFrame, method: str,
x_label: str, y_label: str) -> dict:
......
@@ -44,7 +44,7 @@ def apply_categories(df: pd.DataFrame,
:param categories: List of category DataFrames
:return: The base DataFrame with an additional 'category' column
"""
if categories:
if len(categories):
# merge all dfs into one
data = reduce(lambda l, r: l.merge(r, on='id', how='outer'), categories)
# remember ids
@@ -62,4 +62,19 @@ def apply_categories(df: pd.DataFrame,
# merge category data into main df
df = df.merge(data, on='id', how='left')
# get unique categories
return df
\ No newline at end of file
else:
df = df.assign(category='')
return df
def apply_id_filter(df: pd.DataFrame, id_filter: list) -> pd.DataFrame:
"""Throw away all rows whose id is not in id_filter.
:param df: The DataFrame to filter.
:param id_filter: The filter.
:return: The filtered DataFrame.
"""
if id_filter:
df = df[df['id'].isin(id_filter)]
if df.shape[0] == 0:
raise ValueError("The current selection does not match any data.")
return df
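A minimal usage sketch of the two helpers touched in this file, assuming a tiny made-up DataFrame: with no categories supplied, apply_categories now assigns an empty-string 'category' column, and the shared apply_id_filter drops rows whose id is not in the filter:

import pandas as pd
from fractalis.analytics.tasks.shared import common

df = pd.DataFrame([[1, 10], [2, 20], [3, 30]], columns=['id', 'A'])  # hypothetical data
df = common.apply_categories(df=df, categories=[])    # empty list -> category == '' everywhere
df = common.apply_id_filter(df=df, id_filter=[1, 3])  # keep only ids 1 and 3
assert set(df['category']) == {''}
assert list(df['id']) == [1, 3]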
"""This module contains the tests for the Boxplot analysis code."""
import json
import numpy as np
import pandas as pd
from fractalis.analytics.tasks.boxplot.main import BoxplotTask
# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
class TestBoxplotAnalytics:
pass
task = BoxplotTask()
def test_functional_1(self):
arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_3 = np.c_[range(5, 15), np.random.randint(0, 100, size=(10, 1))]
arr_4 = np.c_[range(100, 102), np.random.randint(0, 100, size=(2, 1))]
df_1 = pd.DataFrame(arr_1, columns=['id', 'A'])
df_2 = pd.DataFrame(arr_2, columns=['id', 'B'])
df_3 = pd.DataFrame(arr_3, columns=['id', 'C'])
df_4 = pd.DataFrame(arr_4, columns=['id', 'D'])
results = self.task.main(variables=[df_1, df_2, df_3, df_4],
categories=[],
id_filter=[],
subsets=[])
assert 'data' in results
assert 'statistics' in results
assert len(json.loads(results['data'])) == 22
assert len(results['statistics']) == 5
assert len(results['statistics']['A']) == 1
assert len(results['statistics']['A']['']) == 1
stats = results['statistics']['A'][''][0]
assert not np.isnan(stats['median'])
assert not np.isnan(stats['l_qrt'])
assert not np.isnan(stats['u_qrt'])
assert not np.isnan(stats['l_wsk'])
assert not np.isnan(stats['u_wsk'])
@@ -11,18 +11,19 @@ from fractalis.analytics.tasks.correlation.main import CorrelationTask
# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
class TestCorrelation:
task = CorrelationTask()
def test_functional_1(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(20))],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(20))],
annotations=[])
assert result['coef']
assert result['p_value']
assert result['slope']
@@ -34,128 +35,120 @@ class TestCorrelation:
assert result['y_label'] == 'B'
def test_functional_2(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=list(range(10)),
method='pearson',
subsets=[list(range(5, 15))],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=list(range(10)),
method='pearson',
subsets=[list(range(5, 15))],
annotations=[])
df = json.loads(result['data'])
assert len(df) == 5
def test_functional_3(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result_1 = task.main(x=x,
y=y,
id_filter=list(range(20)),
method='pearson',
subsets=[],
annotations=[])
result_2 = task.main(x=x,
y=y,
id_filter=list(range(20)),
method='pearson',
subsets=[list(range(20))],
annotations=[])
result_1 = self.task.main(x=x,
y=y,
id_filter=list(range(20)),
method='pearson',
subsets=[],
annotations=[])
result_2 = self.task.main(x=x,
y=y,
id_filter=list(range(20)),
method='pearson',
subsets=[list(range(20))],
annotations=[])
assert result_1 == result_2
def test_functional_4(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
with pytest.raises(ValueError):
task.main(x=x,
y=y,
id_filter=[],
method='foo',
subsets=[list(range(20))],
annotations=[])
self.task.main(x=x,
y=y,
id_filter=[],
method='foo',
subsets=[list(range(20))],
annotations=[])
def test_functional_5(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(15, 25))],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(15, 25))],
annotations=[])
df = json.loads(result['data'])
assert len(df) == 5
def test_functional_6(self):
task = CorrelationTask()
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20, 40), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
with pytest.raises(ValueError):
task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(20))],
annotations=[])
self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(20))],
annotations=[])
def test_functional_7(self):
task = CorrelationTask()
arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
with pytest.raises(ValueError):
task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(10, 20))],
annotations=[])
self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(10, 20))],
annotations=[])
def test_functional_8(self):
task = CorrelationTask()
arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(5, 20))],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(5, 20))],
annotations=[])
df = json.loads(result['data'])
assert len(df) == 5
def test_functional_9(self):
task = CorrelationTask()
arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[
list(range(5)),
list(range(5, 10)),
list(range(10, 20))
],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[
list(range(5)),
list(range(5, 10)),
list(range(10, 20))
],
annotations=[])
assert not np.isnan(result['coef'])
assert len(result['subsets']) == 3
assert np.isnan(result['subsets'][0]['coef'])
@@ -163,62 +156,50 @@ class TestCorrelation:
assert np.isnan(result['subsets'][2]['coef'])
def test_functional_10(self):
task = CorrelationTask()
arr_1 = np.c_[range(2), np.random.randint(0, 100, size=(2, 1))]
arr_2 = np.c_[range(1, 3), np.random.randint(0, 100, size=(2, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])
result = task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(4))],
annotations=[])
result = self.task.main(x=x,
y=y,
id_filter=[],
method='pearson',
subsets=[list(range(4))],
annotations=[])
df = json.loads(result['data'])
assert np.isnan(result['coef'])
assert len(df) == 1
def test_merge_x_y(self):
task = CorrelationTask()
df1 = pd.DataFrame([[1, 'a'], [2, float('nan')], [3, 'a']],
columns=['id', 'A'])
df2 = pd.DataFrame([[1, 'b'], [2, 'b'], [4, 'b']],
columns=['id', 'B'])
result = task.merge_x_y(df1, df2)
result = self.task.merge_x_y(df1, df2)
assert result.shape[0] == 1
assert list(result['id']) == [1]
def test_apply_id_filter(self):
task = CorrelationTask()
df = pd.DataFrame([[1, 'a'], [2, 'a']], columns=['id', 'A'])
result = task.apply_id_filter(df, [2, 3])
assert result.shape[0] == 1
assert list(result['id']) == [2]
def test_compute_stats_1(self):
task = CorrelationTask()
df = pd.DataFrame([[1, 1, 2],
[2, 3, 4],
[3, 5, 6]], columns=['id', 'A', 'B'])
result = task.compute_stats(df, 'pearson', 'A', 'B')
result = self.task.compute_stats(df, 'pearson', 'A', 'B')
assert not np.isnan(result['coef'])
assert not np.isnan(result['p_value'])
assert not np.isnan(result['slope'])
assert not np.isnan(result['intercept'])
def test_compute_stats_2(self):
task = CorrelationTask()
df = pd.DataFrame([[1, 1, 2],
[2, float('nan'), 4],
[3, 5, float('nan')]], columns=['id', 'A', 'B'])
result = task.compute_stats(df, 'pearson', 'A', 'B')
result = self.task.compute_stats(df, 'pearson', 'A', 'B')
assert np.isnan(result['coef'])
assert np.isnan(result['p_value'])
assert np.isnan(result['slope'])
assert np.isnan(result['intercept'])
def test_get_axis_labels(self):
task = CorrelationTask()
df = pd.DataFrame([['a', 1, 'b']], columns=['A', 'id', 'B'])
result = task.get_axis_labels(df)
result = self.task.get_axis_labels(df)
assert result == ('A', 'B')
@@ -28,3 +28,9 @@ class TestCommonTasks:
result = common.apply_categories(df=df, categories=[category_1, category_2])
assert list(result['category'])[0:2] == ['x&&y', 'y&&z']
assert np.isnan(list(result['category'])[2])
def test_apply_id_filter(self):
df = pd.DataFrame([[1, 'a'], [2, 'a']], columns=['id', 'A'])
result = common.apply_id_filter(df=df, id_filter=[2, 3])
assert result.shape[0] == 1
assert list(result['id']) == [2]