Commit a8019dc5 authored by Sascha Herzinger

Heavily improved correlation analysis and added tests

parent 27c49bc6

+from typing import List
 import pandas as pd
 import numpy as np
 from scipy import stats
@@ -9,14 +11,66 @@ class CorrelationTask(AnalyticTask):
     name = 'compute-correlation'

-    def main(self, x, y, ids, method='pearson'):
+    def main(self, x: pd.DataFrame, y: pd.DataFrame, id_filter: List[str],
+             method: str, subsets: List[List[str]]) -> dict:
+        if x.shape[0] == 0 or y.shape[0] == 0:
+            raise ValueError("X or Y contain no data.")
+        if x.shape[1] < 2 or y.shape[1] < 2:
+            raise ValueError("X or Y are malformed.")
+        if method not in ['pearson', 'spearman', 'kendall']:
+            raise ValueError("Unknown method '{}'".format(method))
+        if len(subsets) == 0:
+            raise ValueError("No subsets specified.")
         df = pd.merge(x, y, on='id')
         df = df.dropna()
-        if ids:
-            df = df[df['id'].isin(ids)]
-        df_noid = df.drop('id', 1)
-        x_list = df_noid.ix[:, 0].values.tolist()
-        y_list = df_noid.ix[:, 1].values.tolist()
+        if df.shape[0] == 0:
+            raise ValueError("X and Y do not share any ids.")
+        if id_filter:
+            df = df[df['id'].isin(id_filter)]
+        if df.shape[0] == 0:
+            raise ValueError("The current selection does not match any data.")
+        output = {
+            'subsets': {}
+        }
+        _df = pd.DataFrame()
+        for i, subset in enumerate(subsets):
+            df_subset = df[df['id'].isin(subset)]
+            subset_col = pd.Series([i] * df_subset.shape[0])
+            df_subset = df_subset.assign(subset=subset_col)
+            output['subsets'][i] = self.compute_stats(df_subset)
+            _df = _df.append(df_subset)
+        df = _df
+        del _df
+        if df.shape[0] == 0:
+            raise ValueError("No data match given subsets. Keep in mind that X "
+                             "and Y are intersected before the subsets are "
+                             "applied.")
+        global_stats = self.compute_stats(df.drop_duplicates('id'))
+        output.update(global_stats)
+        output['method'] = method
+        output['data'] = df.to_json()
+        output['x_label'] = list(df)[0]
+        output['y_label'] = list(df)[1]
+        return output
+
+    @staticmethod
+    def compute_stats(df: pd.DataFrame) -> dict:
+        if df.shape[0] < 2:
+            return {
+                'coef': float('nan'),
+                'p_value': float('nan'),
+                'slope': float('nan'),
+                'intercept': float('nan')
+            }
+        df = df.drop('id', 1)
+        x_list = df.ix[:, 0].values.tolist()
+        y_list = df.ix[:, 1].values.tolist()
         corr_coef, p_value = stats.pearsonr(x_list, y_list)
         slope, intercept, *_ = np.polyfit(x_list, y_list, deg=1)
         return {
@@ -24,8 +78,4 @@ class CorrelationTask(AnalyticTask):
             'p_value': p_value,
             'slope': slope,
             'intercept': intercept,
-            'method': method,
-            'data': df.to_json(),
-            'x_label': list(df_noid)[0],
-            'y_label': list(df_noid)[1]
         }
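
For reference, a minimal usage sketch of the reworked interface follows. The DataFrames are toy data shaped like the ones in the tests below; the call uses the new id_filter/subsets signature from the diff above. Running it as-is requires a pandas release that still provides DataFrame.ix and DataFrame.append, both of which the task relies on, i.e. a version contemporary with this commit.

import numpy as np
import pandas as pd

from fractalis.analytics.tasks.correlation.main import CorrelationTask

# Toy data: ids 0..19 shared between both frames (illustrative values only).
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])

task = CorrelationTask()
result = task.main(x=x, y=y,
                   id_filter=[],      # empty filter keeps every shared id
                   method='pearson',  # accepted: 'pearson', 'spearman', 'kendall'
                   subsets=[list(range(10)), list(range(10, 20))])

# Global statistics plus one statistics dict per subset.
print(result['coef'], result['p_value'], result['slope'], result['intercept'])
print(result['subsets'][0], result['subsets'][1])
print(result['method'], result['x_label'], result['y_label'])
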
import json

import pytest
import pandas as pd
import numpy as np

from fractalis.analytics.tasks.correlation.main import CorrelationTask


# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
class TestCorrelation:
    def test_returns_expected_output_1(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(20))])
        assert result['coef']
        assert result['p_value']
        assert result['slope']
        assert result['intercept']
        assert result['subsets']
        assert result['method']
        assert result['data']
        assert result['x_label']
        assert result['y_label']

    def test_returns_expected_output_2(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=list(range(10)),
                           method='pearson', subsets=[list(range(5, 15))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_3(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=list(range(20)),
                      method='pearson', subsets=[])

    def test_returns_expected_output_4(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='foo', subsets=[list(range(20))])

    def test_returns_expected_output_5(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(15, 25))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_6(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20, 40), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='pearson', subsets=[list(range(20))])

    def test_returns_expected_output_7(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='pearson', subsets=[list(range(10, 20))])

    def test_returns_expected_output_8(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(5, 20))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_9(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[], method='pearson',
                           subsets=[
                               list(range(5)),
                               list(range(5, 10)),
                               list(range(10, 20))
                           ])
        assert not np.isnan(result['coef'])
        assert len(result['subsets']) == 3
        assert np.isnan(result['subsets'][0]['coef'])
        assert not np.isnan(result['subsets'][1]['coef'])
        assert np.isnan(result['subsets'][2]['coef'])

    def test_returns_expected_output_10(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(2), np.random.randint(0, 100, size=(2, 1))]
        arr_2 = np.c_[range(1, 3), np.random.randint(0, 100, size=(2, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[], method='pearson',
                           subsets=[list(range(4))])
        df = json.loads(result['data'])
        assert np.isnan(result['coef'])
        assert len(df['id']) == 1
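
A note on the NaN assertions in tests 9 and 10 above: compute_stats returns NaN for every statistic whenever fewer than two rows remain after intersecting the ids and applying a subset, and otherwise computes a Pearson coefficient plus a degree-1 least-squares fit. The following standalone sketch mirrors that logic for illustration only; it is not the task's helper, and it uses iloc where the task uses the older ix accessor.

import numpy as np
import pandas as pd
from scipy import stats


def compute_stats_sketch(df: pd.DataFrame) -> dict:
    # Fewer than two points: neither a correlation nor a line fit is defined.
    if df.shape[0] < 2:
        nan = float('nan')
        return {'coef': nan, 'p_value': nan, 'slope': nan, 'intercept': nan}
    values = df.drop(columns='id')
    x_list = values.iloc[:, 0].tolist()
    y_list = values.iloc[:, 1].tolist()
    coef, p_value = stats.pearsonr(x_list, y_list)        # correlation and p-value
    slope, intercept = np.polyfit(x_list, y_list, deg=1)  # least-squares line
    return {'coef': coef, 'p_value': p_value, 'slope': slope, 'intercept': intercept}


one_shared_id = pd.DataFrame({'id': [1], 'A': [42.0], 'B': [7.0]})
five_shared_ids = pd.DataFrame({'id': range(5), 'A': range(5), 'B': [1, 3, 2, 5, 4]})

print(compute_stats_sketch(one_shared_id))    # all NaN, as in test 10
print(compute_stats_sketch(five_shared_ids))  # finite coef, p_value, slope, intercept
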
import json

import pytest
import numpy as np

from fractalis.analytics.tasks.correlation.main import CorrelationTask


class TestCorrelation:

    @pytest.mark.skip(reason="Not implemented yet.")
    def test_returns_valid_response(self):
        job = CorrelationTask()
        x = np.random.rand(10).tolist()
        y = np.random.rand(10).tolist()
        result = job.main(x=x, y=y, ids=[])
        try:
            result = json.loads(result)
        except ValueError:
            assert False
        assert result.coef
        assert result.p_value
        assert result.slope
        assert result.intercept
        assert result.data
        assert result.x_label
        assert result.y_label