Commit a8019dc5 authored by Sascha Herzinger

Heavily improved correlation analysis and added tests

parent 27c49bc6

+from typing import List
 import pandas as pd
 import numpy as np
 from scipy import stats
@@ -9,14 +11,66 @@ class CorrelationTask(AnalyticTask):
     name = 'compute-correlation'

-    def main(self, x, y, ids, method='pearson'):
+    def main(self, x: pd.DataFrame, y: pd.DataFrame, id_filter: List[str],
+             method: str, subsets: List[List[str]]) -> dict:
+        if x.shape[0] == 0 or y.shape[0] == 0:
+            raise ValueError("X or Y contain no data.")
+        if x.shape[1] < 2 or y.shape[1] < 2:
+            raise ValueError("X or Y are malformed.")
+        if method not in ['pearson', 'spearman', 'kendall']:
+            raise ValueError("Unknown method '{}'".format(method))
+        if len(subsets) == 0:
+            raise ValueError("No subsets specified.")
         df = pd.merge(x, y, on='id')
         df = df.dropna()
-        if ids:
-            df = df[df['id'].isin(ids)]
-        df_noid = df.drop('id', 1)
-        x_list = df_noid.ix[:, 0].values.tolist()
-        y_list = df_noid.ix[:, 1].values.tolist()
+        if df.shape[0] == 0:
+            raise ValueError("X and Y do not share any ids.")
+        if id_filter:
+            df = df[df['id'].isin(id_filter)]
+        if df.shape[0] == 0:
+            raise ValueError("The current selection does not match any data.")
+        output = {
+            'subsets': {}
+        }
+        _df = pd.DataFrame()
+        for i, subset in enumerate(subsets):
+            df_subset = df[df['id'].isin(subset)]
+            subset_col = pd.Series([i] * df_subset.shape[0])
+            df_subset = df_subset.assign(subset=subset_col)
+            output['subsets'][i] = self.compute_stats(df_subset)
+            _df = _df.append(df_subset)
+        df = _df
+        del _df
+        if df.shape[0] == 0:
+            raise ValueError("No data match given subsets. Keep in mind that X "
+                             "and Y are intersected before the subsets are "
+                             "applied.")
+        global_stats = self.compute_stats(df.drop_duplicates('id'))
+        output.update(global_stats)
+        output['method'] = method
+        output['data'] = df.to_json()
+        output['x_label'] = list(df)[0]
+        output['y_label'] = list(df)[1]
+        return output
+
+    @staticmethod
+    def compute_stats(df: pd.DataFrame) -> dict:
+        if df.shape[0] < 2:
+            return {
+                'coef': float('nan'),
+                'p_value': float('nan'),
+                'slope': float('nan'),
+                'intercept': float('nan')
+            }
+        df = df.drop('id', 1)
+        x_list = df.ix[:, 0].values.tolist()
+        y_list = df.ix[:, 1].values.tolist()
         corr_coef, p_value = stats.pearsonr(x_list, y_list)
         slope, intercept, *_ = np.polyfit(x_list, y_list, deg=1)
         return {
@@ -24,8 +78,4 @@ class CorrelationTask(AnalyticTask):
             'p_value': p_value,
             'slope': slope,
             'intercept': intercept,
-            'method': method,
-            'data': df.to_json(),
-            'x_label': list(df_noid)[0],
-            'y_label': list(df_noid)[1]
         }
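
For reference, a minimal usage sketch of the reworked interface follows. The DataFrames are toy data shaped like the ones in the tests below; the call uses the new id_filter/subsets signature from the diff above. Running it as-is requires a pandas release that still provides DataFrame.ix and DataFrame.append, both of which the task relies on, i.e. a version contemporary with this commit.

import numpy as np
import pandas as pd

from fractalis.analytics.tasks.correlation.main import CorrelationTask

# Toy data: ids 0..19 shared between both frames (illustrative values only).
arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
x = pd.DataFrame(arr_1, columns=['id', 'A'])
y = pd.DataFrame(arr_2, columns=['id', 'B'])

task = CorrelationTask()
result = task.main(x=x, y=y,
                   id_filter=[],      # empty filter keeps every shared id
                   method='pearson',  # accepted: 'pearson', 'spearman', 'kendall'
                   subsets=[list(range(10)), list(range(10, 20))])

# Global statistics plus one statistics dict per subset.
print(result['coef'], result['p_value'], result['slope'], result['intercept'])
print(result['subsets'][0], result['subsets'][1])
print(result['method'], result['x_label'], result['y_label'])
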
import json

import pytest
import pandas as pd
import numpy as np

from fractalis.analytics.tasks.correlation.main import CorrelationTask


# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
class TestCorrelation:
    def test_returns_expected_output_1(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(20))])
        assert result['coef']
        assert result['p_value']
        assert result['slope']
        assert result['intercept']
        assert result['subsets']
        assert result['method']
        assert result['data']
        assert result['x_label']
        assert result['y_label']

    def test_returns_expected_output_2(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=list(range(10)),
                           method='pearson', subsets=[list(range(5, 15))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_3(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=list(range(20)),
                      method='pearson', subsets=[])

    def test_returns_expected_output_4(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='foo', subsets=[list(range(20))])

    def test_returns_expected_output_5(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(15, 25))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_6(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(20), np.random.randint(0, 100, size=(20, 1))]
        arr_2 = np.c_[range(20, 40), np.random.randint(0, 100, size=(20, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='pearson', subsets=[list(range(20))])

    def test_returns_expected_output_7(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        with pytest.raises(ValueError):
            task.main(x=x, y=y, id_filter=[],
                      method='pearson', subsets=[list(range(10, 20))])

    def test_returns_expected_output_8(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[],
                           method='pearson', subsets=[list(range(5, 20))])
        df = json.loads(result['data'])
        assert len(df['id']) == 5

    def test_returns_expected_output_9(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(10), np.random.randint(0, 100, size=(10, 1))]
        arr_2 = np.c_[range(5, 20), np.random.randint(0, 100, size=(15, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[], method='pearson',
                           subsets=[
                               list(range(5)),
                               list(range(5, 10)),
                               list(range(10, 20))
                           ])
        assert not np.isnan(result['coef'])
        assert len(result['subsets']) == 3
        assert np.isnan(result['subsets'][0]['coef'])
        assert not np.isnan(result['subsets'][1]['coef'])
        assert np.isnan(result['subsets'][2]['coef'])

    def test_returns_expected_output_10(self):
        task = CorrelationTask()
        arr_1 = np.c_[range(2), np.random.randint(0, 100, size=(2, 1))]
        arr_2 = np.c_[range(1, 3), np.random.randint(0, 100, size=(2, 1))]
        x = pd.DataFrame(arr_1, columns=['id', 'A'])
        y = pd.DataFrame(arr_2, columns=['id', 'B'])
        result = task.main(x=x, y=y, id_filter=[], method='pearson',
                           subsets=[list(range(4))])
        df = json.loads(result['data'])
        assert np.isnan(result['coef'])
        assert len(df['id']) == 1
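
A note on the NaN assertions in tests 9 and 10 above: compute_stats returns NaN for every statistic whenever fewer than two rows remain after intersecting the ids and applying a subset, and otherwise computes a Pearson coefficient plus a degree-1 least-squares fit. The following standalone sketch mirrors that logic for illustration only; it is not the task's helper, and it uses iloc where the task uses the older ix accessor.

import numpy as np
import pandas as pd
from scipy import stats


def compute_stats_sketch(df: pd.DataFrame) -> dict:
    # Fewer than two points: neither a correlation nor a line fit is defined.
    if df.shape[0] < 2:
        nan = float('nan')
        return {'coef': nan, 'p_value': nan, 'slope': nan, 'intercept': nan}
    values = df.drop(columns='id')
    x_list = values.iloc[:, 0].tolist()
    y_list = values.iloc[:, 1].tolist()
    coef, p_value = stats.pearsonr(x_list, y_list)        # correlation and p-value
    slope, intercept = np.polyfit(x_list, y_list, deg=1)  # least-squares line
    return {'coef': coef, 'p_value': p_value, 'slope': slope, 'intercept': intercept}


one_shared_id = pd.DataFrame({'id': [1], 'A': [42.0], 'B': [7.0]})
five_shared_ids = pd.DataFrame({'id': range(5), 'A': range(5), 'B': [1, 3, 2, 5, 4]})

print(compute_stats_sketch(one_shared_id))    # all NaN, as in test 10
print(compute_stats_sketch(five_shared_ids))  # finite coef, p_value, slope, intercept
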
import json

import pytest
import numpy as np

from fractalis.analytics.tasks.correlation.main import CorrelationTask


class TestCorrelation:

    @pytest.mark.skip(reason="Not implemented yet.")
    def test_returns_valid_response(self):
        job = CorrelationTask()
        x = np.random.rand(10).tolist()
        y = np.random.rand(10).tolist()
        result = job.main(x=x, y=y, ids=[])
        try:
            result = json.loads(result)
        except ValueError:
            assert False
        assert result.coef
        assert result.p_value
        assert result.slope
        assert result.intercept
        assert result.data
        assert result.x_label
        assert result.y_label