Fixed a bug where rows and columns were mixed up

8819e3bb · Sascha Herzinger · c9f6e3fc · 8819e3bb · 8819e3bb
Commit 8819e3bb authored 7 years ago by Sascha Herzinger
--- a/fractalis/analytics/tasks/heatmap/cluster.py
+++ b/fractalis/analytics/tasks/heatmap/cluster.py
@@ -17,14 +17,14 @@ logger = logging.getLogger(__name__)

 class ClusteringTask(AnalyticTask):

-    name = 'compute-clustering'
+    name = 'compute-cluster'

-    def main(self, df: str, cluster_algo: str,
+    def main(self, df: dict, cluster_algo: str,
             options: dict) -> dict:
        try:
-            df = pd.read_json(df)
+            df = pd.DataFrame.from_dict(df)
        except Exception:
-            error = "Failed to parse string to data frame."
+            error = "Failed to parse input data frame."
            logger.error(error)
            raise ValueError(error)
        # fill NAs with col medians so the clustering algorithms will work
@@ -49,9 +49,9 @@ class ClusteringTask(AnalyticTask):
                    "perform a hierarchical clustering."
            logger.error(error)
            raise ValueError(error)
-        row_names, row_clusters = self._hclust(df.T, method,
+        row_names, row_clusters = self._hclust(df, method,
                                               metric, n_row_clusters)
-        col_names, col_clusters = self._hclust(df, method,
+        col_names, col_clusters = self._hclust(df.T, method,
                                               metric, n_col_clusters)
        return {
            'row_names': row_names,
@@ -62,9 +62,9 @@ class ClusteringTask(AnalyticTask):

    def _hclust(self, df: pd.DataFrame,
                method: str, metric: str, n_clusters: int) -> Tuple[List, List]:
-        names = list(df)
-        series = np.array(df)
-        z = hclust.linkage(series, method=method, metric=metric)
+        names = list(df.index)
+        values = df.values
+        z = hclust.linkage(values, method=method, metric=metric)
        cluster = [x[0] for x in hclust.cut_tree(z,
                                                 n_clusters=[n_clusters])]
        cluster_count = Counter(cluster)
@@ -93,8 +93,8 @@ class ClusteringTask(AnalyticTask):
            logger.error(error)
            raise ValueError(error)

-        row_names, row_clusters = self._kmeans(df.T, n_row_centroids)
-        col_names, col_clusters = self._kmeans(df, n_col_centroids)
+        row_names, row_clusters = self._kmeans(df, n_row_centroids)
+        col_names, col_clusters = self._kmeans(df.T, n_col_centroids)
        return {
            'row_names': row_names,
            'col_names': col_names,
@@ -103,9 +103,9 @@ class ClusteringTask(AnalyticTask):
        }

    def _kmeans(self, df: pd.DataFrame, n_centroids) -> Tuple[List, List]:
-        names = list(df)
-        series = np.array(df).astype('float')
-        cluster = list(kmeans2(series, k=n_centroids, minit='points')[1])
+        names = list(df.index)
+        values = df.as_matrix().astype('float')
+        cluster = list(kmeans2(values, k=n_centroids, minit='points')[1])
        cluster_count = Counter(cluster)
        # sort elements by their cluster size
        sorted_cluster = sorted(zip(names, cluster),

--- a/tests/heatmap/test_cluster.py
+++ b/tests/heatmap/test_cluster.py
@@ -13,7 +13,7 @@ class TestClustering:

    task = ClusteringTask()

-    valid_df = json.dumps({
+    df = {
        'A': {
            'a': 50,
            'b': 2,
@@ -29,20 +29,9 @@ class TestClustering:
            'b': 4,
            'c': 60
        }
-    })
+    }

    def test_hclust_raises_with_invalid_param_1(self):
-        with pytest.raises(ValueError) as e:
-            options = {
-                'method': 'single',
-                'metric': 'euclidean',
-                'n_row_clusters': 2,
-                'n_col_clusters': 2
-            }
-            self.task.main(df='{//foo', cluster_algo='hclust', options=options)
-            assert 'parse string to data frame' in e
-
-    def test_hclust_raises_with_invalid_param_2(self):
        with pytest.raises(ValueError) as e:
            options = {
                'method': 'abc',
@@ -50,11 +39,10 @@ class TestClustering:
                'n_row_clusters': 2,
                'n_col_clusters': 2
            }
-            self.task.main(df=self.valid_df,
-                           cluster_algo='hclust', options=options)
+            self.task.main(df=self.df, cluster_algo='hclust', options=options)
            assert 'Invalid method' in e

-    def test_hclust_raises_with_invalid_param_3(self):
+    def test_hclust_raises_with_invalid_param_2(self):
        with pytest.raises(ValueError) as e:
            options = {
                'method': 'single',
@@ -62,19 +50,17 @@ class TestClustering:
                'n_row_clusters': 2,
                'n_col_clusters': 2
            }
-            self.task.main(df=self.valid_df,
-                           cluster_algo='hclust', options=options)
+            self.task.main(df=self.df, cluster_algo='hclust', options=options)
            assert 'Invalid metric' in e

-    def test_hclust_raises_with_invalid_param_4(self):
+    def test_hclust_raises_with_invalid_param_3(self):
        with pytest.raises(ValueError) as e:
            options = {
                'method': 'single',
                'metric': 'abc',
                'n_row_clusters': 2,
            }
-            self.task.main(df=self.valid_df,
-                           cluster_algo='hclust', options=options)
+            self.task.main(df=self.df, cluster_algo='hclust', options=options)
            assert 'mandatory parameters' in e

    def test_hclust_returns_valid_result(self):
@@ -84,7 +70,7 @@ class TestClustering:
            'n_row_clusters': 2,
            'n_col_clusters': 2
        }
-        result = self.task.main(df=self.valid_df,
+        result = self.task.main(df=self.df,
                                cluster_algo='hclust', options=options)
        assert 'row_names' in result
        assert 'col_names' in result
@@ -95,33 +81,21 @@ class TestClustering:
        assert [0, 0, 1] == result['row_cluster']
        assert [0, 0, 1] == result['col_cluster']

-
    def test_kmean_raises_with_invalid_param_1(self):
-        with pytest.raises(ValueError) as e:
-            options = {
-                'n_row_centroids': 2,
-                'n_col_centroids': 2
-            }
-            self.task.main(df='{//foo', cluster_algo='kmeans', options=options)
-            assert 'parse string to data frame' in e
-
-    def test_kmean_raises_with_invalid_param_2(self):
        with pytest.raises(ValueError) as e:
            options = {
                'n_row_centroids': 2,
                'n_col_centroids': 'abc'
            }
-            self.task.main(df=self.valid_df,
-                           cluster_algo='kmeans', options=options)
+            self.task.main(df=self.df, cluster_algo='kmeans', options=options)
            assert 'invalid' in e

-    def test_kmean_raises_with_invalid_param_3(self):
+    def test_kmean_raises_with_invalid_param_2(self):
        with pytest.raises(ValueError) as e:
            options = {
                'n_row_centroids': 2,
            }
-            self.task.main(df=self.valid_df,
-                           cluster_algo='kmeans', options=options)
+            self.task.main(df=self.df, cluster_algo='kmeans', options=options)
            assert 'mandatory parameters' in e

    def test_kmean_returns_valid_result(self):
@@ -129,7 +103,7 @@ class TestClustering:
            'n_row_centroids': 2,
            'n_col_centroids': 2
        }
-        result = self.task.main(df=self.valid_df,
+        result = self.task.main(df=self.df,
                                cluster_algo='kmeans', options=options)
        assert 'row_names' in result
        assert 'col_names' in result