Skip to content
Snippets Groups Projects
Commit 7bf7cc34 authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

Added sorting and cutting to heatmap algo

parent 0a6d4fe7
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,6 @@ from functools import reduce
import logging
import pandas as pd
from scipy.stats import zscore
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
......@@ -28,6 +27,7 @@ class HeatmapTask(AnalyticTask):
categoricals: List[pd.DataFrame],
ranking_method: str,
id_filter: List[T],
max_rows: int,
subsets: List[List[T]]) -> dict:
# merge input data into single df
df = reduce(lambda a, b: a.append(b), numerical_arrays)
......@@ -49,18 +49,13 @@ class HeatmapTask(AnalyticTask):
"the subset sample ids do not match the data."
logger.error(error)
raise ValueError(error)
for subset in subsets:
if not subset:
error = "One or more of the specified subsets does not " \
"match any sample id for the given array data."
logger.error(error)
raise ValueError(error)
# make matrix of input data
_df = df.pivot(index='feature', columns='id', values='value')
# create z-score matrix used for visualising the heatmap
z_df = _df.apply(zscore, axis=1)
z_df = _df.apply(lambda row: (row - row.mean()) / row.std(ddof=0),
axis=1)
# compute statistic for ranking
stats = self.stat_task.main(df=_df, subsets=subsets,
......@@ -73,6 +68,14 @@ class HeatmapTask(AnalyticTask):
df = df.merge(z_df, on=['id', 'feature'])
df.columns = ['id', 'feature', 'value', 'zscore']
# sort by ranking_value
df['sort_value'] = df['feature'].apply(
lambda x: stats[stats['feature'] == x][ranking_method][0])
df = df.sort_values('sort_value', ascending=False).drop('sort_value', 1)
# discard rows according to max_rows
df = df[df['feature'].isin(df['feature'].unique()[:max_rows])]
return {
'data': df.to_json(orient='records'),
'stats': stats.to_json(orient='records')
......
......@@ -70,7 +70,7 @@ class StatisticTask(AnalyticTask):
# prepare the df in case an id exists in more than one subset
if len(subsets) < 2:
error = "Limma analysis requires at least " \
"two groups for comparison."
"two non-empty groups for comparison."
logger.error(error)
raise ValueError(error)
if df.shape[0] < 1 or df.shape[1] < 2:
......
"""This module provides tests for the heatmap analysis main module."""
import json
import pytest
import pandas as pd
import numpy as np
from fractalis.analytics.tasks.heatmap.main import HeatmapTask
......@@ -11,7 +14,7 @@ class TestHeatmap:
task = HeatmapTask()
def test_functional_1(self):
def test_functional(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
......@@ -24,43 +27,47 @@ class TestHeatmap:
categoricals=[],
ranking_method='B',
id_filter=[],
max_rows=100,
subsets=subsets)
assert 'data' in result
assert 'stats' in result
def test_main_raises_if_invalid_data(self):
def test_functional_with_nans_and_missing(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
[102, 'foo', 10],
[103, 'foo', float('nan')], [103, 'bar', 16],
[104, 'foo', 20], [104, 'bar', 21]],
columns=['id', 'feature', 'value'])
]
subsets = [[1, 2, 3, 4]] # does not match sample colnames
with pytest.raises(ValueError) as e:
self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='mean',
id_filter=[],
subsets=subsets)
assert 'subset sample ids do not match the data' in e
subsets = [[101, 102], [103, 104]]
result = self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='B',
id_filter=[],
max_rows=100,
subsets=subsets)
stats = json.loads(result['stats'])
assert stats[0] != stats[1]
def test_main_raises_if_invalid_subsets(self):
def test_main_raises_if_invalid_data(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
[104, 'foo', 20], [104, 'bar', 21]],
columns=['id', 'feature', 'value'])
]
subsets = [[101, 102, 103], [123]]
subsets = [[1, 2, 3, 4]] # does not match sample colnames
with pytest.raises(ValueError) as e:
self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=subsets)
assert 'specified subsets does not match' in e
assert 'data set is too small' in e
def test_empty_subset_equals_full_subset(self):
numerical_arrays = [
......@@ -74,6 +81,7 @@ class TestHeatmap:
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=[])
result_2 = self.task.main(numerical_arrays=numerical_arrays,
......@@ -81,5 +89,95 @@ class TestHeatmap:
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=[[101, 102, 103, 104]])
assert result_1 == result_2
def test_multiple_numerical_array_data(self):
    """main() must merge several numerical array frames into one result."""
    frame_a = pd.DataFrame(
        [[101, 'foo', 5], [101, 'bar', 6],
         [102, 'foo', 10], [102, 'bar', 11],
         [103, 'foo', 15], [103, 'bar', 16],
         [104, 'foo', 20], [104, 'bar', 21]],
        columns=['id', 'feature', 'value'])
    frame_b = pd.DataFrame(
        [[101, 'baz', 10], [102, 'baz', 11],
         [105, 'foo', 20], [105, 'baz', 21],
         [106, 'bar', 15]],
        columns=['id', 'feature', 'value'])
    # two groups whose samples are spread across both input frames
    groups = [[101, 102, 106], [103, 104, 105]]
    result = self.task.main(numerical_arrays=[frame_a, frame_b],
                            numericals=[],
                            categoricals=[],
                            ranking_method='B',
                            id_filter=[],
                            max_rows=100,
                            subsets=groups)
    assert 'data' in result
    assert 'stats' in result
def test_zscore_is_not_nan_if_data_misses_values(self):
    """z-scores stay finite even when some (id, feature) cells are absent."""
    frame_a = pd.DataFrame(
        [[101, 'foo', 5], [101, 'bar', 6],
         [102, 'foo', 10], [102, 'bar', 11],
         [103, 'foo', 15], [103, 'bar', 16],
         [104, 'foo', 20], [104, 'bar', 21]],
        columns=['id', 'feature', 'value'])
    # second frame is sparse: several id/feature combinations are missing
    frame_b = pd.DataFrame(
        [[101, 'baz', 10], [102, 'baz', 11],
         [105, 'foo', 20], [105, 'baz', 21],
         [106, 'bar', 15]],
        columns=['id', 'feature', 'value'])
    result = self.task.main(numerical_arrays=[frame_a, frame_b],
                            numericals=[],
                            categoricals=[],
                            ranking_method='B',
                            id_filter=[],
                            max_rows=100,
                            subsets=[[101, 102, 106], [103, 104, 105]])
    payload = pd.DataFrame(json.loads(result['data']))
    # np.min propagates NaN, so a single NaN z-score would fail this check
    assert not np.isnan(np.min(payload['zscore']))
def test_results_are_sorted(self):
    """Rows are ordered by the ranking statistic, best feature first."""
    array = pd.DataFrame(
        [[101, 'A', 5], [102, 'A', 5],
         [101, 'B', 2], [102, 'B', 2],
         [101, 'C', 8], [102, 'C', 8],
         [101, 'D', 10], [102, 'D', 10]],
        columns=['id', 'feature', 'value'])
    result = self.task.main(numerical_arrays=[array],
                            numericals=[],
                            categoricals=[],
                            ranking_method='mean',
                            id_filter=[],
                            max_rows=100,
                            subsets=[])
    payload = pd.DataFrame(json.loads(result['data']))
    # mean(D)=10 > mean(C)=8 > mean(A)=5 > mean(B)=2
    expected = ['D', 'D', 'C', 'C', 'A', 'A', 'B', 'B']
    assert payload['feature'].tolist() == expected
def test_max_rows_works(self):
    """max_rows truncates the output to the top-ranked features only."""
    array = pd.DataFrame(
        [[101, 'A', 5], [102, 'A', 5],
         [101, 'B', 2], [102, 'B', 2],
         [101, 'C', 8], [102, 'C', 8],
         [101, 'D', 10], [102, 'D', 10]],
        columns=['id', 'feature', 'value'])
    result = self.task.main(numerical_arrays=[array],
                            numericals=[],
                            categoricals=[],
                            ranking_method='mean',
                            id_filter=[],
                            max_rows=2,  # keep only the two best features
                            subsets=[])
    payload = pd.DataFrame(json.loads(result['data']))
    # D and C have the highest means, so only their rows may survive
    assert payload['feature'].tolist() == ['D', 'D', 'C', 'C']
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment