implemented limma fitting for heatmap analysis

f7878378 · Sascha Herzinger · 4147ecbb · f7878378 · f7878378
Commit f7878378 authored 7 years ago by Sascha Herzinger
--- a/fractalis/analytics/tasks/heatmap/main.py
+++ b/fractalis/analytics/tasks/heatmap/main.py
 """Module containing analysis code for heatmap analytics."""

+from copy import deepcopy
 from typing import List, TypeVar
 from functools import reduce

 import pandas as pd
 from scipy.stats import zscore
+from rpy2 import robjects as R
+from rpy2.robjects import r, pandas2ri
+from rpy2.robjects.packages import importr

 from fractalis.analytics.task import AnalyticTask


+importr('limma')
+pandas2ri.activate()
+
 T = TypeVar('T')


@@ -27,7 +34,7 @@ class HeatmapTask(AnalyticTask):
        df = df.drop('variable', axis=1)
        zscores = df.apply(zscore, axis=1)

-        #prepare output for front-end
+        # prepare output for front-end
        df = df.transpose()
        df.columns = variables
        df.index.name = 'id'
@@ -45,4 +52,52 @@ class HeatmapTask(AnalyticTask):

        return {
            'data': df.to_json(orient='index')
-        }
\ No newline at end of file
+        }
+
+    def getLimmaStats(self, df: pd.DataFrame, subsets: List[List[T]]):
+        # we consider all subset ids rather than df ids because an id might be
+        # in multiple subsets. In such a case we have to copy a row in the df.
+        flattened_subsets = [x for subset in subsets for x in subset]
+        df = df[flattened_subsets]
+        ids = list(df)
+
+        # creating the design vector according to the subsets
+        design_vector = [None] * len(ids)
+        subsets_copy = deepcopy(subsets)
+        for i, id in enumerate(ids):
+            for j, subset in enumerate(subsets_copy):
+                try:
+                    subset.index(id)  # raises an Exception if not found
+                    subset.remove(id)
+                    design_vector[i] = str(j + 1)
+                    break
+                except ValueError:
+                    assert j != len(subsets_copy) - 1
+
+        assert None not in design_vector
+
+        # create group names
+        groups = ['group{}'.format(i + 1) for i in list(range(len(subsets)))]
+
+        # create a string for each pairwise comparison of the groups
+        comparisons = []
+        for i in reversed(range(len(subsets))):
+            for j in range(i):
+                comparisons.append('group{}-group{}'.format(i+1, j+1))
+
+        # fitting according to limma doc Chapter 8: Linear Models Overview
+        r_form = R.Formula('~ 0+factor(c({}))'.format(','.join(design_vector)))
+        r_design = r['model.matrix'](r_form)
+        r_design.colnames = R.StrVector(groups)
+        r_data = pandas2ri.py2ri(df)
+        # the next two lines are necessary if column ids are not unique, because
+        # the python to r transformation drops those columns
+        r_ids = R.StrVector(['X{}'.format(id) for id in ids])
+        r_data = r_data.rx(r_ids)
+        r_fit = r['lmFit'](r_data, r_design)
+        r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
+        r_fit_2 = r['contrasts.fit'](r_fit, r_contrast_matrix)
+        r_fit_2 = r['eBayes'](r_fit_2)
+        r_results = r['topTable'](r_fit_2, number=float('inf'), sort='none')
+        results = pandas2ri.ri2py(r_results)
+        return results
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,8 @@ setup(
        'requests',
        'PyYAML',
        'pycrypto',
-        'pycryptodomex'
+        'pycryptodomex',
+        'rpy2'
    ],
    setup_requires=[
        'pytest-runner',