Skip to content
Snippets Groups Projects
Commit 6cbd51bd authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

added id filter to pca

parent 75a99bbc
No related branches found
No related tags found
No related merge requests found
Pipeline #
......@@ -39,6 +39,10 @@ class PCATask(AnalyticTask):
df = df.pivot(index='feature', columns='id', values='value')
df = df.T
# apply id filter
if id_filter:
df = df[df.index.isin(id_filter)]
# save ids so we can re-assign them after pca
ids = df.index.tolist()
......
......@@ -2,10 +2,11 @@
import logging
import requests
from pandas import DataFrame
from fractalis.data.etl import ETL
from fractalis.data.etls.transmart.shared import extract_data
from fractalis.data.etls.transmart import observations_pb2
logger = logging.getLogger(__name__)
......@@ -21,7 +22,30 @@ class HighdimETL(ETL):
return handler == 'transmart' and descriptor['data_type'] == 'highdim'
def extract(self, server: str, token: str, descriptor: dict) -> dict:
    """Fetch high-dimensional observation data from a tranSMART v2 server.

    :param server: Base URL of the tranSMART instance.
    :param token: OAuth2 bearer token used for the Authorization header.
    :param descriptor: Data descriptor; ``descriptor['path']`` names the
        concept whose observations are requested.
    :return: Parsed observation data. NOTE(review): parsing is not
        implemented yet -- see the TODO below, so nothing is returned
        at the moment.
    :raises ValueError: If the server does not answer with HTTP 200, or
        if the response payload cannot be parsed.
    """
    # NOTE(review): the previous first statement was
    # ``return extract_data(server=server, descriptor=descriptor,
    # token=token)``, which made every line below unreachable. It was
    # stale diff residue from before this method did its own request,
    # so it has been removed.
    r = requests.get(url='{}/v2/observations'.format(server),
                     params={
                         # Constraint is a JSON string; the doubled braces
                         # survive str.format and become literal braces.
                         'constraint': '{{"type": "concept","path": "{}"}}'
                                       ''.format(descriptor["path"]),
                         'projection': 'log_intensity',
                         'type': 'autodetect'
                     },
                     headers={
                         'Accept': 'application/x-protobuf',
                         'Authorization': 'Bearer {}'.format(token)
                     },
                     timeout=2000)
    if r.status_code != 200:
        error = "Data extraction failed. Target server responded with " \
                "status code {}.".format(r.status_code)
        logger.error(error)
        raise ValueError(error)
    try:
        pass  # TODO: decode the protobuf payload (observations_pb2)
    except Exception as e:
        # Any parsing failure is surfaced as a ValueError so callers can
        # treat extraction problems uniformly.
        logger.exception(e)
        raise ValueError("Data extraction failed. "
                         "Got unexpected data format.")
def transform(self, raw_data: dict, descriptor: dict) -> DataFrame:
rows = []
......
This diff is collapsed.
......@@ -45,3 +45,20 @@ class TestPCATask:
assert data['id'].tolist() == [101, 102, 103, 104, 105]
assert data['subset'].unique().tolist() == [0]
assert data['category'].unique().tolist() == ['a', None]
def test_id_filter_works(self):
    """main() must restrict its output to the ids given via id_filter."""
    # Two features ('foo' and 'bar') measured on four samples; the
    # values follow the pattern base = 5 * (sample - 100), bar = base+1.
    rows = []
    for sample in (101, 102, 103, 104):
        base = 5 * (sample - 100)
        rows.append([sample, 'foo', base])
        rows.append([sample, 'bar', base + 1])
    feature_frames = [pd.DataFrame(rows, columns=['id', 'feature', 'value'])]
    result = self.task.main(features=feature_frames,
                            categories=[],
                            n_components=2,
                            whiten=False,
                            id_filter=[101, 104],
                            subsets=[])
    data = pd.read_json(result['data'])
    assert data['id'].unique().tolist() == [101, 104]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment