Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/kumc_etl.py @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 7.3 KB
Line 
1'''kumc_etl.py -- Extract/Transform/Load tasks for data from KUMC
2--------------------------------------------------------------------------
3
4Bulk Transfer Files
5**********************
6
7The i2b2 audit download date is taken from the BSR bulk transfer file
8from the start of the HeronLoad__ process:
9
10  >>> import heron_build
11  >>> options = _option  # un-hide for testing
12  >>> _, flat = heron_build.flat_options(
13  ...         read_config=heron_build.config_res(options.config))
14  >>> options.update(flat)
15  >>> len(options.bsr_dump) > 0
16  True
17
18And one of the UHC bulk transfer files:
19
20  >>> len(options.uhc_dump) > 0
21  True
22
23And the SS DMF bulk transfer file:
24
25  >>> len(options.ssdmf_dump) > 0
26  True
27
28And the tumor registry NAACCR extract file:
29
30  >>> len(options.naaccr_dump) > 0
31  True
32
33__ http://informatics.kumc.edu/work/wiki/HeronLoad
34
35
36.. todo:: check in BSR test data
37
38For production settings, see `heron-prod1.ini` and `heron-prod2.ini`.
39
40:copyright: Copyright 2010-2013 University of Kansas Medical Center
41            part of the `HERON open source codebase`__;
42            see NOTICE file for license details.
43
44__ http://informatics.kumc.edu/work/wiki/HERON
45'''
46
47from paver.easy import options as _option, task, needs
48
49
def _paver_import_work_around():
    '''Append the directory containing this file to `sys.path`.

    It is called once, immediately below, before the imports of
    `heron_build` and `i2b2_deid`.
    NOTE(review): presumably those sibling modules are not otherwise
    importable when paver loads this file -- confirm.
    '''
    import os
    import sys
    sys.path.append(os.path.dirname(__file__))
_paver_import_work_around()
54
55from heron_build import view_loader, curated_data, single_loader, run_scripts
56from i2b2_deid import do_deid_dimensions, do_deid_facts
57
# hide options a bit
# Declare the KUMC-specific option names up front, all unset (None).
# The module docstring shows the dump options being filled in from the
# configuration afterwards (`options.update(flat)`).
_option(bsr_source_desc=None,
        bsr_dump=None,
        uhc_dump=None,
        ssdmf_dump=None,
        naaccr_dump=None)
64
# Keyword arguments shared by every `view_loader` task in this file.
kumc_view_kwds = dict(server='id_server', section='id')

# Keyword arguments shared by every `single_loader` task in this file.
# `dump` always resolves to the BSR dump path, whatever the data source;
# the module docstring says the i2b2 audit download date is taken from
# the BSR bulk transfer file.
kumc_loader_kwds = dict(capture_release_list=['dump'],
                        dump=lambda options: options.path_for.bsr_dump)

# Dimension loaders: `postprocess` passes the single `job_id` it receives
# to `do_deid_dimensions` as a one-element list.
kumc_dim_loader_kwds = dict(
    postprocess=lambda options, job_id: do_deid_dimensions(options, [job_id]),
    **kumc_loader_kwds)

# Fact loaders: `postprocess` passes the single `up` value it receives to
# `do_deid_facts` as a one-element list, with `deid_instance_num=True`.
kumc_fact_loader_kwds = dict(
    postprocess=lambda options, up: do_deid_facts(
        options, [up], deid_instance_num=True),
    **kumc_loader_kwds)
77
78
@task
@view_loader('bsr_i2b2_transform.sql', **kumc_view_kwds)
def make_bsr_views(options):
    '''Set up views to transform data from the BSR.

    The work is delegated to `view_loader`, which is given the script
    `bsr_i2b2_transform.sql` plus the shared `kumc_view_kwds`; the
    function body itself is empty.

    `CREATE VIEW` and `SELECT ANY TABLE` privileges seem to be needed.
    '''
86
87
@task
@needs('dblink_id_deid',
       'make_bsr_views')
@single_loader(
    'load_bsr_dimensions',
    script='bsr_dimensions_load.sql',
    label='BSR mappings and dimensions',
    source_hint='BSR',
    **kumc_dim_loader_kwds)
def load_bsr_dimensions(options):
    '''Load BSR mappings and dimensions.

    Depends on the `dblink_id_deid` and `make_bsr_views` tasks.
    `single_loader` is given the script `bsr_dimensions_load.sql` and
    the dimension-loader keywords, whose `postprocess` hook calls
    `do_deid_dimensions`.  The function body itself is empty.
    '''
100
101
# NOTE(review): the name handed to `single_loader` is 'load_bsr_facts',
# not this function's name, and 'dblink_id_deid' is absent from `@needs`
# although the other fact loaders in this file list it -- confirm both
# are intended.
@task
@needs('make_bsr_views')
@single_loader(
    'load_bsr_facts',
    script='bsr_clinical_facts_load.sql',
    label='BSR clinical facts',
    source_hint='BSR',
    **kumc_fact_loader_kwds)
def load_bsr_clinical_facts(options):
    '''Load, de-id BSR clinical facts.

    Depends on the `make_bsr_views` task.  `single_loader` is given
    the script `bsr_clinical_facts_load.sql` and the fact-loader
    keywords, whose `postprocess` hook calls `do_deid_facts`.  The
    function body itself is empty.
    '''
113
114
@task
@needs('load_bsr_dimensions', 'load_bsr_clinical_facts')
def bsr_load():
    '''Convenience task for the whole BSR load.

    Has no work of its own: it only names `load_bsr_dimensions` and
    `load_bsr_clinical_facts` as prerequisites via `@needs`.
    '''
121
122
@task
@curated_data('deid_db', 'BSR_TERMS', 'curated_data/BSR_simple_ontology.csv')
def load_bsr_categories(options):
    '''Load curated BSR terms from `curated_data/BSR_simple_ontology.csv`.

    `curated_data` is also given `deid_db` and `BSR_TERMS`
    (presumably the target database option and table name -- confirm
    against `heron_build.curated_data`).  The function body itself is
    empty.
    '''
127
128
# NOTE(review): this SS DMF task lists 'make_bsr_views' (not an
# SSDMF-specific view task) as a prerequisite -- confirm that is intended.
@task
@needs('dblink_id_deid', 'make_bsr_views')
@single_loader(
    'load_ssdmf_facts',
    script='ssdmf_load.sql',
    label='SS DMF facts',
    source_hint='SSN',
    **kumc_fact_loader_kwds)
def ssdmf_load(options):
    '''Load SS DMF facts, dimensions; de-id facts.

    Depends on the `dblink_id_deid` and `make_bsr_views` tasks.
    `single_loader` is given the script `ssdmf_load.sql` and the
    fact-loader keywords, whose `postprocess` hook calls
    `do_deid_facts`.  The function body itself is empty.
    '''
140
141
@task
@curated_data('deid_db', 'SEER_SITE_TERMS',
              'curated_data/seer_recode_terms.csv')
def load_seer_terms(options):
    '''Load curated SEER terms from `curated_data/seer_recode_terms.csv`.

    `curated_data` is also given `deid_db` and `SEER_SITE_TERMS`
    (presumably the target database option and table name -- confirm
    against `heron_build.curated_data`).  The function body itself is
    empty.
    '''
147
148
@task
@needs('dblink_id_deid', 'load_seer_terms')
@view_loader(('naaccr_txform.sql', 'seer_recode.sql'),
             **kumc_view_kwds)
def make_tumor_views(options):
    '''Set up views to transform NAACCR data from the tumor registry.

    Depends on the `dblink_id_deid` and `load_seer_terms` tasks.
    `view_loader` is given two scripts, `naaccr_txform.sql` and
    `seer_recode.sql`, plus the shared `kumc_view_kwds`; the function
    body itself is empty.

    `CREATE VIEW` and `SELECT ANY TABLE` privileges seem to be needed.
    '''
158
159
@task
@needs('dblink_id_deid', 'make_tumor_views')
@single_loader(
    'load_tumor_facts',
    script='naaccr_facts_load.sql',
    label='Tumor Registry facts',
    source_hint='tumor_registry',
    **kumc_fact_loader_kwds
)
def load_tumor_facts(options):
    '''Load, de-id Tumor Registry facts.

    Depends on the `dblink_id_deid` and `make_tumor_views` tasks.
    `single_loader` is given the script `naaccr_facts_load.sql` and
    the fact-loader keywords, whose `postprocess` hook calls
    `do_deid_facts`.  The function body itself is empty.
    '''
172
173
@task
@needs('dblink_id_deid', 'load_seer_terms', 'make_tumor_views')
@run_scripts('naaccr_concepts_load.sql',
             capture_release_list=['db'],
             db=lambda options: options.id_db)
def load_tumor_concepts(options):
    '''Load concepts for Tumor Registry facts.

    Depends on the `dblink_id_deid`, `load_seer_terms` and
    `make_tumor_views` tasks.  `run_scripts` is given the script
    `naaccr_concepts_load.sql`, with `db` resolved from
    `options.id_db`.  The function body itself is empty.
    '''
182
183
@task
@needs('load_tumor_concepts')
@curated_data('deid_db', 'naaccr_shortcut_table',
              'curated_data/naaccr_shortcuts.csv')
@run_scripts('create_concept_shortcuts.sql',
             capture_release_list=['db'],
             db=lambda options: options.deid_db,
             variables=dict(shortcut_table_name='naaccr_shortcut_table',
                            # The backslash is spelled as an explicit
                            # escape: a bare backslash before "i" is an
                            # invalid escape sequence that newer Pythons
                            # warn about.  The string value is unchanged.
                            shortcut_prefix='\\i2b2',
                            source_cd='tumor_registry',
                            heron_terms='naaccr_ontology')
             )
def load_tumor_shortcuts(options):
    '''Load shortcut concepts for the Tumor Registry.

    Depends on the `load_tumor_concepts` task.  Two decorators are
    stacked: `curated_data` is given `curated_data/naaccr_shortcuts.csv`
    with `deid_db` and `naaccr_shortcut_table`, and `run_scripts` is
    given `create_concept_shortcuts.sql` with `db` resolved from
    `options.deid_db` and a `variables` dict that names the same
    shortcut table.  The function body itself is empty.
    '''
199
200
@task
@needs('dblink_id_deid', 'make_ncdr_constants')
@single_loader(
    'load_ncdr_facts',
    script='ncdr_facts_load.sql',
    label='NCDR facts',
    source_hint='NCDR',
    **kumc_fact_loader_kwds
)
def load_ncdr_facts(options):
    '''Load, de-id NCDR facts.

    Depends on the `dblink_id_deid` and `make_ncdr_constants` tasks.
    `single_loader` is given the script `ncdr_facts_load.sql` and the
    fact-loader keywords, whose `postprocess` hook calls
    `do_deid_facts`.  The function body itself is empty.
    '''
213
214
# NOTE(review): 'load_ncdr_manual_selections' is not defined in the part
# of this file shown here; presumably it is provided elsewhere -- confirm.
@task
@needs('dblink_id_deid', 'load_ncdr_manual_selections',
       'make_ncdr_constants')
@run_scripts('ncdr_concepts_load.sql',
             capture_release_list=['db'],
             db=lambda options: options.deid_db)
def load_ncdr_concepts(options):
    '''Load concepts for NCDR.

    Depends on the `dblink_id_deid`, `load_ncdr_manual_selections`
    and `make_ncdr_constants` tasks.  `run_scripts` is given the
    script `ncdr_concepts_load.sql`, with `db` resolved from
    `options.deid_db`.  The function body itself is empty.
    '''
224
225
# NOTE(review): 'load_ncdr_manual_selections' is not defined in the part
# of this file shown here; presumably it is provided elsewhere -- confirm.
@task
@needs('dblink_id_deid', 'load_ncdr_manual_selections')
@run_scripts('ncdr_constants.sql',
             capture_release_list=['db'],
             db=lambda options: options.deid_db)
def make_ncdr_constants(options):
    '''Create views with NCDR-specific constants defined.

    Depends on the `dblink_id_deid` and `load_ncdr_manual_selections`
    tasks.  `run_scripts` is given the script `ncdr_constants.sql`,
    with `db` resolved from `options.deid_db`.  The function body
    itself is empty.
    '''
234
235
@task
@view_loader('redcap_i2b2_transform.sql', **kumc_view_kwds)
def make_redcap_views(options):
    '''Set up views to transform REDCap data.

    The work is delegated to `view_loader`, which is given the script
    `redcap_i2b2_transform.sql` plus the shared `kumc_view_kwds`; the
    function body itself is empty.
    '''
241
242
@task
@needs('dblink_id_deid', 'make_redcap_views')
@single_loader(
    'load_redcap_clinical_facts',
    script='redcap_clinical_facts_load.sql',
    label='REDCap Clinical Facts',
    source_hint='REDCap',
    **kumc_fact_loader_kwds
)
def load_redcap_clinical_facts(options):
    '''Load, de-id REDCap clinical facts.

    Depends on the `dblink_id_deid` and `make_redcap_views` tasks.
    `single_loader` is given the script
    `redcap_clinical_facts_load.sql` and the fact-loader keywords,
    whose `postprocess` hook calls `do_deid_facts`.  The function body
    itself is empty.
    '''
255
256
@task
@view_loader('uhc_i2b2_transform.sql', **kumc_view_kwds)
def make_uhc_views(options):
    '''Set up views to transform UHC data.

    The work is delegated to `view_loader`, which is given the script
    `uhc_i2b2_transform.sql` plus the shared `kumc_view_kwds`; the
    function body itself is empty.

    .. todo:: better diagnostic when `CREATE VIEW` or `SELECT ANY TABLE`
              privileges are missing.
    '''
265
266
@task
@curated_data('id_db', 'risktypelabels',
              'curated_data/risk_type_labels.csv')
def load_risk_type_curated_data(options):
    '''Load curated data on risk type and expected length of stay.

    The data file is `curated_data/risk_type_labels.csv`;
    `curated_data` is also given `id_db` and `risktypelabels`
    (presumably the target database option and table name -- confirm
    against `heron_build.curated_data`).  The function body itself is
    empty.
    '''
274
275
@task
@needs('dblink_id_deid', 'make_uhc_views')
@single_loader(
    'load_uhc_clinical_facts',
    script='uhc_clinical_facts_load.sql',
    label='UHC clinical facts',
    source_hint='UHC',
    **kumc_fact_loader_kwds)
def load_uhc_clinical_facts(options):
    '''Load, de-id UHC clinical facts.

    Depends on the `dblink_id_deid` and `make_uhc_views` tasks.
    `single_loader` is given the script `uhc_clinical_facts_load.sql`
    and the fact-loader keywords, whose `postprocess` hook calls
    `do_deid_facts`.  The function body itself is empty.
    '''
Note: See TracBrowser for help on using the repository browser.