Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/heron_create.py @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 11.2 KB
Line 
1'''heron_create -- create i2b2 star schemas
2-------------------------------------------
3
4The :func:`heron_init` task builds two repositories:
5  1. the identified repository, containing the raw data, and
6  2. the de-identified repositiory
7
8Each repository is an i2b2__ datamart (per *Clinical Research Chart
9(CRC) Design Document* v1.4).
10
11__ https://www.i2b2.org/
12
13Database initialization scripts from i2b2 sources are used for this process::
14
15   >>> options = _option  # un-hide for testing
16   >>> options.i2b2_source
17   'mock_i2b2_source'
18
19We build a number of i2b2 projects for integrating REDCap as well::
20
21   >>> options.redcap_project_qty
22   '20'
23
24
25Tablespaces
26***********
27
28Schemas are created with default tablespaces, and indexes
29are relocated to a configurable tablespace::
30
31  >>> (options.id_tablespace, options.id_index_tablespace)
32  ('USERS', 'USERS')
33  >>> (options.deid_tablespace, options.deid_index_tablespace)
34  ('USERS', 'USERS')
35
36
37:copyright: Copyright 2010-2013 University of Kansas Medical Center
38            part of the `HERON open source codebase`__;
39            see NOTICE file for license details.
40
41__ http://informatics.kumc.edu/work/wiki/HERON
42'''
43
44from paver.easy import task, needs
45# TODO: use structured_logging rathr than paver_env.info
46from paver.tasks import environment as paver_env
47
48
49from db_util import run_script
50from heron_build import ID_SCHEMA, DEID_SCHEMA
51import db_util
52
53
54@task
55def create_etl_accts(options,
56                     script='create_etl_acct.sql'):
57    # Due to use of links, username and password must be the same
58    # across id, deid.
59    name, password = options.id_username, options.env_for.id_password_env
60
61    run_script(_dba(options.id_server), fn=script,
62               variables=dict(name=name,
63                              passwd=password,
64                              tspace=options.id_tablespace))
65    run_script(_dba(options.deid_server), fn=script,
66               variables=dict(name=name,
67                              passwd=password,
68                              tspace=options.deid_tablespace))
69
70
71def _dba(server,
72         section='DBA'):
73    return server.account(section)
74
75
76@task
77def create_id_schema(options):
78    '''Create schema for identified star schema.
79
80    Note Well: Any existing schema named NightHeronData is destroyed.
81    .. todo:: consider checking  that there's no valuable data
82
83    .. todo:: don't log password
84
85    See also `create_datamart_schema.sql`.
86
87    '''
88    _create_star_schema(_dba(options.id_server),
89                        ID_SCHEMA, options.id_tablespace,
90                        options.rng)
91
92
93def _create_star_schema(conn, name, tspace, rng,
94                        script='create_datamart_schema.sql'):
95    run_script(conn, fn=script,
96               variables=dict(name=name,
97                              passwd=db_util.genpw(rng),
98                              tspace=tspace))
99
100
101@task
102def create_deid_schemas(options):
103    '''Create schemas for de-identified datamart.
104
105    Note Well: Any existing schema is destroyed.
106    .. todo:: consider checking  that there's no valuable data
107
108    .. todo:: don't log password
109    '''
110    _create_star_schema(_dba(options.deid_server),
111                        DEID_SCHEMA,
112                        options.deid_tablespace, options.rng)
113    _create_star_schema(_dba(options.deid_server),
114                        meta(DEID_SCHEMA),
115                        options.deid_tablespace, options.rng)
116
117
118def meta(schema):
119    '''
120    >>> meta(ID_SCHEMA)
121    'NIGHTHERONMETADATA'
122    >>> meta(DEID_SCHEMA)
123    'BLUEHERONMETADATA'
124    '''
125    return schema.split('DATA')[0] + 'METADATA'
126
127
128@task
129@needs('create_id_schema')
130def create_id_datamart(options,
131                       exc_init='exc_init.sql'):
132    '''Create i2b2 datamart tables for identified repository.
133    '''
134    with options.id_server.granted_db(_dba(options.id_server),
135                                      ID_SCHEMA, options.rng) as idc:
136        _create_datamart(options, _dba(options.id_server),
137                         idc, ID_SCHEMA, options.id_index_tablespace)
138        _update_datamart(idc, ID_SCHEMA)
139    run_script(options.id_db, fn=exc_init)
140
141
142def without_text_index(script, script_code,
143                       removeText=("CREATE INDEX OF_CTX_BLOB"
144                                   " ON OBSERVATION_FACT(OBSERVATION_BLOB)"
145                                   " INDEXTYPE IS CTXSYS.CONTEXT"
146                                   "\n PARAMETERS ('SYNC (on commit)')"
147                                   "\n ;")):
148    '''Remove use of Oracle Text from script
149
150    We don't use the large text search i2b2 feature. So to avoid installing
151    Oracle Text (which is required for the search feature), modifiy the
152    source script to remove the index that depends on Oracle Text.
153
154    TODO: make this rather a feature of run_script
155    '''
156    paver_env.info('Remove Oracle Text related code from %s' % script)
157    return script_code.replace(removeText, '')
158
159
160def _create_datamart(options, dbaconn, appcon, schema, idx_tablespace,
161                     scripts=(
162        'crc_create_datamart_oracle.sql',
163        'crc_create_uploader_oracle.sql')):
164    '''Create datamart.
165    '''
166    for script in scripts:
167        scriptPath = _i2b2_script(options.path_for.i2b2_db_build_source,
168                                  script)
169        run_script(appcon, fn=script,
170                   script=without_text_index(script, scriptPath.bytes()))
171        _move_indexes(dbaconn, script, schema, idx_tablespace)
172
173    with appcon.transaction('$import(source_master)') as work:
174        # standard date syntax
175        work.execute(
176            "alter session set NLS_DATE_FORMAT = 'YYYY-MM-DD HH24:MI'")
177        db_util.import_csv(work, 'curated_data/source_master.csv',
178                           'source_master', create=False)
179
180
181def _i2b2_script(i2b2dir, n, dirs='Crcdata'):
182    def _append_dirs(base, ds):
183        for d in ds:
184            base = base / d
185        return base
186
187    return (_append_dirs(i2b2dir, (sd for sd in dirs.split('/')))
188            / 'scripts' / n)
189
190
191def _move_indexes(conn, script, schema, dest):
192    with conn.transaction(script='$move_indexes/' + script) as admin:
193        for lnum, _c, statement in db_util.iter_statement(script):
194            if statement.upper().startswith('CREATE INDEX'):
195                n = statement.split()[2]
196                admin.execute('alter index %s.%s rebuild tablespace %s' % (
197                    schema, n, dest), line=lnum)
198
199
200def _update_datamart(conn, schema,
201                     script='update_datamart.sql'):
202    run_script(conn, fn=script, variables=dict(schema=schema))
203
204
205@task
206@needs('create_deid_schemas')
207def create_deid_datamart(options):
208    '''Create i2b2 datamart tables for de-identified repository.
209
210    .. todo:: DEID_SCHEMA should have CREATE SESSION privilege after all.
211    '''
212    with options.deid_server.granted_db(_dba(options.deid_server),
213                                        DEID_SCHEMA, options.rng) as dc:
214        _create_datamart(options, options.deid_db, dc, DEID_SCHEMA,
215                         options.deid_index_tablespace)
216        _update_datamart(dc, DEID_SCHEMA)
217
218
219def _add_qt_tables(server, schema, options,
220                   script='crc_create_query_oracle.sql'):
221    '''Add QT tables to repository.
222    '''
223    with server.granted_db(_dba(server),
224                           schema, options.rng) as dc:
225        scriptPath = _i2b2_script(options.path_for.i2b2_db_build_source,
226                                  script)
227        run_script(dc, fn=script, script=scriptPath.bytes())
228
229
230@task
231def add_deid_qt_tables(options):
232    '''Add QT tables to de-identified repository.
233
234    .. todo:: DEID_SCHEMA should have CREATE SESSION privilege after all.
235    '''
236    _add_qt_tables(options.deid_server, DEID_SCHEMA, options)
237
238
239@task
240def add_id_qt_tables(options):
241    '''Add QT tables identified repository.
242
243    .. todo:: ID_SCHEMA should have CREATE SESSION privilege after all.
244    '''
245    _add_qt_tables(options.id_server, ID_SCHEMA, options)
246
247
248@task
249@needs('create_deid_datamart')
250def create_deid_metadata(options,
251                         scripts=(('Metadata',
252                                   'create_oracle_i2b2metadata_tables.sql'),
253                                  ('Metadata/demo', "schemes_insert_data.sql"),
254                                  ('Metadata/demo',
255                                   "table_access_insert_data.sql")),
256                         metadata_init_script='metadata_init.sql'):
257    '''Create i2b2 metadata tables for de-identified repository.
258    '''
259    with options.deid_server.granted_db(_dba(options.deid_server),
260                                        meta(DEID_SCHEMA), options.rng) as dc:
261        for d, script in scripts:
262            scriptPath = _i2b2_script(options.path_for.i2b2_db_build_source,
263                                      script, d)
264            run_script(dc, fn=script, script=scriptPath.bytes())
265
266        with dc.transaction('$import(schemes)') as work:
267            db_util.import_csv(work, 'curated_data/schemes.csv',
268                               'schemes', create=False)
269
270        run_script(options.deid_db, fn=metadata_init_script)
271
272
273@task
274def create_redcap_projects(options,
275                           mgr_script='create_project_mgr.sql',
276                           schema_script='create_redcap_project_schemas.sql'):
277    '''Create REDCap i2b2 metadata schemas in the de-identified repository.
278    Insert data into pm tables and hive tables to set up the REDCap projects.
279
280    The `project_id` follows the `REDCap_%` pattern expected by HERON
281    admin (`i2b2pm.py`); likewise with null `project_description`s.
282    '''
283    paver_env.info('Creating project manager user')
284    run_script(_dba(options.deid_server), fn=mgr_script,
285               variables=dict(passwd=db_util.genpw(options.rng)))
286    for i in range(int(options.redcap_project_qty)):
287        pid = i + 1
288        pname = 'REDCap i2b2 project %d' % pid
289        paver_env.info('Proj id: %s and Proj name %s', pid, pname)
290
291        rc_schema = "REDCAPMETADATA%s" % pid
292        _create_star_schema(_dba(options.deid_server),
293                            rc_schema,
294                            options.deid_tablespace, options.rng)
295
296        run_script(_dba(options.deid_server), fn=schema_script,
297                   params=dict(pid=pid,
298                               pname=pname,
299                               project_id='REDCap_%s' % pid),
300                   variables=dict(rc_schema=rc_schema,
301                                  passwd=db_util.genpw(options.rng)))
302
303
304@task
305def etl_tests_init(options, test_init_script='etl_tests_init.sql'):
306    ''' Initialize ETL tests
307    '''
308    run_script(options.id_db, fn=test_init_script)
309    run_script(options.deid_db, fn=test_init_script)
310
311
312@task
313@needs('create_id_datamart', 'create_deid_datamart',
314       'add_id_qt_tables', 'add_deid_qt_tables',
315       'create_deid_metadata', 'create_redcap_projects', 'etl_tests_init')
316def heron_init():
317    '''Create id, deid schemas and datamarts.
318
319    See :func:create_id_datamart, :func:create_deid_datamart for details.
320
321    '''
322    pass
323
324
325##############
326# Trusted code
327
328from paver.easy import options as _option
329
330# TODO: Consider relying on config file for these.
331_option(
332    redcap_project_qty='20',
333    id_tablespace='USERS',  # oracle default
334    id_index_tablespace='USERS',
335    deid_tablespace='USERS',
336    deid_index_tablespace='USERS')
Note: See TracBrowser for help on using the repository browser.