Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/redcap_i2b2_transform.sql @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 14.7 KB
Line 
1/* redcap_i2b2_transform -- transform REDCap tables to i2b2 star schema shape
2
3Copyright (c) 2012 University of Kansas Medical Center
4part of the HERON* open source codebase; see NOTICE file for license details.
5* http://informatics.kumc.edu/work/wiki/HERON
6
7-- Based on heron_load/bsr_i2b2_transform.sql
8
9   Hmm... In Epic, UHC, etc., the fact views are in the view of the user
10   who is running the ETL job. That seems sorta goofy, but I think I
11   did it so that developers don't clobber each other's work.
12   (I (Dan) think there's a ticket about this, but I can't find it just now.)
13*/
14
15-- Test  that we are in KUMC SID
16select record_id from redcap.redcap_data where 1=0;
17
18
19/* Create Patient_Dimension view */
20create or replace view redcap.patient_dimension as
21SELECT 
22  distinct TO_NUMBER(rrd.value) as MRN
23  , mod(ora_hash(value), &&heron_etl_chunks)+1 as part
24    FROM redcap.redcap_data rrd
25    join redcap.redcap_mrn rrm
26    on rrm.project_id = rrd.project_id
27    and rrm.mrn_field = rrd.field_name
28where regexp_like(rrd.value,'^[[:digit:]]+$');
29
30
31/* Verify that one record does not have TWO MRNs #1668 */
32select case when count(*) > 0 then 1/0 else 1 end max_one_mrn_per_record from (
33
34select dups.*, d.value from (
35select count(*) qty, project_id, record, field_name
36from
37  (SELECT distinct rrd.value as MRN, rrd.record, rrd.field_name, rrd.project_id
38    FROM redcap.redcap_data rrd
39    join redcap.redcap_mrn rrm
40    on rrm.project_id = rrd.project_id
41    and rrm.mrn_field = rrd.field_name
42where regexp_like(rrd.value,'^[[:digit:]]+$')
43    )
44  group by record, project_id, field_name
45  having count(MRN) > 1
46  ) dups
47  join redcap.redcap_data d
48  on dups.project_id = d.project_id
49  and dups.field_name = d.field_name
50  and dups.record = d.record
51  order by dups.project_id, dups.record
52
53);
54
55/* Verify that two records with the SAME MRN are not included #1668*/
56select case when max(count(record))>1 then 1/0 else 1 end
57from
58  (SELECT  distinct rrd.value as MRN, rrd.record, rrd.project_id
59    FROM redcap.redcap_data rrd
60    join redcap.redcap_mrn rrm
61    on rrm.project_id = rrd.project_id
62    and rrm.mrn_field = rrd.field_name
63where regexp_like(rrd.value,'^[[:digit:]]+$')
64    )
65  group by MRN, project_id;
66 
67/* Verify that one record does not have two entries for MRN #1668*/
68select case when max(count(record))>1 then 1/0 else 1 end
69from
70  (SELECT  rrd.value as MRN, rrd.record, rrd.project_id
71    FROM redcap.redcap_data rrd
72    join redcap.redcap_mrn rrm
73    on rrm.project_id = rrd.project_id
74    and rrm.mrn_field = rrd.field_name
75where regexp_like(rrd.value,'^[[:digit:]]+$')
76    )
77  group by MRN, project_id;
78
79
80
81/* Table which combines redcap_events, redcap_log_event,
82   redcap_data and redcap_metadata
83*/
84whenever sqlerror continue;
85drop table REDCAP.REDCAP_FACTS;
86whenever sqlerror exit;
87
88CREATE table REDCAP.REDCAP_FACTS
89AS
90  SELECT record_id,
91  rda.event_id,
92  rda.record,
93  rda.value,
94  rlv.start_timestamp,
95  rlv.end_timestamp,
96  rda.project_id,
97  rda.field_name,
98  rmd.FIELD_PHI,
99  rmd.FORM_NAME,
100  rmd.FORM_MENU_DESCRIPTION,
101  rmd.FIELD_ORDER,
102  rmd.FIELD_UNITS,
103  rmd.ELEMENT_PRECEDING_HEADER,
104  rmd.ELEMENT_TYPE,
105  NVL(rmd.ELEMENT_LABEL, rmd.field_name) element_label,
106  rmd.ELEMENT_ENUM,
107  rmd.ELEMENT_NOTE,
108  rmd.ELEMENT_VALIDATION_TYPE,
109  rmd.ELEMENT_VALIDATION_MIN,
110  rmd.ELEMENT_VALIDATION_MAX,
111  rmd.ELEMENT_VALIDATION_CHECKTYPE,
112  rmd.BRANCHING_LOGIC,
113  rmd.FIELD_REQ,
114  rmd.EDOC_ID,
115  rmd.EDOC_DISPLAY_IMG,
116  rmd.CUSTOM_ALIGNMENT,
117  rmd.STOP_ACTIONS,
118  rmd.QUESTION_NUM,
119  -- To accomodate SQL fields
120  nvl(rms.option_id, rmd.OPTION_ID) option_id,
121  nvl(rms.option_desc, rmd.OPTION_DESC) option_desc,
122  pn.patient_num,
123  rre.arm_id
124    FROM redcap.redcap_data rda
125    JOIN
126    -- MRNs for observations
127     (SELECT  rrd.value as patient_num, rrd.record, rrd.project_id
128     FROM redcap.redcap_data rrd
129     join redcap.redcap_mrn rrm
130     on rrm.project_id=rrd.project_id
131     and rrm.mrn_field=rrd.field_name) pn
132    on rda.record = pn.record AND rda.project_id=pn.project_id
133    JOIN
134    -- Arm_id
135      redcap.redcap_events rre
136      ON rre.project_id=rda.project_id AND rre.event_id=rda.event_id
137    LEFT JOIN
138    -- Timestamps for observations
139      (SELECT to_date(MIN(ts),'YYYYMMDDHH24MISS') AS start_timestamp,
140      to_date(MAX(ts),'YYYYMMDDHH24MISS') AS end_timestamp,
141      object_type, event_id, pk, project_id
142      FROM redcap.redcap_log_event rlv
143      -- To include records that have been imported #1668
144      WHERE event IN ('INSERT', 'UPDATE') AND object_type='redcap_data'
145      GROUP BY object_type, event_id, pk, project_id
146      ) rlv
147      ON rlv.project_id=rda.project_id
148      AND rlv.pk=rda.record
149      AND rlv.event_id=rda.event_id
150    LEFT JOIN
151      redcap.redcap_metadata rmd
152      ON rda.field_name=rmd.field_name AND rda.project_id=rmd.project_id
153    LEFT JOIN
154      redcap.redcap_metadata_sql rms
155      ON rda.field_name=rms.field_name AND rda.project_id=rms.project_id
156  WHERE
157  (
158    (rmd.element_type IN ('radio','select','checkbox')
159    AND TO_CHAR(rmd.option_id)=rda.value)
160  OR
161    (rmd.element_type='sql'
162    and rda.value=rms.option_id)
163  OR
164   (rmd.element_type='yesno' AND rda.value IN ('1','0'))
165  OR
166    (rmd.element_type='text'
167    AND rmd.element_validation_type IN ('date_dmy','date_mdy','date_ymd')
168    AND regexp_like(rda.value,'^[0-9]{4}-[0-9]{2}-[0-9]{2}$'))
169  OR
170    (rmd.element_type='text'
171    AND rmd.element_validation_type IN ('float', 'int')
172     -- http://psoug.org/reference/regexp.html
173    AND regexp_like(rda.value,'^[[:digit:]]+$'))
174  )
175  AND rmd.field_phi IS NULL
176  -- To exclude redcap records without any time stamp attached to them
177  AND rlv.START_TIMESTAMP IS NOT NULL;
178
179/*To make sure there are no projects & records with no log events*/
180select
181case when count(*) > 0 then 1/0 else 1 end
182from (
183select * from redcap.redcap_facts
184where START_TIMESTAMP is NULL
185or end_timestamp is null);
186               
187/*To make sure the options are rightly linked to the values*/ 
188select
189case when counting! = 0 then 1/0 else 1 end
190from
191(select count(*) as counting from  REDCAP.REDCAP_FACTS
192where element_type  IN ('radio','select','checkbox')
193AND TO_CHAR(option_id)!=value);
194
195/*To make sure yes and no are properly assigned between field changes*/
196select
197case when counting >2 then 1/0 else 1 end
198from (
199select count(*) counting from (
200select distinct value from REDCAP.REDCAP_FACTS
201where element_type = 'yesno' )
202)
203;
204
205 select to_date(value, 'YYYY-MM-DD') from redcap.redcap_facts
206 where element_type='text'and element_validation_type in ('date_mdy','date_ymd')
207 and regexp_like(value,'^[0-9]{4}-[0-9]{2}-[0-9]{2}$')
208 ;
209
210
211/*
212  Visits view
213  Using a combo of project_id and event and MRN as encounter_ide
214  Also testing to avoid multiple records with same MRN that can clobber this
215*/
216create or replace view redcap.redcap_visit_dimension as
217select distinct 'RC_'||project_id||'_'||event_id||'_'||patient_num encounter_ide
218  , 'A' as encounter_ide_status
219  , rf.patient_num
220  , rf.start_timestamp start_date
221  , rf.end_timestamp end_date
222  , 1 as part
223from redcap.redcap_facts rf
224where patient_num IS NOT NULL
225;
226
227
228select * from redcap.redcap_visit_dimension;
229
230/* To check if there are two redcap records with the same MRN
231-- This test will not be valid for multiple projects
232select case
233when max(count(patient_num))>1 then 1/0
234else 1
235end
236from redcap.redcap_visit_dimension group by patient_num;
237*/
238
239
240/*
241TODO: Is creating a view from a view advisable? We do not need to do it if
242we can figure out how to deal with all the field values.
243TODO: To accomodate multiple events
244*/
245create or replace view redcap.observation_fact_redcap as
246select
247  end_date
248, start_date
249, encounter_ide
250, concept_cd
251, concept_cd2
252/*, case when length(concept_cd)>50
253    then substr(concept_cd, 0, instr(concept_cd, '|', 1)-1)||'|'||ora_hash(substr(concept_cd, instr(concept_cd, '|', 1)+1))
254    else concept_cd
255  end concept_cd*/
256, valtype_cd
257, tval_char
258, nval_num
259, patient_ide
260, part
261, instance_num
262, '@' provider_id
263, '@' modifier_cd
264, null valueflag_cd
265, null units_cd
266, null location_cd
267, to_number(null) confidence_num
268, to_date(null) update_date
269from
270(
271
272/* radio','select','checkbox', 'sql' fields */
273select
274 end_timestamp end_date
275, start_timestamp start_date
276, 'RC_'||project_id||'_'||event_id||'_'||patient_num as encounter_ide
277, 'KUMCRC'||project_id||'|'||
278  -- kumcrc|+pid+|+field_name+:+option_id<50; 7+5+1+field_name+1+option_id <50; field_name+option_id <36
279    case when length('KUMCRC'||project_id||'|'||field_name||':'||option_id) > 50 and length(field_name)>18
280      then substr(field_name, 1, 3) || '*' || to_char(ora_hash(field_name))
281      else field_name end || ':' ||
282    case when length('KUMCRC'||project_id||'|'||field_name||':'||option_id) > 50 and length(option_id) > 18
283      then substr(option_id, 1, 3) || '*' || to_char(ora_hash(option_id))
284    else option_id end concept_cd
285, 'KUMCRC'||project_id||'|'||field_name||':'||option_id concept_cd2
286, '@' valtype_cd
287, '@' tval_char
288, to_number(null) nval_num
289, patient_num patient_ide
290, mod(ora_hash(patient_num), &&heron_etl_chunks)+1 as part
291, rownum instance_num
292from redcap.redcap_facts
293where element_type IN ('radio','select','checkbox','sql')
294
295UNION ALL
296
297/* Date fields */
298select
299 to_date(value,'YYYY-MM-DD') end_date
300, to_date(value,'YYYY-MM-DD') start_date
301, 'RC_'||project_id||'_'||event_id||'_'||patient_num as encounter_ide
302-- Decode seems unnecessary
303, 'KUMCRC'||project_id||'|'||
304    case when length('KUMCRC'||project_id||'|'||field_name||':') > 50
305      then substr(field_name, 1, 3) || '*' || to_char(ora_hash(field_name))
306      else field_name end || ':' concept_cd   
307, 'KUMCRC'||project_id||'|'||field_name||decode(option_id, '0', ':',':'||option_id) concept_cd2
308, 'D' valtype_cd
309, value tval_char -- For a date field
310, to_number(null) nval_num
311, patient_num patient_ide
312, mod(ora_hash(patient_num), &&heron_etl_chunks)+1 as part
313, rownum instance_num
314from redcap.redcap_facts
315where element_type='text'
316and element_validation_type in ('date_dmy','date_mdy','date_ymd')
317and regexp_like(value,'^[0-9]{4}-[0-9]{2}-[0-9]{2}$')
318
319UNION ALL
320
321/*Number fields*/
322select
323 start_timestamp end_date
324, end_timestamp start_date
325, 'RC_'||project_id||'_'||event_id||'_'||patient_num as encounter_ide
326-- Decode seems unnecessary
327, 'KUMCRC'||project_id||'|'||
328    case when length('KUMCRC'||project_id||'|'||field_name||':') > 50
329      then substr(field_name, 1, 3) || '*' || to_char(ora_hash(field_name))
330      else field_name end || ':' concept_cd   
331, 'KUMCRC'||project_id||'|'||field_name||decode(option_id, '0', ':',':'||option_id) concept_cd2
332, 'N' valtype_cd
333, 'E' tval_char
334, TO_NUMBER(value) nval_num
335, patient_num patient_ide
336, mod(ora_hash(patient_num), &&heron_etl_chunks)+1 as part
337, rownum instance_num
338from redcap.redcap_facts
339where element_type='text'
340and element_validation_type in ('float', 'int')
341and regexp_like(value,'^[[:digit:]]+$')
342
343UNION ALL
344
345/* YES or NO fields */
346select
347 end_timestamp end_date
348, start_timestamp start_date
349, 'RC_'||project_id||'_'||event_id||'_'||patient_num as encounter_ide
350, 'KUMCRC'||project_id||'|'||
351    case when length('KUMCRC'||project_id||'|'||field_name||':') > 50
352      then substr(field_name, 1, 3) || '*' || to_char(ora_hash(field_name))
353      else field_name end || ':' ||value concept_cd   
354, 'KUMCRC'||project_id||'|'||field_name||':'||value concept_cd2
355, '@' valtype_cd
356, '@' tval_char
357, to_number(null) nval_num
358, patient_num patient_ide
359, mod(ora_hash(patient_num), &&heron_etl_chunks)+1 as part
360, rownum instance_num
361from redcap.redcap_facts
362where element_type IN ('yesno')
363
364) rdf
365where patient_ide is not null
366-- To make sure we do not consider records which do not have an MRN
367;
368
369-- Falls over when length of concept_cd goes over 50
370select case when  length(concept_cd)>50 then 1/0 else 1 end
371from redcap.observation_fact_redcap;
372
373
374-- eyeball it:
375-- select * from redcap.observation_fact_redcap;
376
377/* 
378select count(*) from redcap.observation_fact_redcap;
379
380select * from redcap.observation_fact_redcap rfo
381join
382redcap_concepts rc
383on rc.concept_cd=rfo.concept_cd;
384*/
385
386/*
387select * from redcap.redcap_facts where element_type NOT  IN ('radio','select','checkbox')  ;
388-- 263 are not radio, select or checkbox
389-- 1583 are the total number of rows
390-- 1320 are the ones that are radio, select or checkbox
391select
392value,
393case element_validation_type
394when 'date_mdy' then to_date(value,'YYYY-DD-MM')
395when 'date_ymd' then to_date(value,'YYYY-DD-MM')
396END val
397from redcap.redcap_facts
398where
399element_type='text' and element_validation_type IN ('date_mdy', 'date_ymd')
400and value is not null
401;
402
403select
404to_date(value,'YYYY-MM-DD') valu
405, value from redcap.redcap_facts
406where element_type='text' and element_validation_type in ('date_mdy','date_ymd');
407
408
409select to_date(value,'YYYY-MM-DD'), value from redcap.redcap_facts
410where element_type='text' and element_validation_type='date_ymd';
411
412select to_date(value,'YYYY-MM-DD'), value from redcap.redcap_facts
413where element_type='text' and element_validation_type='date_mdy';
414
415select distinct element_validation_type
416from redcap.redcap_facts where element_type='text' and element_validation_type IS NOT NULL;
417
418select value, field_name from redcap.redcap_facts
419where element_type='text' and element_validation_type='float';
420
421select is_number('1') from dual;
422
423select * from redcap.REDCAP_FACTS;
424select * from redcap.redcap_metadata_orig;
425*/
426
427
428
429/*select 1 as col1 from dual where
430regexp_like('1234566790870','^[[:digit:]]+$');
431
432select 1 as col1 from dual where
433regexp_like('1234-56-67','^[0-9]{4}-[0-9]{2}-[0-9]{2}$');
434*/
435
436
437/*
438-- Function to verify that the data is a number as it supposed to be
439-- This function is needed as REDCap doesn't completely enforce data validation sometimes
440-- Cribbed from : http://stackoverflow.com/questions/5082176/check-if-its-a-number-function-in-oracle
441-- 1 means param is a number, 0 means param is not a number
442
443create or replace function redcap.is_number(param varchar2) return number
444 as
445   ret number;
446 begin
447    ret := to_number(param);
448    return 1;
449 exception
450    when others then return 0;
451 end;
452/
453-- Need for / http://docs.oracle.com/cd/B19306_01/server.102/b14357/ch4.htm
454*/
455
456
457/*
458-- Function to verify that the data is a date as it supposed to be
459-- This function is needed as REDCap doesn't completely enforce data validation
460-- 1 means param is a date, 0 means param is not a date
461create or replace function redcap.is_rcdate(param varchar2) return number
462 as
463   ret DATE;
464 begin
465    ret := to_date(param,'YYYY-MM-DD');
466    return 1;
467 exception
468    when others then return 0;
469 end;
470/
471*/
Note: See TracBrowser for help on using the repository browser.