Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/naaccr_facts_load.sql @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 6.3 KB
Line 
1/** naaccr_facts_load.sql -- load i2b2 facts from NAACCR tumor registry data
2
3Copyright (c) 2013 University of Kansas Medical Center
4part of the HERON* open source codebase; see NOTICE file for license details.
5* http://informatics.kumc.edu/work/wiki/HERON
6
7patterned after epic_facts_load.sql
8
9 * ack: "Key, Dustin" <key.d@ghc.org>
10 * Thu, 18 Aug 2011 16:16:31 -0700
11 *
12 * see also: naacr_init.sql, naacr_txform.sql
13 */
14
15/* Check that we're running in the identified repository. */
16select * from NightHeronData.observation_fact where 1=0;
17
18/* Check for NAACCR extract table (in KUMC database).
19oops... typo in schema name. keep it that way?
20*/
21select * from naacr.extract where 1=0;
22
23/* check that transformation views are in place */
24select * from tumor_reg_visits where 1=0;
25select * from tumor_reg_facts where 1=0;
26
27
28/* Exploration/analysis queries ...
29
30-- How many records did we load from the extract?
31select count(*)
32from naacr.extract ne;
33-- 65584
34
35-- How many distinct patients? How many tumors per patient?
36select count(distinct ne."Patient ID Number") as total_patients,
37round(count(*) / count(distinct ne."Patient ID Number"), 3) as tumors_per_patient
38from naacr.extract ne;
39-- 60667        1.081
40
41
42-- Patient mapping: do all of them have Patient IDs?
43select count(to_number(ne."Patient ID Number"))
44  from NAACR.EXTRACT ne;
45-- 65584, so yes.
46
47-- How many of them match MRNs from our patient mapping?
48select count(*)
49from naacr.extract ne
50join NIGHTHERONDATA.patient_mapping pm
51  on pm.patient_ide_source =
52  (select source_cd from sms_audit_info)
53  and pm.patient_ide = ne."Patient ID Number";
54-- 0. oops.
55
56-- how long are MRNs in our patient_mapping?
57select min(length(pm.patient_ide)),
58  max(length(pm.patient_ide))
59from NIGHTHERONDATA.patient_mapping pm
60where pm.patient_ide_source =
61  (select source_cd from sms_audit_info);
62-- 6 to 7 chars (bytes? never mind...)
63
64-- How long are Patient ID Numbers?
65select min(length(ne."Patient ID Number")),
66  max(length(ne."Patient ID Number"))
67from naacr.extract ne;
68-- 8. hmm.
69
70-- How many of them match after we drop the 1st digit?
71select numerator, denominator, round(numerator/denominator*100, 2) as density
72from (
73  select count(*) as numerator
74  from naacr.extract ne
75  join NIGHTHERONDATA.patient_mapping pm
76    on pm.patient_ide_source =
77       (select source_cd from sms_audit_info)
78   and pm.patient_ide = substr(ne."Patient ID Number", 2)) matches,
79  (select count(*) as denominator from naacr.extract ne)
80;
81-- 65183 out of 65584; i.e. 99.39%
82
83
84-- How many match if we convert digit-strings to numbers?
85select count(*)
86from naacr.extract ne
87join NIGHTHERONDATA.patient_mapping pm
88  on pm.patient_ide_source =
89  (select source_cd from sms_audit_info)
90  and to_number(pm.patient_ide) = to_number(ne."Patient ID Number");
91-- ORA-01722: invalid number. Bad data somewhere; so we can't tell.
92-- FWIW, the NAACCR Patient IDs all convert to_number just fine.
93-- The problem is in the Epic/SMS data.
94
95-- What can we use as a primary key?
96select count(*) from (
97select distinct ne."Accession Number--Hosp", ne."Sequence Number--Hospital"
98from naacr.extract ne);
99-- 65581. almost; all but 4.
100
101-- which 4?
102select count(*), ne."Accession Number--Hosp", ne."Sequence Number--Hospital"
103from naacr.extract ne
104group by ne."Accession Number--Hosp", ne."Sequence Number--Hospital"
105having count(*) > 1;
106
107-- are there any nulls?
108select count(*)
109from naacr.extract ne
110where ne."Accession Number--Hosp" is null;
111-- 2
112*/
113
114
115insert into NightHeronData.encounter_mapping
116  (encounter_num, encounter_ide,
117   encounter_ide_status, encounter_ide_source, project_id,
118   patient_ide, patient_ide_source,
119   import_date, upload_id, download_date, sourcesystem_cd )
120(select NightHeronData.SQ_UP_ENCDIM_ENCOUNTERNUM.nextval as encounter_num
121      , tv.encounter_ide
122      , 'A' as encounter_ide_status
123      , aud.source_cd as encounter_ide_source
124      , '@' as project_id
125      , tv.mrn as patient_ide
126      , sms_audit_info.source_cd as patient_ide_source
127      , sysdate as import_date
128      , up.upload_id
129      , :download_date
130      , up.source_cd
131  from tumor_reg_visits tv
132     , (select * from BlueHeronData.source_master@deid
133        where source_cd like 'tumor_registry@%') aud
134     , (select * from BlueHeronData.source_master@deid
135        where source_cd like 'SMS@%') sms_audit_info
136     , NightHeronData.upload_status up
137  where up.upload_id = :upload_id);
138
139
140truncate table observation_fact_upload;
141whenever sqlerror continue;
142alter table observation_fact_upload
143  disable constraint observation_fact_pk;
144whenever sqlerror exit;
145
146insert into observation_fact_upload (
147  patient_num, encounter_num, sub_encounter,
148  concept_cd,
149  provider_id,
150  start_date,
151  modifier_cd,
152  instance_num,
153  valtype_cd,
154  tval_char,
155  nval_num,
156  valueflag_cd,
157  units_cd,
158  end_date,
159  location_cd,
160  update_date,
161  import_date, upload_id, download_date, sourcesystem_cd)
162select patient_num, encounter_num, tf.encounter_ide,
163  tf.concept_cd,
164  tf.provider_id,
165  tf.start_date,
166  tf.modifier_cd,
167  tf.instance_num,
168  tf.valtype_cd,
169  tf.tval_char,
170  tf.nval_num, tf.valueflag_cd, tf.units_cd,
171  tf.end_date,
172  tf.location_cd,
173  tf.update_date,
174  sysdate, up.upload_id, :download_date, up.source_cd
175from (select * from tumor_reg_facts
176      union all
177      select * from seer_recode_facts) tf
178join NIGHTHERONDATA.patient_mapping pm
179  on pm.patient_ide_source =
180  (select source_cd from BlueHeronData.source_master@deid
181   where source_cd like 'SMS@%')
182  and pm.patient_ide = ltrim(tf.mrn, '0')
183join NIGHTHERONDATA.encounter_mapping em
184  on em.encounter_ide_source =
185  (select source_cd from BlueHeronData.source_master@deid
186   where source_cd like 'tumor_registry@%')
187 and em.encounter_ide = tf.encounter_ide
188 , NightHeronData.upload_status up
189  where up.upload_id = :upload_id
190/* don't bother with:    and part = :part */
191;
192
193commit;
194
195
196
197/* For this upload of data, check primary key constraints. */
198alter table observation_fact_upload
199  enable constraint observation_fact_pk
200  /* TODO: log errors ... ? #2117 */
201  ;
202
203
204/** Summary stats.
205
206Report how many rows are dropped when joining on patient_ide and encounter_id.
207Subsumes check for null :part (#789).
208*/
209update NightHeronData.upload_status
210  set loaded_record = (select count(*) from observation_fact_upload)
211    , no_of_record = (select count(*) from tumor_reg_facts)
212                      +
213                     (select count(*) from seer_recode_facts)
214  where upload_id = :upload_id;
Note: See TracBrowser for help on using the repository browser.