Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/naaccr_concepts_load.sql @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 13.8 KB
Line 
1/** naaccr_load.sql -- load i2b2 concepts from NAACCR tumor registry data
2
3Copyright (c) 2013 University of Kansas Medical Center
4part of the HERON* open source codebase; see NOTICE file for license details.
5* http://informatics.kumc.edu/work/wiki/HERON
6
7 * ack: "Key, Dustin" <key.d@ghc.org>
8 * Thu, 18 Aug 2011 16:16:31 -0700
9 *
10 * see also: naacr_init.sql, naacr_txform.sql
11 */
12
13/* Check that we're running in the identified repository. */
14select * from NightHeronData.observation_fact where 1=0;
15
16/* Check for NAACCR extract table (in KUMC database).
17oops... typo in schema name. keep it that way?
18*/
19select * from naacr.extract where 1=0;
20
21/* check that transformation views are in place */
22select * from tumor_item_value tiv where 1=0;
23
24/* check that metadata_init.sql was run to create the ontology table. */
25select c_name from BlueHeronMetadata.NAACCR_ONTOLOGY@deid where 1=0;
26
27-- check that WHO materials are staged
28select * from who.topo where 1=0;
29
30
31
32/* Exploration/analysis queries ...
33
34-- How many records did we load from the extract?
35select count(*)
36from naacr.extract ne;
37-- 65584
38
39-- How many distinct patients? How many tumors per patient?
40select count(distinct ne."Patient ID Number") as total_patients,
41round(count(*) / count(distinct ne."Patient ID Number"), 3) as tumors_per_patient
42from naacr.extract ne;
43-- 60667        1.081
44
45
46-- Patient mapping: do all of them have Patient IDs?
47select count(to_number(ne."Patient ID Number"))
48  from NAACR.EXTRACT ne;
49-- 65584, so yes.
50
51*/
52
53
54
55/********
56 * Concepts
57 */
58
59/* ICD-O topographic codes for primary site */
60/* TODO: check that it's OK to throw away lvl='incl' synonyms */
61whenever sqlerror continue;
62drop table icd_o_topo;
63whenever sqlerror exit;
64create table icd_o_topo as
65with major as (
66  select * from who.topo
67  where lvl = '3'
68)
69, minor as (
70  select * from who.topo
71  where lvl = '4'
72)
73select 3 lvl, major.kode concept_cd, 'FA' as c_visualattributes,
74       major.kode || '\' path, major.title concept_name
75from major
76union all
77select 4 lvl, replace(minor.kode, '.', '') concept_cd,  'LA' as c_visualattributes,
78       major.kode || '\' || minor.kode || '\', minor.title
79from major
80join minor on minor.kode like (major.kode || '%')
81;
82
83/*
84-- eyeball it
85select * from icd_o_topo order by path;
86*/
87
88/* ICD-O-2, -3 morphology codes for histology */
89whenever sqlerror continue;
90drop table icd_o_morph;
91whenever sqlerror exit;
92create table icd_o_morph as
93with item as (
94  select '0419' itemnbr from dual -- ICD-O-2
95union all
96  select '0521' itemnbr from dual -- ICD-O-3
97),
98morph as (
99  select replace(code, 'M', '') code, label, item.itemnbr
100  from who.morph2 m, item
101  where item.itemnbr = '0419'
102union all
103  select m.code, m.notes label, item.itemnbr
104  from who.morph3 m, item
105  where item.itemnbr = '0521'
106  and m.label = 'title'
107),
108major as (
109  select substr(code, 1, 3) lo,
110         case
111           when code like '%-%'
112           then substr(code, 5, 3)
113           else code
114         end hi,
115         morph.*
116  -- only ICD-O-2 has hierarchy
117  from morph where code not like '%/%'
118union all
119  -- 975 seems to be new in ICD-O-3
120  select '975' lo, '975' hi, '975' code,
121    'Neoplasms of histiocytes and accessory lymphoid cells' label,
122    itemnbr
123  from dual, item
124),
125minor as (
126  select morph.*
127  from morph where code like '%/%'
128)
129select 3 lvl, item.itemnbr, major.code concept_cd, 'FA' as c_visualattributes,
130       major.code || '\' path, major.code || ' ' || major.label concept_name
131from major, item
132
133union all
134
135select 4 lvl, minor.itemnbr, replace(minor.code, '/', '') concept_cd,  'LA' as c_visualattributes,
136       major.code || '\' || minor.code || '\', minor.code || ' ' || minor.label
137
138from major
139join minor on substr(minor.code, 1, 3) between major.lo and major.hi
140;
141
142select case when count(*) > 0 then 1/0 else 1 end
143  all_morph2_codes_joined from (
144select *
145from who.morph2
146left join icd_o_morph
147       on regexp_replace(code, '[M/]', '') = concept_cd
148where code like '%/%' and concept_cd is null
149)
150;
151
152select case when count(*) > 0 then 1/0 else 1 end
153  all_morph3_codes_joined from (
154select *
155from who.morph3
156left join icd_o_morph
157       on regexp_replace(code, '[M/]', '') = concept_cd
158where code like '%/%' and concept_cd is null
159and label = 'title'
160)
161;
162
163
164/*
165-- eyeball it
166select * from icd_o_morph order by path;
167*/
168
169/** tumor_reg_codes - one row for each distinct concept_cd in the data
170 *  - NAACCR|III:CCC concept codes: one row per code value per coded item
171 *  - NAACCR|NNN: concept codes: one per non-coded (numeric, date, ...) item
172 */
173
174whenever sqlerror continue;
175drop table tumor_reg_codes;
176whenever sqlerror exit;
177create table tumor_reg_codes as
178-- Note: this includes both coded and other (date, numeric) items
179select distinct
180  tiv.sectionid, tiv.section
181, tiv.itemid, tiv.itemnbr, tiv.itemname
182, tiv.concept_cd, tiv.codenbr
183from tumor_item_value tiv;
184
185-- select * from tumor_reg_codes;
186-- select count(*) from tumor_reg_codes;
187
188/** tumor_reg_concepts -- one row per code from data or data dictionary
189 *
190 * A left join from the data dictionary to the data would leave out
191 * codes that appear only in the data.
192 *
193 * A left join from the data to the data dictionary would leave out
194 * codes that appear only in the data dictionary.
195 *
196 * So we take the union of these, left join it with the data dictionary,
197 * and for c_visualattributes, check whether any such data exist.
198 */
199whenever sqlerror continue;
200drop table tumor_reg_concepts;
201whenever sqlerror exit;
202create table tumor_reg_concepts as
203select coded.sectionid, coded.section, coded.itemnbr, coded.itemname
204     , concept_cd
205     , case
206       -- concepts where we have data are Active
207       when exists (
208         select 1
209         from tumor_reg_codes trc
210         where trc.itemnbr = coded.itemnbr
211         and trc.codenbr = tc.codenbr) then 'LA'
212       else 'LH'
213       end c_visualattributes
214     , tc.codenbr, coalesce(label.c_name, tc.codenbr) c_name
215from
216-- For each *coded* item from the data dictionary...
217(
218  select sectionid, section, itemnbr, itemname
219  from tumor_item_type
220  where valtype_cd = '@'
221) coded
222-- ... find all the code values...
223join (
224  -- ... from the data ...
225  select distinct itemnbr, codenbr, concept_cd
226  from tumor_reg_codes
227  where codenbr is not null
228
229  union
230
231  -- ... as well as those from the data dictionary (t_code) ...
232  select distinct to_number(ti."ItemNbr"), codenbr
233                , 'NAACCR|' || ti."ItemNbr" || ':' || codenbr concept_cd
234  from naacr.t_code tc
235  join naacr.t_item ti on tc.itemid = ti."ItemID"
236  where tc.codedcrp is not null
237  -- exclude description of codes; we just want codes
238  and tc.codenbr not like '% %'
239  and tc.codenbr not like '%<%'
240  and tc.codenbr not in ('..', '*', 'User-defined', 'nn')
241) tc on coded.itemnbr = tc.itemnbr
242-- now get labels where available from t_code
243left join (
244  select ty.itemnbr, tc.codenbr, min(tc.codenbr || ' ' || tc.codedcrp) c_name
245  from naacr.t_code tc
246  join tumor_item_type ty on tc.itemid = ty.itemid
247  group by ty.itemnbr, tc.codenbr
248) label
249  on label.itemnbr = coded.itemnbr
250 and label.codenbr = tc.codenbr
251;
252
253-- eyeball it:
254-- select * from tumor_reg_concepts order by sectionid, itemnbr, codenbr;
255-- select count(*) from tumor_reg_concepts;
256-- 1849 (in test)
257
258
259delete from BlueHeronMetadata.NAACCR_ONTOLOGY@deid;
260
261insert into BlueHeronMetadata.NAACCR_ONTOLOGY@deid (
262  c_hlevel, c_fullname, c_name, c_basecode, c_dimcode, c_visualattributes,
263  c_synonym_cd, c_facttablecolumn, c_tablename, c_columnname, c_columndatatype,
264  c_operator, m_applied_path,
265  update_date, import_date, sourcesystem_cd
266)
267select i2b2_root.c_hlevel + terms.c_hlevel as c_hlevel
268     , i2b2_root.c_fullname || naaccr_folder.path || terms.path as c_fullname
269     , terms.concept_name
270     , terms.concept_cd
271     , i2b2_root.c_fullname || naaccr_folder.path || terms.path as c_dimcode
272     , c_visualattributes,
273  norm.*,
274  sysdate as update_date, sysdate as import_date,
275  tumor_reg_source.source_cd as sourcesystem_cd
276from
277(
278select 1 as c_hlevel
279     , '' as path
280     , 'Cancer Cases' as concept_name
281     , null as concept_cd
282     , 'FA' as c_visualattributes
283from dual
284
285union all
286/* Section concepts */
287select 2 as c_hlevel
288     , 'S:' || nts.sectionid || ' ' || section || '\' as path
289     , trim(to_char(nts.sectionid, '09')) || ' ' || section as concept_name
290     , null as concept_cd
291     , case
292       when trc.sectionid is null then 'FH'
293       else 'FA'
294       end as c_visualattributes
295from NAACR.t_section nts
296left join (
297  select distinct sectionid
298  from tumor_reg_codes) trc
299  on trc.sectionid = nts.sectionid
300
301union all
302/* Item concepts */
303select 3 as c_hlevel
304     , 'S:' || ns.sectionid || ' ' || ns.section || '\'
305       || substr(trim(to_char(ni."ItemNbr", '0999')) || ' ' || ni."ItemName", 1, 40) || '\' as path
306     , trim(to_char(ni."ItemNbr", '0999')) || ' ' || ni."ItemName" as concept_name
307     , 'NAACCR|' || ni."ItemNbr" || ':' as concept_cd
308     , case
309         when ni."ItemNbr" in (
310                    -- hide Histology since
311                    -- we already have Morph--Type/Behav
312                    '0420', '0522')
313              or viz1 is null -- hide concepts where we have no data
314              then 'LH'
315         else viz1 || 'A'
316       end c_visualattributes
317from NAACR.t_section ns
318join NAACR.t_item ni on ns.sectionid = to_number(ni."SectionID")
319left join (
320  select distinct itemnbr,
321    case when codenbr is null then 'L' else 'F' end as viz1
322  from tumor_reg_codes) trc
323  on ni."ItemNbr" = trc.itemnbr
324
325union all
326/* Code concepts */
327select distinct 4 as c_hlevel
328     , 'S:' || sectionid || ' ' || section || '\'
329       || substr(trim(to_char(itemnbr, '0999')) || ' ' || itemname, 1, 40) || '\'
330       || substr(c_name, 1, 40) || '\'
331       as concept_path
332     , c_name as concept_name
333     , concept_cd
334     , case
335       when itemnbr in (
336                    -- hide Histology since
337                    -- we already have Morph--Type/Behav
338                    '0420', '0522')
339       then 'LH'
340       else c_visualattributes
341       end as c_visualattributes
342from tumor_reg_concepts
343where itemnbr not in (400, 419, 521) -- separate code for primary site, Morph.
344
345union all
346
347/* Primary site concepts */
348select distinct lvl + 1 as c_hlevel
349     , 'S:' || sectionid || ' ' || section || '\'
350       || substr(trim(to_char(itemnbr, '0999')) || ' ' || itemname, 1, 40) || '\'
351       || icdo.path
352       as concept_path
353     , icdo.concept_name concept_name
354     , 'NAACCR|400:' || icdo.concept_cd concept_cd
355     , icdo.c_visualattributes
356from icd_o_topo icdo, tumor_reg_concepts
357where itemnbr  = 400
358
359/* Morph--Type/Behav concepts */
360union all
361select distinct lvl + 1 as c_hlevel
362     , 'S:' || sectionid || ' ' || section || '\'
363       || substr(trim(to_char(tr.itemnbr, '0999')) || ' ' || itemname, 1, 40) || '\'
364       || icdo.path
365       as concept_path
366     , icdo.concept_name concept_name
367     , substr(tr.concept_cd, 1, length('NAACCR|400:')) || icdo.concept_cd concept_cd
368     , icdo.c_visualattributes
369from icd_o_morph icdo, tumor_reg_concepts tr
370where tr.itemnbr in (419, 521)
371
372union all
373/* SEER Site Summary concepts*/
374select 2 as c_hlevel
375     , 'SEER Site\' as path
376     , 'SEER Site Summary' as concept_name
377     , null as concept_cd
378     , 'FA' as c_visualattributes
379from dual
380
381union all
382
383select 3 + hlevel as c_hlevel
384     , 'SEER Site\' || path || '\' as path
385     , name as concept_name
386     , case when basecode is null then null
387       else 'SEER_SITE:' || basecode end as concept_cd
388     , visualattributes as c_visualattributes
389from seer_site_terms@deid
390) terms
391, (select 'naaccr\' as path
392     , 'NAACCR' as concept_name
393     from dual) naaccr_folder
394, (select 0 c_hlevel, '\i2b2\' c_fullname from dual) i2b2_root
395, BlueHeronMetadata.normal_concept@deid norm
396, (select * from BlueHeronData.source_master@deid
397   where source_cd like 'tumor_registry@%') tumor_reg_source;
398
399
400/* Regression tests for earlier bugs. */
401select case when count(*) = 4 then 1 else 1/0 end naaccr_morph_bugs_fixed
402from (
403select distinct c_basecode
404from BlueHeronMetadata.NAACCR_ONTOLOGY@deid
405where c_basecode in ('NAACCR|521:97323', 'NAACCR|521:80413',
406                     'NAACCR|521:98353', 'NAACCR|400:C619')
407);
408
409
410insert into etl_test_values (test_domain, test_name, test_value, result_id, result_date, detail_num_1, detail_char_1)
411select 'Cancer Cases' test_domain, 'item_terms_indep_data' test_name
412     , case when ont.c_basecode is null then 0 else 1 end test_value
413     , sq_result_id.nextval result_id
414     , sysdate result_date
415     , ti.itemnbr, ti.itemname
416from (
417select "ItemNbr" itemnbr, null codecrp, "ReqStatus"
418     , 'NAACCR|' || "ItemNbr" || ':' c_basecode, "ItemName" itemname
419from naacr.t_item) ti
420left join (-- avoid link/LOB error ORA-22992
421  select c_basecode, c_name
422  from BlueHeronMetadata.NAACCR_ONTOLOGY@deid) ont
423  on ont.c_basecode = ti.c_basecode
424where ont.c_basecode is null
425and ti."ReqStatus" != 'Retired'
426;
427
428insert into etl_test_values (test_domain, test_name, test_value, result_id, result_date, detail_num_1, detail_char_1, detail_char_2)
429select 'Cancer Cases' test_domain, 'code_terms_indep_data' test_name
430     , case when ont.c_basecode is null then 0 else 1 end test_value
431     , sq_result_id.nextval result_id
432     , sysdate result_date
433     , ti."ItemNbr", ti.codenbr, substr(ti."ItemName" || ' / ' || ti.codedcrp, 1, 255)
434from (
435select "ItemNbr", "ItemName", "ReqStatus", "AllowValue", codenbr
436     , 'NAACCR|' || "ItemNbr" || ':' || codenbr c_basecode, codedcrp
437from naacr.t_code tc
438join naacr.t_item ti on ti."ItemID" = tc.itemid
439) ti
440left join (-- avoid link/LOB error ORA-22992
441  select c_basecode, c_name
442  from BlueHeronMetadata.NAACCR_ONTOLOGY@deid) ont
443  on ont.c_basecode = ti.c_basecode
444where ont.c_basecode is null
445and ti."ReqStatus" != 'Retired'
446
447-- skip numeric values that are actually codes
448and ti."AllowValue" != '10-digit number'
449and ti."AllowValue" not like 'Census Tract Codes%'
450and ti.codenbr not in ('00000000', '88888888', '99999999')
451
452-- comments on/descriptions of codes
453and ti.codenbr not like '<_>%'
454and ti.codenbr not like '% %'
455;
456
457drop table icd_o_topo;
458drop table icd_o_morph;
Note: See TracBrowser for help on using the repository browser.