Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/seer_recode.sql @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 20.4 KB
Line 
1/** seer_recode -- recode primary site, histology into SEER site summary
2
3Copyright (c) 2012-2013 University of Kansas Medical Center
4part of the HERON* open source codebase; see NOTICE file for license details.
5* http://informatics.kumc.edu/work/wiki/HERON
6
7Largely derived from:
8  SEER Site Recode ICD-O-3 (1/27/2003) Definition
9  http://seer.cancer.gov/siterecode/icdo3_d01272003/
10
11by way of seer_recode.py
12
13  http://informatics.kumc.edu/work/browser/tumor_reg/seer_recode.py
14 */
15
16-- test that we're in the KUMC sid with the NAACCR data
17-- note mis-spelling of schema name: naacr
18select "Accession Number--Hosp" from naacr.extract where 1=0;
19
20create or replace view seer_recode_aux as
21select MRN
22     , ne.case_index
23     , start_date
24     , ne.site, ne.histology,
25case
26/* Lip */ when (site between 'C000' and 'C009')
27  and  not (histology between '9050' and '9055'
28   or histology = '9140'
29   or histology between '9590' and '9992') then '20010'
30
31/* Tongue */ when (site between 'C019' and 'C029')
32  and  not (histology between '9050' and '9055'
33   or histology = '9140'
34   or histology between '9590' and '9992') then '20020'
35
36/* Salivary Gland */ when (site between 'C079' and 'C089')
37  and  not (histology between '9050' and '9055'
38   or histology = '9140'
39   or histology between '9590' and '9992') then '20030'
40
41/* Floor of Mouth */ when (site between 'C040' and 'C049')
42  and  not (histology between '9050' and '9055'
43   or histology = '9140'
44   or histology between '9590' and '9992') then '20040'
45
46/* Gum and Other Mouth */ when (site between 'C030' and 'C039'
47   or site between 'C050' and 'C059'
48   or site between 'C060' and 'C069')
49  and  not (histology between '9050' and '9055'
50   or histology = '9140'
51   or histology between '9590' and '9992') then '20050'
52
53/* Nasopharynx */ when (site between 'C110' and 'C119')
54  and  not (histology between '9050' and '9055'
55   or histology = '9140'
56   or histology between '9590' and '9992') then '20060'
57
58/* Tonsil */ when (site between 'C090' and 'C099')
59  and  not (histology between '9050' and '9055'
60   or histology = '9140'
61   or histology between '9590' and '9992') then '20070'
62
63/* Oropharynx */ when (site between 'C100' and 'C109')
64  and  not (histology between '9050' and '9055'
65   or histology = '9140'
66   or histology between '9590' and '9992') then '20080'
67
68/* Hypopharynx */ when (site = 'C129'
69   or site between 'C130' and 'C139')
70  and  not (histology between '9050' and '9055'
71   or histology = '9140'
72   or histology between '9590' and '9992') then '20090'
73
74/* Other Oral Cavity and Pharynx */ when (site = 'C140'
75   or site between 'C142' and 'C148')
76  and  not (histology between '9050' and '9055'
77   or histology = '9140'
78   or histology between '9590' and '9992') then '20100'
79
80/* Esophagus */ when (site between 'C150' and 'C159')
81  and  not (histology between '9050' and '9055'
82   or histology = '9140'
83   or histology between '9590' and '9992') then '21010'
84
85/* Stomach */ when (site between 'C160' and 'C169')
86  and  not (histology between '9050' and '9055'
87   or histology = '9140'
88   or histology between '9590' and '9992') then '21020'
89
90/* Small Intestine */ when (site between 'C170' and 'C179')
91  and  not (histology between '9050' and '9055'
92   or histology = '9140'
93   or histology between '9590' and '9992') then '21030'
94
95/* Cecum */ when (site = 'C180')
96  and  not (histology between '9050' and '9055'
97   or histology = '9140'
98   or histology between '9590' and '9992') then '21041'
99
100/* Appendix */ when (site = 'C181')
101  and  not (histology between '9050' and '9055'
102   or histology = '9140'
103   or histology between '9590' and '9992') then '21042'
104
105/* Ascending Colon */ when (site = 'C182')
106  and  not (histology between '9050' and '9055'
107   or histology = '9140'
108   or histology between '9590' and '9992') then '21043'
109
110/* Hepatic Flexure */ when (site = 'C183')
111  and  not (histology between '9050' and '9055'
112   or histology = '9140'
113   or histology between '9590' and '9992') then '21044'
114
115/* Transverse Colon */ when (site = 'C184')
116  and  not (histology between '9050' and '9055'
117   or histology = '9140'
118   or histology between '9590' and '9992') then '21045'
119
120/* Splenic Flexure */ when (site = 'C185')
121  and  not (histology between '9050' and '9055'
122   or histology = '9140'
123   or histology between '9590' and '9992') then '21046'
124
125/* Descending Colon */ when (site = 'C186')
126  and  not (histology between '9050' and '9055'
127   or histology = '9140'
128   or histology between '9590' and '9992') then '21047'
129
130/* Sigmoid Colon */ when (site = 'C187')
131  and  not (histology between '9050' and '9055'
132   or histology = '9140'
133   or histology between '9590' and '9992') then '21048'
134
135/* Large Intestine, NOS */ when (site between 'C188' and 'C189'
136   or site = 'C260')
137  and  not (histology between '9050' and '9055'
138   or histology = '9140'
139   or histology between '9590' and '9992') then '21049'
140
141/* Rectosigmoid Junction */ when (site = 'C199')
142  and  not (histology between '9050' and '9055'
143   or histology = '9140'
144   or histology between '9590' and '9992') then '21051'
145
146/* Rectum */ when (site = 'C209')
147  and  not (histology between '9050' and '9055'
148   or histology = '9140'
149   or histology between '9590' and '9992') then '21052'
150
151/* Anus, Anal Canal and Anorectum */ when (site between 'C210' and 'C212'
152   or site = 'C218')
153  and  not (histology between '9050' and '9055'
154   or histology = '9140'
155   or histology between '9590' and '9992') then '21060'
156
157/* Liver */ when (site = 'C220')
158  and  not (histology between '9050' and '9055'
159   or histology = '9140'
160   or histology between '9590' and '9992') then '21071'
161
162/* Intrahepatic Bile Duct */ when (site = 'C221')
163  and  not (histology between '9050' and '9055'
164   or histology = '9140'
165   or histology between '9590' and '9992') then '21072'
166
167/* Gallbladder */ when (site = 'C239')
168  and  not (histology between '9050' and '9055'
169   or histology = '9140'
170   or histology between '9590' and '9992') then '21080'
171
172/* Other Biliary */ when (site between 'C240' and 'C249')
173  and  not (histology between '9050' and '9055'
174   or histology = '9140'
175   or histology between '9590' and '9992') then '21090'
176
177/* Pancreas */ when (site between 'C250' and 'C259')
178  and  not (histology between '9050' and '9055'
179   or histology = '9140'
180   or histology between '9590' and '9992') then '21100'
181
182/* Retroperitoneum */ when (site = 'C480')
183  and  not (histology between '9050' and '9055'
184   or histology = '9140'
185   or histology between '9590' and '9992') then '21110'
186
187/* Peritoneum, Omentum and Mesentery */ when (site between 'C481' and 'C482')
188  and  not (histology between '9050' and '9055'
189   or histology = '9140'
190   or histology between '9590' and '9992') then '21120'
191
192/* Other Digestive Organs */ when (site between 'C268' and 'C269'
193   or site = 'C488')
194  and  not (histology between '9050' and '9055'
195   or histology = '9140'
196   or histology between '9590' and '9992') then '21130'
197
198/* Nose, Nasal Cavity and Middle Ear */ when (site between 'C300' and 'C301'
199   or site between 'C310' and 'C319')
200  and  not (histology between '9050' and '9055'
201   or histology = '9140'
202   or histology between '9590' and '9992') then '22010'
203
204/* Larynx */ when (site between 'C320' and 'C329')
205  and  not (histology between '9050' and '9055'
206   or histology = '9140'
207   or histology between '9590' and '9992') then '22020'
208
209/* Lung and Bronchus */ when (site between 'C340' and 'C349')
210  and  not (histology between '9050' and '9055'
211   or histology = '9140'
212   or histology between '9590' and '9992') then '22030'
213
214/* Pleura */ when (site = 'C384')
215  and  not (histology between '9050' and '9055'
216   or histology = '9140'
217   or histology between '9590' and '9992') then '22050'
218
219/* Trachea, Mediastinum and Other Respiratory Organs */ when (site = 'C339'
220   or site between 'C381' and 'C383'
221   or site = 'C388'
222   or site = 'C390'
223   or site = 'C398'
224   or site = 'C399')
225  and  not (histology between '9050' and '9055'
226   or histology = '9140'
227   or histology between '9590' and '9992') then '22060'
228
229/* Bones and Joints */ when (site between 'C400' and 'C419')
230  and  not (histology between '9050' and '9055'
231   or histology = '9140'
232   or histology between '9590' and '9992') then '23000'
233
234/* Soft Tissue including Heart */ when (site = 'C380'
235   or site between 'C470' and 'C479'
236   or site between 'C490' and 'C499')
237  and  not (histology between '9050' and '9055'
238   or histology = '9140'
239   or histology between '9590' and '9992') then '24000'
240
241/* Melanoma of the Skin */ when (site between 'C440' and 'C449')
242  and (histology between '8720' and '8790') then '25010'
243
244/* Other Non-Epithelial Skin */ when (site between 'C440' and 'C449')
245  and  not (histology between '8000' and '8005'
246   or histology between '8010' and '8046'
247   or histology between '8050' and '8084'
248   or histology between '8090' and '8110'
249   or histology between '8720' and '8790'
250   or histology between '9050' and '9055'
251   or histology = '9140'
252   or histology between '9590' and '9992') then '25020'
253
254/* Breast */ when (site between 'C500' and 'C509')
255  and  not (histology between '9050' and '9055'
256   or histology = '9140'
257   or histology between '9590' and '9992') then '26000'
258
259/* Cervix Uteri */ when (site between 'C530' and 'C539')
260  and  not (histology between '9050' and '9055'
261   or histology = '9140'
262   or histology between '9590' and '9992') then '27010'
263
264/* Corpus Uteri */ when (site between 'C540' and 'C549')
265  and  not (histology between '9050' and '9055'
266   or histology = '9140'
267   or histology between '9590' and '9992') then '27020'
268
269/* Uterus, NOS */ when (site = 'C559')
270  and  not (histology between '9050' and '9055'
271   or histology = '9140'
272   or histology between '9590' and '9992') then '27030'
273
274/* Ovary */ when (site = 'C569')
275  and  not (histology between '9050' and '9055'
276   or histology = '9140'
277   or histology between '9590' and '9992') then '27040'
278
279/* Vagina */ when (site = 'C529')
280  and  not (histology between '9050' and '9055'
281   or histology = '9140'
282   or histology between '9590' and '9992') then '27050'
283
284/* Vulva */ when (site between 'C510' and 'C519')
285  and  not (histology between '9050' and '9055'
286   or histology = '9140'
287   or histology between '9590' and '9992') then '27060'
288
289/* Other Female Genital Organs */ when (site between 'C570' and 'C589')
290  and  not (histology between '9050' and '9055'
291   or histology = '9140'
292   or histology between '9590' and '9992') then '27070'
293
294/* Prostate */ when (site = 'C619')
295  and  not (histology between '9050' and '9055'
296   or histology = '9140'
297   or histology between '9590' and '9992') then '28010'
298
299/* Testis */ when (site between 'C620' and 'C629')
300  and  not (histology between '9050' and '9055'
301   or histology = '9140'
302   or histology between '9590' and '9992') then '28020'
303
304/* Penis */ when (site between 'C600' and 'C609')
305  and  not (histology between '9050' and '9055'
306   or histology = '9140'
307   or histology between '9590' and '9992') then '28030'
308
309/* Other Male Genital Organs */ when (site between 'C630' and 'C639')
310  and  not (histology between '9050' and '9055'
311   or histology = '9140'
312   or histology between '9590' and '9992') then '28040'
313
314/* Urinary Bladder */ when (site between 'C670' and 'C679')
315  and  not (histology between '9050' and '9055'
316   or histology = '9140'
317   or histology between '9590' and '9992') then '29010'
318
319/* Kidney and Renal Pelvis */ when (site = 'C649'
320   or site = 'C659')
321  and  not (histology between '9050' and '9055'
322   or histology = '9140'
323   or histology between '9590' and '9992') then '29020'
324
325/* Ureter */ when (site = 'C669')
326  and  not (histology between '9050' and '9055'
327   or histology = '9140'
328   or histology between '9590' and '9992') then '29030'
329
330/* Other Urinary Organs */ when (site between 'C680' and 'C689')
331  and  not (histology between '9050' and '9055'
332   or histology = '9140'
333   or histology between '9590' and '9992') then '29040'
334
335/* Eye and Orbit */ when (site between 'C690' and 'C699')
336  and  not (histology between '9050' and '9055'
337   or histology = '9140'
338   or histology between '9590' and '9992') then '30000'
339
340/* Brain */ when (site between 'C710' and 'C719')
341  and  not (histology between '9050' and '9055'
342   or histology = '9140'
343   or histology between '9530' and '9539'
344   or histology between '9590' and '9992') then '31010'
345
346/* Cranial Nerves Other Nervous System */ when (site between 'C710' and 'C719')
347  and (histology between '9530' and '9539') then '31040'
348
349/* None */ when (site between 'C700' and 'C709'
350   or site between 'C720' and 'C729')
351  and  not (histology between '9050' and '9055'
352   or histology = '9140'
353   or histology between '9590' and '9992') then '31040'
354
355/* Thyroid */ when (site = 'C739')
356  and  not (histology between '9050' and '9055'
357   or histology = '9140'
358   or histology between '9590' and '9992') then '32010'
359
360/* Other Endocrine including Thymus */ when (site = 'C379'
361   or site between 'C740' and 'C749'
362   or site between 'C750' and 'C759')
363  and  not (histology between '9050' and '9055'
364   or histology = '9140'
365   or histology between '9590' and '9992') then '32020'
366
367/* Hodgkin - Nodal */ when (site = 'C024'
368   or site between 'C098' and 'C099'
369   or site = 'C111'
370   or site = 'C142'
371   or site = 'C379'
372   or site = 'C422'
373   or site between 'C770' and 'C779')
374  and (histology between '9650' and '9667') then '33011'
375
376/* Hodgkin - Extranodal */ when (site = 'All other sites')
377  and (histology between '9650' and '9667') then '33012'
378
379/* NHL - Nodal */ when (site = 'C024'
380   or site = 'C098'
381   or site = 'C099'
382   or site = 'C111'
383   or site = 'C142'
384   or site = 'C379'
385   or site = 'C422'
386   or site between 'C770' and 'C779')
387  and (histology between '9590' and '9597'
388   or histology between '9670' and '9671'
389   or histology = '9673'
390   or histology = '9675'
391   or histology between '9678' and '9680'
392   or histology = '9684'
393   or histology between '9687' and '9691'
394   or histology = '9695'
395   or histology between '9698' and '9702'
396   or histology = '9705'
397   or histology between '9708' and '9709'
398   or histology = '9712'
399   or histology between '9714' and '9719'
400   or histology between '9724' and '9729'
401   or histology = '9735'
402   or histology between '9737' and '9738'
403   or histology between '9811' and '9818'
404   or histology = '9823'
405   or histology = '9827'
406   or histology = '9837') then '33041'
407
408/* NHL - Extranodal */ when  not (site = 'C024'
409   or site between 'C098' and 'C099'
410   or site = 'C111'
411   or site = 'C142'
412   or site = 'C379'
413   or site = 'C422'
414   or site between 'C770' and 'C779')
415  and (histology between '9590' and '9597'
416   or histology between '9670' and '9671'
417   or histology = '9673'
418   or histology = '9675'
419   or histology between '9678' and '9680'
420   or histology = '9684'
421   or histology between '9687' and '9691'
422   or histology = '9695'
423   or histology between '9698' and '9702'
424   or histology = '9705'
425   or histology between '9708' and '9709'
426   or histology = '9712'
427   or histology between '9714' and '9719'
428   or histology between '9724' and '9729'
429   or histology = '9735'
430   or histology between '9737' and '9738') then '33042'
431
432/* None */ when  not (site = 'C024'
433   or site between 'C098' and 'C099'
434   or site = 'C111'
435   or site = 'C142'
436   or site = 'C379'
437   or site between 'C420' and 'C422'
438   or site = 'C424'
439   or site between 'C770' and 'C779')
440  and (histology between '9811' and '9818'
441   or histology = '9823'
442   or histology = '9827'
443   or histology = '9837') then '33042'
444
445/* Myeloma */ when (histology between '9731' and '9732'
446   or histology = '9734') then '34000'
447
448/* Acute Lymphocytic Leukemia */ when (histology = '9826'
449   or histology between '9835' and '9836') then '35011'
450
451/* None */ when (site = 'C420'
452   or site = 'C421'
453   or site = 'C424')
454  and (histology between '9811' and '9818'
455   or histology = '9837') then '35011'
456
457/* Chronic Lymphocytic Leukemia */ when (site = 'C420'
458   or site = 'C421'
459   or site = 'C424')
460  and (histology = '9823') then '35012'
461
462/* Other Lymphocytic Leukemia */ when (histology = '9820'
463   or histology between '9832' and '9834'
464   or histology = '9940') then '35013'
465
466/* Acute Myeloid Leukemia */ when (histology = '9840'
467   or histology = '9861'
468   or histology between '9865' and '9867'
469   or histology = '9869'
470   or histology between '9871' and '9874'
471   or histology between '9895' and '9897'
472   or histology = '9898'
473   or histology between '9910' and '9911'
474   or histology = '9920') then '35021'
475
476/* Acute Monocytic Leukemia */ when (histology = '9891') then '35031'
477
478/* Chronic Myeloid Leukemia */ when (histology = '9863'
479   or histology between '9875' and '9876'
480   or histology between '9945' and '9946') then '35022'
481
482/* Other Myeloid/Monocytic Leukemia */ when (histology = '9860'
483   or histology = '9930') then '35023'
484
485/* Other Acute Leukemia */ when (histology = '9801'
486   or histology between '9805' and '9809'
487   or histology = '9931') then '35041'
488
489/* Aleukemic, subleukemic and NOS */ when (histology = '9733'
490   or histology = '9742'
491   or histology = '9800'
492   or histology = '9831'
493   or histology = '9870'
494   or histology = '9948'
495   or histology between '9963' and '9964') then '35043'
496
497/* None */ when (site = 'C420'
498   or site = 'C421'
499   or site = 'C424')
500  and (histology = '9827') then '35043'
501
502/* Mesothelioma */ when (histology between '9050' and '9055') then '36010'
503
504/* Kaposi Sarcoma */ when (histology = '9140') then '36020'
505
506/* Miscellaneous */ when (histology between '9740' and '9741'
507   or histology between '9750' and '9769'
508   or histology = '9950'
509   or histology between '9960' and '9962'
510   or histology between '9965' and '9967'
511   or histology between '9970' and '9971'
512   or histology = '9975'
513   or histology = '9980'
514   or histology between '9982' and '9987'
515   or histology = '9989'
516   or histology between '9991' and '9992') then '37000'
517
518/* None */ when (site between 'C760' and 'C768'
519   or site = 'C809')
520  and  not (histology between '9050' and '9055'
521   or histology = '9140'
522   or histology between '9590' and '9992') then '37000'
523
524/* None */ when (site between 'C420' and 'C424')
525  and  not (histology between '9050' and '9055'
526   or histology = '9140'
527   or histology between '9590' and '9992') then '37000'
528
529/* None */ when (site between 'C770' and 'C779')
530  and  not (histology between '9050' and '9055'
531   or histology = '9140'
532   or histology between '9590' and '9992') then '37000'
533
534/* Invalid */ else '99999'
535end
536
537as recode
538from
539 (select ne."Patient ID Number" as MRN
540       , ne.case_index
541       , ne."Primary Site" as site
542       , substr(ne."Morph--Type&Behav ICD-O-3", 1, 4) histology
543       , to_date(case length(ne."Date of Diagnosis")
544               when 8 then ne."Date of Diagnosis"
545               when 6 then ne."Date of Diagnosis" || '01'
546               when 4 then ne."Date of Diagnosis" || '0101'
547               end, 'yyyymmdd') as start_date
548  from naacr.extract ne
549  where ne."Date of Diagnosis" is not null
550    and ne."Accession Number--Hosp" is not null) ne;
551
552
553create or replace view seer_recode_facts as
554select MRN
555     , ne.case_index as encounter_ide
556     , 'SEER_SITE:' || recode concept_cd, '@' item_name
557     , '@' provider_id
558     , start_date
559     , '@' modifier_cd
560     , 1 instance_num
561     , '@' as valtype_cd
562     , '@' as tval_char
563     , to_number(null) as nval_num
564     , null as valueflag_cd
565     , null as units_cd
566     , start_date as end_date
567     , '@' location_cd
568     , to_date(null) as update_date
569from seer_recode_aux ne;
570
571/*
572select count(*), concept_cd
573from seer_recode_facts
574group by concept_cd
575order by 1 desc;
576*/
577
578/**
579 * Verify the above algorithm vs. results of John K.'s SAS code.
580select count(*) from naacr.extract;
581select count(*) from seer_recode_facts;
582select count(*) from seer_jk;
583
584select count(distinct mrn)
585from seer_recode_facts
586where concept_cd='SEER_SITE:22030';
587-- 6443 here; 6407 in i2b2. hmm.
588
589
590select jk.*, sf.recode,
591  sf.site,
592  sf.histology
593from
594seer_recode_aux sf
595left join seer_jk jk
596on jk.accno = sf."Accession Number--Hosp"
597  and jk.SeqNoHos = sf."Sequence Number--Hospital"
598  and jk.sitenew = sf.site
599  and jk.histo3 = sf.histology
600where sf.recode != jk.site_recode
601order by jk.site_recode
602;
603 */
604
605
606/* TODO: Handle null histology.
607select case
608 when (site between 'C019' and 'C029')
609  and  not (histology between '9590' and '9989'
610   or histology between '9050' and '9055'
611   or histology = '9140') then '20020'
612   end recode
613   from
614   (select 'C019' site, null histology from dual);
615
616For now, verify that it's in the noise:
617 */
618
619select case when missing_histology / tot > .001 then 1/0
620            else 1 end as few_missing_histologies
621from (
622select (
623select count(*)
624from naacr.extract ne
625 where ne."Morph--Type&Behav ICD-O-3" is null) missing_histology,
626(select count(*)  from naacr.extract ne) tot
627from dual
628)
629;
Note: See TracBrowser for help on using the repository browser.