Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/epic_i2b2_deid_verify.sql @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 8.8 KB
Line 
1/* epic_i2b2_deid_verify.sql -- Tests to verify proper de-identification of
2dimensions/facts for patients over the HIPAA age group limit (>=90 at the time
3of writing).
4
5Copyright (c) 2012 University of Kansas Medical Center
6part of the HERON* open source codebase; see NOTICE file for license details.
7* http://informatics.kumc.edu/work/wiki/HERON
8*/
9
10
11--Make sure we can reach both id and de-id databases
12select start_date from blueherondata.observation_fact@deid where 1 = 0;
13select start_date from nightherondata.observation_fact where 1 = 0;
14
15
16/* Depending on when this test is run on the data, it may be inappropriate to
17use sysdate when calculating the ages of people are still alive (may have been
18properly capped to something less than the age limit as per HIPAA, but if the
19test is run sometime later using sysdate, it could show patients older than the
20limit.
21
22So, for patient_dimension age tests, use the upload date for the dimensions
23upload.
24*/
25create or replace view epic_dimensions_load_date as (
26  select max(load_date) load_date from nightherondata.upload_status
27  where transform_name = 'load_epic_dimensions'
28  )
29;
30
31-- Make sure there's some ages greater than or equal to the age limit in id
32select case when count(*) > 0 then 1 else 1/0 end some_gte_agelimit_in_id from(
33  select
34     idpd.patient_num, idpd.age_in_years_num, idpd.birth_date, idpd.death_date
35  from
36    nightherondata.patient_dimension idpd, hipaa_constants hcon
37  where
38    age_in_years_num >= hcon.max_age_years
39  )
40;
41
42
43/* Simple test to make sure the age in the patient dimension is not greater than
44or equal to the age limit.
45*/
46select case when count(*) > 0 then 1/0 else 1 end deid_age_gte_agelimit_pd from(
47  select
48    pat.age_in_years_num
49  from
50    blueherondata.patient_dimension@deid pat, hipaa_constants hcon
51  where
52    pat.age_in_years_num >= hcon.max_age_years
53  )
54;
55
56
57/* Below is useful for finding ages greater than or equal to the age limit in
58SSDMF in case the above test fails.
59
60select p.*, d.* from(
61  select replace(ssn, '-', '') ssn, birth_date epic_birth_date, death_date epic_death_date from(
62    select epat.ssn, epat.birth_date, epat.death_date
63    from hipaa_constants hcon, blueherondata.patient_dimension@deid pat
64    join nightherondata.patient_mapping pmap on pmap.patient_num = pat.patient_num
65    join patient_dimension epat on epat.patient_ide = pmap.patient_ide
66    where pat.age_in_years_num >= hcon.max_age_years
67    )
68  ) p
69left join dmf.dmf_table d on d.social_security_number = p.ssn
70;
71*/
72
73
74/* Check our age facts - make sure none exceed the limit
75*/
76select case when count(*) > 0 then 1/0 else 1 end deid_age_gte_agelimit_fact from(
77  with
78  code_base_age as(
79    select 'DEM|AGE:' cd from dual
80    ),
81  age_facts as(
82    select
83      to_number(substr(obs.concept_cd, length(code_base_age.cd) + 1)) age_num
84    from
85      blueherondata.observation_fact@deid obs,
86      code_base_age
87    where
88      obs.concept_cd like code_base_age.cd || '%'
89    )
90  select * from age_facts af, hipaa_constants hcon where af.age_num >= hcon.max_age_years
91  )
92;
93
94
95/* Check our age at visit facts - make sure none exceed the limit
96*/
97select case when count(*) > 0 then 1/0 else 1 end deid_ageatv_gte_agelimit_fact from(
98  with
99  code_base_ageatv as(
100    select 'DEM|AGEATV:' cd from dual
101    ),
102  age_facts as(
103    select
104      to_number(substr(obs.concept_cd, length(code_base_ageatv.cd) + 1)) age_num
105    from
106      blueherondata.observation_fact@deid obs,
107      code_base_ageatv
108    where
109      obs.concept_cd like code_base_ageatv.cd || '%'
110    )
111  select * from age_facts af, hipaa_constants hcon where af.age_num >= hcon.max_age_years
112  )
113;
114
115
116/* Make sure that ETL date minus birth date isn't greater than or equal to the
117HIPAA age limit in DEID. See #2711.
118*/
119select case when count(*) > 0 then 1/0 else 1 end deid_bd_gte_agelimit_yrs_ago from(
120  select * from (
121    select
122      pd.patient_num, pd.vital_status_cd, pd.birth_date, pd.death_date, pd.import_date,
123      months_between(pd.import_date, pd.BIRTH_DATE) / 12 calc_age
124    from blueherondata.patient_dimension@deid pd
125    where pd.vital_status_cd != 'y'
126    ), hipaa_constants hcon
127  where calc_age >= hcon.max_age_years
128  order by calc_age desc
129  )
130;
131
132
133/* Test to make sure we cannot identify the age of anyone at or over the limit
134by linking a  death fact with patient dimension and then subtracting the birthdate
135from the fact date.
136
137Most of the demographics facts are recorded with the reg_date from the
138clarity.patient table.  As per the Clarity Data Dictionary, reg_date is "The
139date on which the last patient verification occurred. If a patient was verified
140and then re-verifed at a later date, this column will show the re-verified date."
141
142There are many cases where the reg_date is later than the death date - even by
143years.
144*/
145select case when count(*) > 0 then 1/0 else 1 end deid_ssa_age_gte_agelimit from(
146  select *
147  from(
148    select
149      obs.patient_num, obs.start_date, pat.birth_date, obs.concept_cd,
150      floor(months_between(obs.start_date,pat.birth_date)/12) age_in_years
151    from
152      blueherondata.observation_fact@deid obs
153    join
154      blueherondata.patient_dimension@deid pat on pat.patient_num = obs.patient_num
155    where
156      obs.concept_cd = 'DEM|VITAL|SSA:y'
157    ), hipaa_constants hcon
158  where age_in_years >= hcon.max_age_years
159  order by age_in_years desc
160  )
161;
162
163/* Now check the vital stats from Epic.  We take SSDMF as truth for the patient
164dimension.  So if a patient was born in 1920 and Epic thought he died in 2013
165but SSDMF updated the death date to 1933, then the Epic death fact is still ok
166even though it is more than the age limit years in the future from the birth date
167(since the fact is presumed to be incorrect).
168*/
169select case when count(*) > 0 then 1/0 else 1 end deid_gte_agelimit_no_ssa_crctd from(
170  with
171  epic_vital_gte_limit as (
172    select * from(
173      select
174        obs.patient_num, obs.start_date, pat.birth_date, obs.concept_cd,
175        floor(months_between(obs.start_date,pat.birth_date)/12) age_in_years_calc
176      from
177        blueherondata.observation_fact@deid obs
178      join
179        blueherondata.patient_dimension@deid pat on pat.patient_num = obs.patient_num
180      where
181        obs.concept_cd = 'DEM|VITAL:y'
182      ), hipaa_constants hcon
183    where age_in_years_calc >= hcon.max_age_years
184    ),
185  ssdmf as (
186   select
187      obs.patient_num, obs.start_date, pat.birth_date, obs.concept_cd
188    from
189      blueherondata.observation_fact@deid obs
190    join
191      blueherondata.patient_dimension@deid pat on pat.patient_num = obs.patient_num
192    where
193      obs.concept_cd = 'DEM|VITAL|SSA:y'
194    )
195  select * from epic_vital_gte_limit ev
196  left join ssdmf on ev.patient_num = ssdmf.patient_num
197  where ssdmf.patient_num is null
198  )
199;
200
201
202/* Useful for debugging
203select p.*, d.* from(
204  select patient_ide, patient_num, replace(ssn, '-', '') ssn, epic_birth_date, birth_date_hipaa, deid_birth_date, epic_death_date, deid_death_date from(
205    select
206    pmap.patient_ide, pmap.patient_num,
207    epat.ssn, epat.birth_date epic_birth_date, idpdim.birth_date_hipaa, pat.birth_date deid_birth_date, epat.death_date epic_death_date, pat.death_date deid_death_date
208    from blueherondata.patient_dimension@deid pat
209    join nightherondata.patient_mapping pmap on pmap.patient_num = pat.patient_num
210    join patient_dimension epat on epat.patient_ide = pmap.patient_ide
211    join nightherondata.patient_dimension idpdim on idpdim.patient_num = pmap.patient_num
212    where pat.patient_num = xxxx
213    )
214  ) p
215left join dmf.dmf_table d on d.social_security_number = p.ssn
216;
217*/
218
219
220/* Make sure that birthdates of those < age limit are not changed a lot more than
221our standard de-id amount from id to deid.
222See: https://informatics.kumc.edu/work/wiki/DeIdentificationStrategy
223*/
224select case when count(*) > 0 then 1/0 else 1 end lt_agelimit_bd_same from(
225  with
226  ages as(
227    select
228      patient_num, birth_date, death_date, age_in_years_num,
229      case
230        when death_date is null then floor(months_between((select load_date from epic_dimensions_load_date),birth_date)/12)
231        else floor(months_between(death_date,birth_date)/12)
232        end age_in_years_num_calc
233    from
234      nightherondata.patient_dimension
235    ),
236  lt_limit as(
237    select * from ages, hipaa_constants hcon where age_in_years_num_calc <= hcon.max_age_years
238    )
239  select 
240    lt_limit.patient_num, lt_limit.age_in_years_num_calc, lt_limit.birth_date id_bd,
241    deid.birth_date deid_bd
242  from lt_limit
243    join blueherondata.patient_dimension@deid deid on deid.patient_num = lt_limit.patient_num
244  where floor(months_between(lt_limit.birth_date, deid.birth_date)/12) > 1
245  )
246;
Note: See TracBrowser for help on using the repository browser.