BACPAC Synthetic Data Analysis
- Prepared by:
- Qiong Liu, Center for Translational Data Science (CTDS), University of Chicago
Attribution
This notebook uses synthetic data created for the BACPAC study as part of the study entitled Back Pain Consortium (BACPAC) Research Program Data Integration, Algorithm Development and Operations Management Center. These synthetic data are for demonstration purposes only.
The purpose of this notebook is to demonstrate how data accessed through the HEAL Data Platform can be analyzed in a HEAL workspace.
The work here was conducted without direct involvement of the creator of the synthetic data, and therefore does not necessarily reflect the views or opinions of the creator, of the NIH HEAL Initiative®, or of the Center for Translational Data Science (CTDS) at the University of Chicago.
!pip install plotly kaleido==0.2.1 -q
import shutil
import pandas as pd
import kaleido
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
from pathlib import Path
import numpy as np
import json
import requests
import os
plotly.offline.init_notebook_mode()
from IPython.display import Markdown, Image, display
Query study metadata¶
Users can query study metadata in the HEAL Data Platform using our metadata service (MDS). The cell below shows how to retrieve the metadata of the BACPAC study by interacting with the Gen3 MDS endpoint.
# Query the metadata of BACPAC using the project number "1U24AR076730-01"
# through the Gen3 metadata service (MDS) endpoint.
# The timeout keeps the cell from hanging indefinitely if the service
# does not answer.
response = requests.get(
    "https://healdata.org/mds/metadata?data=True&limit=1000&gen3_discovery.project_number=1U24AR076730-01",
    timeout=60,
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
metadata_text = response.text
metadata_object = json.loads(metadata_text)
# Flatten the "gen3_discovery" section of every returned record into one
# dataframe row; records without that section are skipped.
meta_df = pd.json_normalize(
    [sub['gen3_discovery'] for sub in metadata_object.values() if 'gen3_discovery' in sub]
)
Markdown(meta_df[['research_focus_area', 'study_metadata.minimal_info.study_description', 'institutions']].transpose().to_markdown())
Pull file objects using the Gen3 SDK¶
!gen3 drs-pull object dg.H34L/80f0a338-18e0-48de-b70f-cdabd63f67d9
!gen3 drs-pull object dg.H34L/530fd95c-48b6-488e-a699-9377180bd82d
!gen3 drs-pull object dg.H34L/654d7f1f-b61c-49a9-8a74-c82400fa4c27
{"succeeded": ["dg.H34L/80f0a338-18e0-48de-b70f-cdabd63f67d9"], "failed": []}
{"succeeded": ["dg.H34L/530fd95c-48b6-488e-a699-9377180bd82d"], "failed": []}
{"succeeded": ["dg.H34L/654d7f1f-b61c-49a9-8a74-c82400fa4c27"], "failed": []}
Demographic characteristics of participants in BACPAC¶
# Read the demographic tsv file (tab-separated, UTF-8) into dataframe
demo_bacpac=pd.read_csv("./participant_SMART.tsv", sep="\t", encoding="utf-8")
# Define age groups within participants: collect the "age_in_years" column
# as a plain list so that age_group() can label every entry
age_list = list(demo_bacpac["age_in_years"])
def age_group(agelist, threshold=55):
    """Label each age as belonging to the younger or the older age group.

    Args:
        agelist: Ages in years, one per participant.
        threshold: Oldest age (inclusive) that still falls in the younger
            group. Defaults to 55.

    Returns:
        A list with one label per entry of ``agelist``, in the same order:
        ``"<min>-<threshold> yr"`` for ages at or below the threshold (where
        ``<min>`` is the smallest age in ``agelist``) and ``"><threshold> yr"``
        for all other ages. An empty input yields an empty list.
    """
    ages = list(agelist)
    if not ages:
        # min() raises ValueError on an empty sequence.
        return []
    younger_label = f"{min(ages)!s}-{threshold} yr"
    older_label = f">{threshold} yr"
    return [younger_label if age <= threshold else older_label for age in ages]
# Attach an age-group label to every participant row.
agegrouplist = age_group(age_list)
demo_bacpac["age_group"] = agegrouplist
# Cross-tabulate each demographic factor against sex, giving one
# frequency table per factor.
df1, df2, df3 = (
    pd.crosstab(index=demo_bacpac[factor], columns=demo_bacpac['sex'])
    for factor in ('race', 'ethnicity', 'age_group')
)
# Stack the three tables under one another, keyed by factor, and render
# the result as a markdown table.
Markdown(pd.concat([df1, df2, df3], keys=['race', 'ethnicity', 'age_group']).to_markdown())
# Generate a stacked bar chart of participants in BACPAC
# Reshape the ethnicity-by-sex frequency table into long form: one row per
# (ethnicity, sex) pair with its count in the "Count" column.
new_df2 = (
    df2.stack()
    .reset_index(name="Count")
    .rename(columns={"sex": "Sex", "ethnicity": "Ethnicity"})
)
# px.bar returns a complete figure, so the empty go.Figure() that used to be
# assigned first (and immediately overwritten) is dropped.
fig1 = px.bar(new_df2, x="Sex", y="Count", color="Ethnicity",
              title= "Ethnicity and Sex Characteristics of Participants in the BACPAC Study",
              width= 800, height = 500)
fig1.show()
Opioid pain medication profiling at two time points¶
# Read substance use tsv file (tab-separated, UTF-8) into dataframe
substance_df = pd.read_csv("./substance_use_SMART.tsv", sep="\t", encoding="utf-8")
# Combine substance use df and demographic df based on participant id;
# the participant id is derived from each record's "submitter_id"
def find_participant(mydf, endstr):
    """Derive participant ids by removing a trailing suffix from record ids.

    Args:
        mydf: Table-like object whose ``"submitter_id"`` column holds the
            record ids as strings.
        endstr: Suffix to remove from the end of each id, e.g. ``"_sc"``.

    Returns:
        A list of ids in the same order as the column, each with ``endstr``
        removed once from its end. Ids that do not end in ``endstr`` are
        returned unchanged.
    """
    participant_id = []
    for record_id in mydf["submitter_id"]:
        # str.rstrip(endstr) treats endstr as a *set of characters* and keeps
        # stripping (e.g. "abc_sc" -> "ab"), so cut the exact suffix instead.
        # The emptiness check avoids record_id[:-0], which would be "".
        if endstr and record_id.endswith(endstr):
            record_id = record_id[:-len(endstr)]
        participant_id.append(record_id)
    return participant_id
# Derive each record's participant id from its submitter_id, using "_sc" as the ending to remove
substance_participant_id = find_participant(substance_df,"_sc")
substance_df["participant_id"] = substance_participant_id
# Outer join: rows from either table are kept even when they have no match in
# the other table, so unmatched rows carry missing values in the other table's columns
demo_combine_substance = substance_df.merge(demo_bacpac, left_on="participant_id",
right_on="submitter_id", how="outer")
# Add one property of time point in the df
def find_timepoint(mydf):
    """Label each record with the study time point of its visit.

    Args:
        mydf: Table-like object whose ``"visits.submitter_id"`` column holds
            the visit identifiers.

    Returns:
        A list with one entry per row, in order: ``"Week 0"`` when the visit
        identifier ends with ``"Week 0"``, ``"Week 12"`` for any other string,
        and ``None`` when the identifier is not a string (for example the NaN
        that an outer merge leaves in rows without a visit record).
    """
    timepoint = []
    for visit_id in mydf["visits.submitter_id"]:
        if not isinstance(visit_id, str):
            # Missing values have no .endswith(); leave the time point unset
            # rather than raising AttributeError.
            timepoint.append(None)
        elif visit_id.endswith("Week 0"):
            timepoint.append("Week 0")
        else:
            timepoint.append("Week 12")
    return timepoint
# Label every merged row with its study time point.
demo_combine_substance["time_point"] = find_timepoint(demo_combine_substance)
# Compute a frequency table using opioid medication factor and time point factor
opioid_crosstab = pd.crosstab(index=demo_combine_substance['OPIOID01'],
                              columns=demo_combine_substance['time_point'])
# Reshape to long form: one row per (OPIOID01, time point) pair with its count.
new_opioid = (
    opioid_crosstab.stack()
    .reset_index(name="Count")
    .rename(columns={"OPIOID01": "Taking Opioid", "time_point": "Time Point"})
)
# px.bar returns a complete figure, so the empty go.Figure() that used to be
# assigned first (and immediately overwritten) is dropped.
fig2 = px.bar(new_opioid, x="Taking Opioid", y="Count", color="Taking Opioid",
              facet_row="Time Point", width=800, height=400)
fig2.update_layout(title_text="Self-Report of Opioid Pain Medication Use at Baseline and Twelve Weeks",title_font_size=20)
# Give every trace of the figure the same bar width.
for trace in fig2.data:
    trace["width"] = 0.6
fig2.show()
We observed an increase in the number of participants taking opioid pain medication at the week 12 time point compared to baseline.
# Generate a bar chart showing the opioid taking at two time points in different sex groups
# Frequency table: rows are (OPIOID01, sex) pairs, columns are time points.
opioid_gender = pd.crosstab(index=[demo_combine_substance['OPIOID01'], demo_combine_substance['sex']],
                            columns=demo_combine_substance['time_point'])
# Reshape to long form: one row per (OPIOID01, sex, time point) with its count.
new_opioid_gender = (
    opioid_gender.stack()
    .reset_index(name="Count")
    .rename(columns={"OPIOID01": "Taking Opioid",
                     "time_point": "Time Point", "sex": "Sex"})
)
# px.bar returns a complete figure, so the empty go.Figure() that used to be
# assigned first (and immediately overwritten) is dropped.
fig3 = px.bar(new_opioid_gender, y="Sex", x="Count", color="Taking Opioid",
              facet_col="Time Point", width=800, height=400, orientation='h',
              category_orders={"Sex": ["Intersex", "Unknown", "Male", "Female"]})
fig3.update_layout(title_text="Opioid Pain Medication at Two Time Points in Different Sex Groups", title_font_size=20)
fig3.show()
We observed an increase in the number of participants taking opioid medication at week 12 in both male and female groups compared to baseline week 0.
Physical function outcomes¶
The cell below uses the Physical Function 6b T-Score to display physical function outcomes in different ethnicity groups at week 0 and week 12.
# Read physical_function_SMART.tsv into dataframe and merge the df with demographic
# NOTE(review): this file is read as UTF-16 while the other tsv files are read
# as UTF-8 -- confirm this matches the actual encoding of the file
function_df = pd.read_csv("./physical_function_SMART.tsv", sep="\t", encoding="utf-16")
# Derive each record's participant id from its submitter_id, using "_pf" as the ending to remove
function_participant_id = find_participant(function_df, "_pf")
function_df["participant_id"] = function_participant_id
# Outer join: rows without a match in the other table are kept and carry
# missing values in the other table's columns
demo_combine_function = function_df.merge(demo_bacpac, left_on="participant_id",
right_on="submitter_id", how="outer")
demo_combine_function["time_point"] = find_timepoint(demo_combine_function)
# Summary table of PROMIS-Physical Function 6b T-Score in different ethnicity groups:
# descriptive statistics of PRPF6BT for every (time point, ethnicity) combination
ethnicity_PRPF6BT = demo_combine_function[["time_point",
"PRPF6BT",
"ethnicity"]].groupby(['time_point','ethnicity']).describe()
Markdown(ethnicity_PRPF6BT.to_markdown())
# Visualize the distribution of Physical Function 6b T-Score
# at two time points for hispanic and non-hispanic ethnicity groups
# make_subplots creates the figure itself, so the empty go.Figure() that used
# to be assigned first (and immediately overwritten) is dropped.
# Layout: one full-width panel on top for all participants and one panel per
# ethnicity group underneath.
fig4 = make_subplots(
    rows=2, cols=2,
    specs=[[{"colspan": 2}, None],
           [{}, {}]],
    subplot_titles=("PROMIS-Physical Function 6b T-Score Distribution at Two Time Points","Hispanic or Latino",
                    "Not Hispanic or Latino"))

# Marker colour per time point, shared by all panels.
time_point_colors = {"Week 0": '#EB89B5', "Week 12": '#2B6CBE'}
# One entry per panel:
# (ethnicity filter, or None for all participants; subplot row; subplot column; y-axis upper limit)
panels = [
    (None, 1, 1, 40),
    ("Hispanic or Latino", 2, 1, 15),
    ("Not Hispanic or Latino", 2, 2, 15),
]

# Add the Week 0 and Week 12 histograms to every panel, in that order.
for ethnicity, row, col, y_max in panels:
    for time_point, color in time_point_colors.items():
        selected = demo_combine_function["time_point"] == time_point
        if ethnicity is None:
            # Only the top panel's traces are named; they supply the legend entries.
            legend_options = {"name": time_point}
        else:
            selected = selected & (demo_combine_function["ethnicity"] == ethnicity)
            legend_options = {"showlegend": False}
        fig4.add_trace(
            go.Histogram(x=demo_combine_function[selected]["PRPF6BT"],
                         marker_color=color, opacity=0.75, nbinsx=20,
                         **legend_options),
            row=row, col=col)

# Overlay (rather than stack) the two time points within each panel.
fig4.update_layout(barmode='overlay', width=800, height=500,legend_title_text='Time Point')
fig4.update_layout(margin=dict(l=20, r=20, t=50, b=20, pad=2))

# Same axis titles and x-range on every panel; only the y-range differs.
for ethnicity, row, col, y_max in panels:
    fig4.update_yaxes(title_text="Count",
                      title_font_size=15, range=[0, y_max], row=row, col=col)
    fig4.update_xaxes(title_text="PROMIS-Physical Function 6b T-Score",
                      title_font_size=15, range=[29, 49], row=row, col=col)
fig4.show()
- Anstrom, K. J., IVANOVA, A., & LAVANGE, L. (2024). Back Pain Consortium (BACPAC) Research Program Data Integration, Algorithm Development and Operations Management Center. HEAL Data Platform. 10.60490/HDP00258