import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, f1_score)
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import warnings
# Suppress convergence/deprecation warnings for cleaner tutorial output;
# in production, review all warnings before suppressing.
warnings.filterwarnings('ignore')

# Set visualization style for professional graphs
sns.set_theme(style="whitegrid")
custom_palette = ['#232D4B', '#E57200']  
sns.set_palette(custom_palette)

print("Libraries loaded and visualization style configured.")

Libraries loaded and visualization style configured.

# =============================================================================
# 1. EXTRACTION: Download 2021-2023 SAS XPT files directly from the CDC
# =============================================================================
base_url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/"

print("Downloading 8 NHANES 2021-2023 component tables from CDC...")
df_demo     = pd.read_sas(base_url + 'DEMO_L.xpt')     # Demographics
df_body     = pd.read_sas(base_url + 'BMX_L.xpt')      # Body Measures
df_sleep    = pd.read_sas(base_url + 'SLQ_L.xpt')      # Sleep
df_activity = pd.read_sas(base_url + 'PAQ_L.xpt')      # Physical Activity
df_depr     = pd.read_sas(base_url + 'DPQ_L.xpt')      # Depression Screener (PHQ-9)
df_crp      = pd.read_sas(base_url + 'HSCRP_L.xpt')    # hs-CRP (Inflammation Lab)
df_alcohol  = pd.read_sas(base_url + 'ALQ_L.xpt')      # Alcohol Use (NEW)
df_food     = pd.read_sas(base_url + 'FSQ_L.xpt')      # Food Security (NEW)

print(f"  Demographics:    {df_demo.shape[0]:,} records")
print(f"  Body Measures:   {df_body.shape[0]:,} records")
print(f"  Sleep:           {df_sleep.shape[0]:,} records")
print(f"  Physical Activity: {df_activity.shape[0]:,} records")
print(f"  Depression (PHQ): {df_depr.shape[0]:,} records")
print(f"  hs-CRP Lab:      {df_crp.shape[0]:,} records")
print(f"  Alcohol Use:     {df_alcohol.shape[0]:,} records")
print(f"  Food Security:   {df_food.shape[0]:,} records")

# =============================================================================
# 2. TRANSFORM: Merge all 8 datasets on the unique respondent ID (SEQN)
# =============================================================================
df = df_demo.copy()
for table in [df_body, df_sleep, df_activity, df_depr, df_crp, df_alcohol, df_food]:
    df = pd.merge(df, table, on='SEQN', how='inner')

print(f"\nAfter inner join across all 8 tables: {df.shape[0]:,} records")

# =============================================================================
# 3. TRANSFORM: Compute PHQ-9 Depression Score
# =============================================================================
# PHQ-9 consists of 9 items (DPQ010-DPQ090), each scored 0-3.
# Codes 7 (refused) and 9 (don't know) are treated as missing.
phq_cols = [f'DPQ0{i}0' for i in range(1, 10)]
df[phq_cols] = df[phq_cols].replace([7, 9], np.nan)
df['PHQ9_Score'] = df[phq_cols].sum(axis=1)

# =============================================================================
# 4. TRANSFORM: Rename variables, create features, filter adults
# =============================================================================
col_mapping = {
    'SEQN': 'ID',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'BMXBMI': 'BMI',
    'SLD012': 'Sleep_Hours',
    'PAD820': 'Vigorous_Activity_Min',
    'INDFMPIR': 'Income_Poverty_Ratio',
    'LBXHSCRP': 'CRP_mgL',
    'ALQ130': 'Alcohol_Drinks_Day',      # NEW: Average alcoholic drinks per day
    'FSDAD': 'Food_Security_Category'     # NEW: Adult food security (1=Full, 2=Marginal, 3=Low, 4=Very Low)
}

keep_cols = list(col_mapping.values()) + ['PHQ9_Score']
df_clean = df.rename(columns=col_mapping)[keep_cols]

# Filter to adults aged 20+
df_clean = df_clean[df_clean['Age'] >= 20]

# Map Gender to readable labels
df_clean['Gender'] = df_clean['Gender'].map({1.0: 'Male', 2.0: 'Female'})

# Create binary depression target: PHQ-9 >= 10 is the clinical cutoff
df_clean['Depressed'] = df_clean['PHQ9_Score'].apply(
    lambda x: 1 if x >= 10 else 0 if pd.notnull(x) else np.nan
)

# Create binary food insecurity indicator (Low or Very Low security)
df_clean['Food_Insecure'] = df_clean['Food_Security_Category'].apply(
    lambda x: 1 if x >= 3 else 0 if pd.notnull(x) else np.nan
)

# =============================================================================
# 5. LOAD: Handle missing values and set data types
# =============================================================================
# Drop rows missing the target variable
df_clean = df_clean.dropna(subset=['PHQ9_Score'])

# Clean special codes: ALQ130 uses 777=Refused, 999=Don't know
df_clean['Alcohol_Drinks_Day'] = df_clean['Alcohol_Drinks_Day'].replace([777, 999], np.nan)

# Skip-logic NaN fills: 0 activity = didn't exercise; 0 drinks = doesn't drink
df_clean['Vigorous_Activity_Min'] = df_clean['Vigorous_Activity_Min'].fillna(0)
df_clean['Alcohol_Drinks_Day'] = df_clean['Alcohol_Drinks_Day'].fillna(0)

# Median imputation for remaining continuous features
for col in ['BMI', 'Sleep_Hours', 'Income_Poverty_Ratio', 'CRP_mgL']:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Mode imputation for categorical food security
df_clean['Food_Insecure'] = df_clean['Food_Insecure'].fillna(0)
df_clean['Food_Security_Category'] = df_clean['Food_Security_Category'].fillna(
    df_clean['Food_Security_Category'].mode()[0]
)

# Set logical data types
df_clean = df_clean.astype({
    'ID': 'int64', 'Age': 'int64',
    'Sleep_Hours': 'float64', 'Vigorous_Activity_Min': 'float64',
    'CRP_mgL': 'float64', 'Alcohol_Drinks_Day': 'float64',
    'PHQ9_Score': 'int64', 'Depressed': 'int64', 'Food_Insecure': 'int64'
})

print(f"\nFinal analytic dataset: {df_clean.shape[0]:,} adults, {df_clean.shape[1]} variables")
print(f"\n--- Data Types ---")
print(df_clean.dtypes)
print(f"\n--- First 5 Rows ---")
display(df_clean.head())
print(f"\n--- Descriptive Statistics ---")
display(df_clean.describe().round(2))

Downloading 8 NHANES 2021-2023 component tables from CDC...
  Demographics:    11,933 records
  Body Measures:   8,860 records
  Sleep:           8,501 records
  Physical Activity: 8,153 records
  Depression (PHQ): 6,337 records
  hs-CRP Lab:      8,727 records
  Alcohol Use:     6,337 records
  Food Security:   11,933 records

After inner join across all 8 tables: 6,337 records

Final analytic dataset: 6,064 adults, 13 variables

--- Data Types ---
ID                          int64
Gender                     object
Age                         int64
BMI                       float64
Sleep_Hours               float64
Vigorous_Activity_Min     float64
Income_Poverty_Ratio      float64
CRP_mgL                   float64
Alcohol_Drinks_Day        float64
Food_Security_Category    float64
PHQ9_Score                  int64
Depressed                   int64
Food_Insecure               int64
dtype: object

--- First 5 Rows ---

--- Descriptive Statistics ---

# =============================================================================
# EDA Stage 1: Summary Statistics
# =============================================================================
print("=" * 70)
print("SUMMARY STATISTICS")
print("=" * 70)

# 1. Depression base rate (establishes class imbalance)
dep_rate = df_clean['Depressed'].mean() * 100
print(f"\n1. Depression prevalence (PHQ-9 >= 10): {dep_rate:.1f}%")

# 2. Median vigorous activity across cohort
med_activity = df_clean['Vigorous_Activity_Min'].median()
print(f"2. Median vigorous activity (min/day):   {med_activity:.1f}")

# 3. Median income-to-poverty ratio
med_income = df_clean['Income_Poverty_Ratio'].median()
print(f"3. Median income-to-poverty ratio:       {med_income:.2f}")

# 4. Food insecurity rate
food_insec_rate = df_clean['Food_Insecure'].mean() * 100
print(f"4. Food insecurity prevalence:            {food_insec_rate:.1f}%")

# 5. Median alcohol consumption
med_alcohol = df_clean['Alcohol_Drinks_Day'].median()
mean_alcohol = df_clean['Alcohol_Drinks_Day'].mean()
print(f"5. Median alcohol drinks/day:             {med_alcohol:.1f} (mean: {mean_alcohol:.2f})")

# 6-7. Activity by depression status
activity_by_dep = df_clean.groupby('Depressed')['Vigorous_Activity_Min'].mean()
print(f"6. Mean vigorous activity — NOT depressed: {activity_by_dep[0]:.1f} min/day")
print(f"7. Mean vigorous activity — DEPRESSED:     {activity_by_dep[1]:.1f} min/day")

======================================================================
SUMMARY STATISTICS
======================================================================

1. Depression prevalence (PHQ-9 >= 10): 11.0%
2. Median vigorous activity (min/day):   0.0
3. Median income-to-poverty ratio:       2.82
4. Food insecurity prevalence:            18.2%
5. Median alcohol drinks/day:             1.0 (mean: 1.69)
6. Mean vigorous activity — NOT depressed: 37.8 min/day
7. Mean vigorous activity — DEPRESSED:     66.8 min/day

# =============================================================================
# EDA Stage 1: Correlation Heatmap
# =============================================================================
plt.figure(figsize=(10, 8))

numeric_cols = ['Age', 'BMI', 'Sleep_Hours', 'Vigorous_Activity_Min',
                'Income_Poverty_Ratio', 'CRP_mgL', 'Alcohol_Drinks_Day',
                'Food_Insecure', 'PHQ9_Score']
corr_matrix = df_clean[numeric_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            vmax=0.3, vmin=-0.3, square=True, linewidths=.5,
            cbar_kws={"shrink": .8})
plt.title('Graph 1: Correlation Heatmap — All Features vs. PHQ-9 Score (2021-2023)',
          fontsize=13, pad=20)
plt.tight_layout()
plt.show()

# =============================================================================
# EDA Stage 2: Four-Panel Visualization
# =============================================================================
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Exploratory Data Analysis: Distributions and Relationships',
             fontsize=16, fontweight='bold', y=1.02)

# Graph 2: Distribution of PHQ-9 Scores
# Question: How is the target variable distributed? Is binary cutoff appropriate?
sns.histplot(data=df_clean, x='PHQ9_Score', bins=27, kde=True,
             color='#232D4B', ax=axes[0, 0])
axes[0, 0].axvline(x=10, color='#E57200', linestyle='--', linewidth=2, label='Clinical Cutoff (≥10)')
axes[0, 0].legend()
axes[0, 0].set_title('Graph 2: Distribution of PHQ-9 Depression Scores', fontsize=12)
axes[0, 0].set_xlabel('PHQ-9 Score')
axes[0, 0].set_ylabel('Count of Individuals')

# Graph 3: Depression Rate by Gender
# Question: Does depression prevalence differ by gender?
sns.barplot(data=df_clean, x='Gender', y='Depressed', errorbar=None,
            palette=custom_palette, hue='Gender', legend=False, ax=axes[0, 1])
axes[0, 1].set_title('Graph 3: Clinical Depression Rate by Gender', fontsize=12)
axes[0, 1].set_ylabel('Proportion Depressed (PHQ-9 ≥ 10)')

# Graph 4: Food Insecurity Rate by Depression Status
# Question: Does food insecurity differ sharply between depressed and non-depressed cohorts?
food_dep = df_clean.groupby('Depressed')['Food_Insecure'].mean().reset_index()
sns.barplot(data=food_dep, x='Depressed', y='Food_Insecure',
            palette=custom_palette, hue='Depressed', legend=False, ax=axes[1, 0])
axes[1, 0].set_title('Graph 4: Food Insecurity Rate by Depression Status', fontsize=12)
axes[1, 0].set_xticks([0, 1])
axes[1, 0].set_xticklabels(['Not Depressed', 'Depressed'])
axes[1, 0].set_xlabel('Depression Status')
axes[1, 0].set_ylabel('Proportion Food Insecure')

# Graph 5: Systemic Inflammation (CRP) by Depression Status
# Question: Is there objective biomarker evidence of depression-related inflammation?
df_vis = df_clean[df_clean['CRP_mgL'] <= 15]  # Cap at 15 mg/L to reduce outlier distortion
sns.violinplot(data=df_vis, x='Depressed', y='CRP_mgL', palette=custom_palette,
               hue='Depressed', legend=False, inner="quartile", ax=axes[1, 1])
axes[1, 1].set_title('Graph 5: Systemic Inflammation (CRP) by Depression Status', fontsize=12)
axes[1, 1].set_xticks([0, 1])
axes[1, 1].set_xticklabels(['Not Depressed', 'Depressed'])
axes[1, 1].set_xlabel('Depression Status')
axes[1, 1].set_ylabel('C-Reactive Protein (mg/L)')

plt.tight_layout()
plt.show()

# =============================================================================
# EDA Stage 2 (continued): New Predictor Visualizations
# =============================================================================
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Graph 6: Alcohol Consumption by Depression Status
sns.boxplot(data=df_clean, x='Depressed', y='Alcohol_Drinks_Day',
            palette=custom_palette, hue='Depressed', legend=False, ax=axes[0])
axes[0].set_title('Graph 6: Alcohol Consumption by Depression Status', fontsize=12)
axes[0].set_xticks([0, 1])
axes[0].set_xticklabels(['Not Depressed', 'Depressed'])
axes[0].set_xlabel('Depression Status')
axes[0].set_ylabel('Avg Alcoholic Drinks / Day')
axes[0].set_ylim(0, 10)  # Cap visualization to reduce extreme outlier distortion

# Graph 7: BMI Distribution by Depression Status
sns.boxplot(data=df_clean, x='Depressed', y='BMI', palette=custom_palette,
            hue='Depressed', legend=False, ax=axes[1])
axes[1].set_title('Graph 7: BMI Distribution by Depression Status', fontsize=12)
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['Not Depressed', 'Depressed'])
axes[1].set_xlabel('Depression Status')
axes[1].set_ylabel('Body Mass Index (BMI)')

# Graph 8: Sleep Duration by Depression Status
sns.violinplot(data=df_clean, x='Depressed', y='Sleep_Hours',
               palette=custom_palette, hue='Depressed', legend=False,
               inner="quartile", ax=axes[2])
axes[2].set_title('Graph 8: Sleep Duration by Depression Status', fontsize=12)
axes[2].set_xticks([0, 1])
axes[2].set_xticklabels(['Not Depressed', 'Depressed'])
axes[2].set_xlabel('Depression Status')
axes[2].set_ylabel('Sleep Duration (hours)')

plt.tight_layout()
plt.show()

print("Skewness check (>1 = right-skewed, supports non-parametric test):")
print(f"  CRP_mgL:            skew = {df_clean['CRP_mgL'].skew():.2f}")
print(f"  Alcohol_Drinks_Day: skew = {df_clean['Alcohol_Drinks_Day'].skew():.2f}")
print(f"  Sleep_Hours:        skew = {df_clean['Sleep_Hours'].skew():.2f}")
print()
print("→ CRP and Alcohol are heavily right-skewed; Mann-Whitney U is appropriate.")
print("→ Sleep is approximately symmetric; t-test would also be valid, but Mann-Whitney is used for consistency.")
print()

Skewness check (>1 = right-skewed, supports non-parametric test):
  CRP_mgL:            skew = 8.04
  Alcohol_Drinks_Day: skew = 2.57
  Sleep_Hours:        skew = -0.09

→ CRP and Alcohol are heavily right-skewed; Mann-Whitney U is appropriate.
→ Sleep is approximately symmetric; t-test would also be valid, but Mann-Whitney is used for consistency.

# =============================================================================
# EDA Stage 3: Statistical Hypothesis Testing
# =============================================================================
print("=" * 70)
print("EDA Stage 3: STATISTICAL TESTS: Depressed vs. Non-Depressed Cohort Comparisons")
print("=" * 70)

# Mann-Whitney U test for CRP (non-normal, continuous)
crp_nd = df_clean[df_clean['Depressed'] == 0]['CRP_mgL'].dropna()
crp_d  = df_clean[df_clean['Depressed'] == 1]['CRP_mgL'].dropna()
stat_crp, p_crp = stats.mannwhitneyu(crp_d, crp_nd, alternative='greater')
print(f"\n1. CRP (inflammation):")
print(f"   Mann-Whitney U = {stat_crp:,.0f}, p = {p_crp:.2e}")
print(f"   {'*** SIGNIFICANT' if p_crp < 0.05 else 'Not significant'}: Depressed cohort has {'higher' if p_crp < 0.05 else 'similar'} inflammation.")

# Mann-Whitney U test for Alcohol
alc_nd = df_clean[df_clean['Depressed'] == 0]['Alcohol_Drinks_Day'].dropna()
alc_d  = df_clean[df_clean['Depressed'] == 1]['Alcohol_Drinks_Day'].dropna()
stat_alc, p_alc = stats.mannwhitneyu(alc_d, alc_nd, alternative='greater')
print(f"\n2. Alcohol (drinks/day):")
print(f"   Mann-Whitney U = {stat_alc:,.0f}, p = {p_alc:.4f}")
print(f"    {'*** SIGNIFICANT' if p_alc < 0.05 else 'Not significant'}: Depressed cohort has higher average alcohol consumption.")

# Chi-squared test for Food Insecurity (categorical)
contingency = pd.crosstab(df_clean['Depressed'], df_clean['Food_Insecure'])
chi2, p_chi, dof, expected = stats.chi2_contingency(contingency)
print(f"\n3. Food Insecurity (categorical):")
print(f"   Chi-squared = {chi2:.1f}, df = {dof}, p = {p_chi:.2e}")
print(f"    {'*** SIGNIFICANT' if p_chi < 0.05 else 'Not significant'}: Depressed cohort has higher rates of food insecurity (40.1% vs. 15.4%).")

# Mann-Whitney U test for Sleep
sleep_nd = df_clean[df_clean['Depressed'] == 0]['Sleep_Hours'].dropna()
sleep_d  = df_clean[df_clean['Depressed'] == 1]['Sleep_Hours'].dropna()
stat_slp, p_slp = stats.mannwhitneyu(sleep_d, sleep_nd, alternative='two-sided')
print(f"\n4. Sleep Duration (two-sided test):")
print(f"   Mann-Whitney U = {stat_slp:,.0f}, p = {p_slp:.4f}")
print(f"    {'*** SIGNIFICANT' if p_slp < 0.05 else 'Not significant'}: Depressed cohort has significantly different sleep duration.")

======================================================================
EDA Stage 3: STATISTICAL TESTS: Depressed vs. Non-Depressed Cohort Comparisons
======================================================================

1. CRP (inflammation):
   Mann-Whitney U = 2,009,218, p = 1.11e-06
   *** SIGNIFICANT: Depressed cohort has higher inflammation.

2. Alcohol (drinks/day):
   Mann-Whitney U = 2,075,518, p = 0.0000
    *** SIGNIFICANT: Depressed cohort has higher average alcohol consumption.

3. Food Insecurity (categorical):
   Chi-squared = 243.0, df = 1, p = 8.83e-55
    *** SIGNIFICANT: Depressed cohort has higher rates of food insecurity (40.1% vs. 15.4%).

4. Sleep Duration (two-sided test):
   Mann-Whitney U = 1,639,981, p = 0.0001
    *** SIGNIFICANT: Depressed cohort has significantly different sleep duration.

# =============================================================================
# EDA Stage 3: Cross-Tabulation Summary
# =============================================================================
print("\n" + "=" * 70)
print("CROSS-TABULATION: Mean Feature Values by Depression Status")
print("=" * 70 + "\n")

cross_tab = df_clean.groupby('Depressed').agg(
    N=('ID', 'count'),
    Avg_Age=('Age', 'mean'),
    Avg_BMI=('BMI', 'mean'),
    Avg_Sleep=('Sleep_Hours', 'mean'),
    Avg_Activity=('Vigorous_Activity_Min', 'mean'),
    Avg_CRP=('CRP_mgL', 'mean'),
    Avg_Alcohol=('Alcohol_Drinks_Day', 'mean'),
    Avg_Income_Ratio=('Income_Poverty_Ratio', 'mean'),
    Pct_Food_Insecure=('Food_Insecure', 'mean'),
    Pct_Female=('Gender', lambda x: (x == 'Female').mean())
).round(3)

cross_tab.index = ['Not Depressed (0)', 'Depressed (1)']
display(cross_tab.T)

======================================================================
CROSS-TABULATION: Mean Feature Values by Depression Status
======================================================================

# =============================================================================
# MODEL 1: Binary Classification of Clinical Depression
# =============================================================================
print("=" * 70)
print("MODEL 1: Binary Classification — Logistic Regression vs. Random Forest")
print("=" * 70)

# Prepare features and target
feature_cols = ['Age', 'BMI', 'Sleep_Hours', 'Vigorous_Activity_Min',
                'Income_Poverty_Ratio', 'CRP_mgL', 'Alcohol_Drinks_Day', 'Food_Insecure']

# Encode Gender as binary
df_model = df_clean.copy()
df_model['Gender_Female'] = (df_model['Gender'] == 'Female').astype(int)
feature_cols_full = feature_cols + ['Gender_Female']

X = df_model[feature_cols_full]
y = df_model['Depressed']

# Train-test split (80/20, stratified to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print(f"\nTraining set: {X_train.shape[0]:,} samples ({y_train.mean()*100:.1f}% positive)")
print(f"Test set:     {X_test.shape[0]:,} samples ({y_test.mean()*100:.1f}% positive)")

# Scale features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Logistic Regression (linear baseline) ---
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]

print("\n--- Logistic Regression ---")
print(classification_report(y_test, y_pred_lr, target_names=['Not Depressed', 'Depressed']))
auc_lr = roc_auc_score(y_test, y_prob_lr)
print(f"ROC-AUC: {auc_lr:.3f}")

# --- Random Forest (non-linear) ---
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced',
                            max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("\n--- Random Forest ---")
print(classification_report(y_test, y_pred_rf, target_names=['Not Depressed', 'Depressed']))
auc_rf = roc_auc_score(y_test, y_prob_rf)
print(f"ROC-AUC: {auc_rf:.3f}")

# =============================================================================
# Cross-Validation: Verify results are not sensitive to the train/test split
# =============================================================================
# A single 80/20 split could produce optimistic or pessimistic results by chance.
# Stratified 5-fold CV repeats the evaluation across 5 non-overlapping folds,
# each preserving the ~11% depression base rate.

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression CV (requires scaling inside each fold via pipeline)
from sklearn.pipeline import Pipeline
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])
cv_scores_lr = cross_val_score(lr_pipeline, X, y, cv=cv, scoring='roc_auc')

# Random Forest CV (no scaling needed)
cv_scores_rf = cross_val_score(rf, X, y, cv=cv, scoring='roc_auc')

print("\n" + "=" * 70)
print("CROSS-VALIDATION: 5-Fold Stratified ROC-AUC")
print("=" * 70)
print(f"\nLogistic Regression:  {cv_scores_lr.mean():.3f} ± {cv_scores_lr.std():.3f}  (folds: {', '.join(f'{s:.3f}' for s in cv_scores_lr)})")
print(f"Random Forest:        {cv_scores_rf.mean():.3f} ± {cv_scores_rf.std():.3f}  (folds: {', '.join(f'{s:.3f}' for s in cv_scores_rf)})")
print(f"\nLogistic Regression {'outperforms' if cv_scores_lr.mean() > cv_scores_rf.mean() else 'underperforms'} Random Forest across all folds.")
print(f"The narrow standard deviation (±{cv_scores_lr.std():.3f}) confirms the 80/20 holdout result is stable.")

======================================================================
MODEL 1: Binary Classification — Logistic Regression vs. Random Forest
======================================================================

Training set: 4,851 samples (11.0% positive)
Test set:     1,213 samples (11.0% positive)

--- Logistic Regression ---
               precision    recall  f1-score   support

Not Depressed       0.94      0.72      0.82      1079
    Depressed       0.22      0.66      0.33       134

     accuracy                           0.71      1213
    macro avg       0.58      0.69      0.58      1213
 weighted avg       0.86      0.71      0.76      1213

ROC-AUC: 0.725

--- Random Forest ---
               precision    recall  f1-score   support

Not Depressed       0.90      0.95      0.92      1079
    Depressed       0.22      0.12      0.16       134

     accuracy                           0.86      1213
    macro avg       0.56      0.53      0.54      1213
 weighted avg       0.82      0.86      0.84      1213

ROC-AUC: 0.674

======================================================================
CROSS-VALIDATION: 5-Fold Stratified ROC-AUC
======================================================================

Logistic Regression:  0.710 ± 0.009  (folds: 0.704, 0.700, 0.719, 0.723, 0.703)
Random Forest:        0.684 ± 0.017  (folds: 0.694, 0.674, 0.703, 0.657, 0.694)

Logistic Regression outperforms Random Forest across all folds.
The narrow standard deviation (±0.009) confirms the 80/20 holdout result is stable.

# =============================================================================
# MODEL 1: ROC Curves and Feature Importance
# =============================================================================
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# ROC Curves
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)

axes[0].plot(fpr_lr, tpr_lr, color='#232D4B', lw=2, label=f'Logistic Regression (AUC={auc_lr:.3f})')
axes[0].plot(fpr_rf, tpr_rf, color='#E57200', lw=2, label=f'Random Forest (AUC={auc_rf:.3f})')
axes[0].plot([0, 1], [0, 1], 'k--', lw=1, label='Random Chance')
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('Graph 9: ROC Curves — Depression Classification', fontsize=12)
axes[0].legend(loc='lower right')

# Feature Importance (Random Forest)
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=feature_cols_full).sort_values(ascending=True)
feat_imp.plot(kind='barh', color='#232D4B', ax=axes[1])
axes[1].set_title('Graph 10: Random Forest Feature Importance', fontsize=12)
axes[1].set_xlabel('Importance (Gini)')

plt.tight_layout()
plt.show()

# Logistic Regression Coefficients (Odds Ratios)
print("\n--- Logistic Regression: Odds Ratios ---")
odds_ratios = pd.DataFrame({
    'Feature': feature_cols_full,
    'Coefficient': lr.coef_[0],
    'Odds_Ratio': np.exp(lr.coef_[0])
}).sort_values('Odds_Ratio', ascending=False)
display(odds_ratios.round(3))

--- Logistic Regression: Odds Ratios ---

# =============================================================================
# MODEL 1: Logistic Regression — Odds Ratio Forest Plot with 95% CIs
# =============================================================================
# To produce CIs that are exactly calibrated to the sklearn model (which uses
# L2 regularization and balanced class weights), we compute standard errors
# directly from the model's Hessian matrix rather than refitting with statsmodels.

# Compute balanced class weights (same formula sklearn uses internally)
n_samples = len(y_train)
n_classes = 2
class_counts = np.bincount(y_train.astype(int))
w0 = n_samples / (n_classes * class_counts[0])
w1 = n_samples / (n_classes * class_counts[1])
sample_wts = np.where(y_train == 1, w1, w0)

# Compute standard errors from the weighted Hessian of the logistic likelihood
p_hat = lr.predict_proba(X_train_scaled)[:, 1]
W = p_hat * (1 - p_hat) * sample_wts
X_const = np.column_stack([np.ones(X_train_scaled.shape[0]), X_train_scaled])
H = X_const.T @ (X_const * W[:, np.newaxis])  # Hessian: X'WX
cov_matrix = np.linalg.inv(H)
se_all = np.sqrt(np.diag(cov_matrix))
se_coefs = se_all[1:]  # drop intercept

# Build odds ratio table from sklearn coefficients + Hessian-derived CIs
sklearn_coefs = lr.coef_[0]
z_scores = sklearn_coefs / se_coefs
p_values = 2 * (1 - stats.norm.cdf(np.abs(z_scores)))

odds_ratios = pd.DataFrame({
    'Feature': feature_cols_full,
    'Coefficient': sklearn_coefs,
    'Odds_Ratio': np.exp(sklearn_coefs),
    'CI_low': np.exp(sklearn_coefs - 1.96 * se_coefs),
    'CI_high': np.exp(sklearn_coefs + 1.96 * se_coefs),
    'p_value': p_values
}).sort_values('Odds_Ratio', ascending=True)

odds_ratios['Significant'] = odds_ratios['p_value'] < 0.05

# Clean feature labels
label_map = {
    'Food_Insecure': 'Food Insecure', 'Alcohol_Drinks_Day': 'Alcohol (Drinks/Day)',
    'Gender_Female': 'Gender (Female)', 'BMI': 'BMI', 'CRP_mgL': 'CRP (mg/L)',
    'Vigorous_Activity_Min': 'Vigorous Activity (Min)', 'Sleep_Hours': 'Sleep Hours',
    'Age': 'Age', 'Income_Poverty_Ratio': 'Income-to-Poverty Ratio'
}
labels = [label_map[f] for f in odds_ratios['Feature']]

fig, ax = plt.subplots(figsize=(10, 6))

for idx, row in enumerate(odds_ratios.itertuples()):
    color = '#E57200' if row.Odds_Ratio > 1 else '#232D4B'
    alpha = 1.0 if row.Significant else 0.35

    # CI whiskers with cap marks
    ax.plot([row.CI_low, row.CI_high], [idx, idx],
            color=color, linewidth=2.5, alpha=alpha, solid_capstyle='round')
    ax.plot([row.CI_low, row.CI_low], [idx - 0.12, idx + 0.12],
            color=color, linewidth=2, alpha=alpha)
    ax.plot([row.CI_high, row.CI_high], [idx - 0.12, idx + 0.12],
            color=color, linewidth=2, alpha=alpha)
    # Point estimate
    ax.scatter(row.Odds_Ratio, idx, color=color, s=100, zorder=4,
               edgecolors='white', linewidths=1, alpha=alpha)

    # Label with significance flag
    label_text = f'OR = {row.Odds_Ratio:.2f}'
    if not row.Significant:
        label_text += '  (n.s.)'

    if row.Odds_Ratio >= 1:
        ax.text(row.CI_high + 0.02, idx, label_text, va='center', ha='left',
                fontsize=9.5, fontweight='bold', alpha=alpha)
    else:
        ax.text(row.CI_low - 0.02, idx, label_text, va='center', ha='right',
                fontsize=9.5, fontweight='bold', alpha=alpha)

ax.axvline(x=1.0, color='black', linewidth=1.5, linestyle='--', alpha=0.7)

ax.set_yticks(range(len(odds_ratios)))
ax.set_yticklabels(labels, fontsize=11)
ax.set_xlabel('Odds Ratio (95% CI, Standardized Coefficients)', fontsize=12)
ax.set_title('Graph 10a: Logistic Regression \u2014 Odds Ratios for Depression', fontsize=13)

ax.axvspan(ax.get_xlim()[0], 1.0, alpha=0.04, color='#232D4B')
ax.axvspan(1.0, ax.get_xlim()[1], alpha=0.04, color='#E57200')

ax.text(0.72, -1.4, '\u2190 Protective', fontsize=10, color='#232D4B',
        ha='center', fontstyle='italic', fontweight='bold')
ax.text(1.35, -1.4, 'Risk \u2192', fontsize=10, color='#E57200',
        ha='center', fontstyle='italic', fontweight='bold')

ax.text(0.72, -2.1, 'CI crossing dashed line = not significant',
        fontsize=9, color='#666666', ha='center', fontstyle='italic')

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(0.52, 1.68)
ax.set_ylim(-2.5, len(odds_ratios) - 0.5)

plt.tight_layout()
plt.show()

# Print the full odds ratio table with CIs for reference
print("\nOdds Ratios with 95% Confidence Intervals:")
display(odds_ratios[['Feature', 'Odds_Ratio', 'CI_low', 'CI_high', 'p_value', 'Significant']].round(4))

Odds Ratios with 95% Confidence Intervals:

# =============================================================================
# Natural-Scale Odds Ratios
# Graph shows PER-SD ORs, which are useful for visually
# comparing feature strengths on a common scale. But for the binary variables
# (Food_Insecure, Gender_Female), the more intuitive quantity is the OR for
# a 0 -> 1 transition. Below we convert per-SD coefficients to per-natural-unit
# coefficients by dividing by each feature's original SD: beta_natural = beta_std / sigma.
# =============================================================================
feature_sds = X_train[feature_cols_full].std().values
natural_coefs = sklearn_coefs / feature_sds
natural_se    = se_coefs       / feature_sds

natural_ors_df = pd.DataFrame({
    'Feature':         feature_cols_full,
    'Per_SD_OR':       np.exp(sklearn_coefs),
    'Natural_OR':      np.exp(natural_coefs),
    'Natural_CI_low':  np.exp(natural_coefs - 1.96 * natural_se),
    'Natural_CI_high': np.exp(natural_coefs + 1.96 * natural_se),
    'Unit': ['per year', 'per kg/m²', 'per hour', 'per minute',
             'per ratio unit', 'per mg/L', 'per drink/day',
             '0 → 1 (food insecure)', '0 → 1 (female)']
}).round(4)

print("Natural-Scale Odds Ratios (per natural unit of each predictor):")
print("(Use these for substantive interpretation, especially for binary variables.)\n")
display(natural_ors_df)

Natural-Scale Odds Ratios (per natural unit of each predictor):
(Use these for substantive interpretation, especially for binary variables.)

# =============================================================================
# MODEL 2: Multiple Linear Regression — Predicting PHQ-9 Severity
# =============================================================================
print("=" * 70)
print("MODEL 2: OLS Regression — Predicting Continuous PHQ-9 Score")
print("=" * 70)

# Prepare features with Gender encoded
X_reg = df_model[feature_cols_full].astype(float)
y_reg = df_model['PHQ9_Score'].astype(float)

# Add constant for statsmodels OLS
X_reg_const = sm.add_constant(X_reg)

# Fit OLS model
ols_model = sm.OLS(y_reg, X_reg_const).fit()
print(ols_model.summary())

======================================================================
MODEL 2: OLS Regression — Predicting Continuous PHQ-9 Score
======================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             PHQ9_Score   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     77.89
Date:                Sat, 02 May 2026   Prob (F-statistic):          4.45e-137
Time:                        15:27:54   Log-Likelihood:                -17489.
No. Observations:                6064   AIC:                         3.500e+04
Df Residuals:                    6054   BIC:                         3.506e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     3.7929      0.439      8.640      0.000       2.932       4.654
Age                      -0.0199      0.003     -5.961      0.000      -0.026      -0.013
BMI                       0.0318      0.008      3.999      0.000       0.016       0.047
Sleep_Hours              -0.1434      0.035     -4.064      0.000      -0.213      -0.074
Vigorous_Activity_Min  3.976e-05      0.000      0.270      0.787      -0.000       0.000
Income_Poverty_Ratio     -0.1802      0.040     -4.544      0.000      -0.258      -0.102
CRP_mgL                   0.0245      0.008      3.023      0.003       0.009       0.040
Alcohol_Drinks_Day        0.3385      0.026     12.838      0.000       0.287       0.390
Food_Insecure             2.0589      0.159     12.913      0.000       1.746       2.371
Gender_Female             0.7898      0.113      6.968      0.000       0.568       1.012
==============================================================================
Omnibus:                     1933.494   Durbin-Watson:                   2.031
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5734.417
Skew:                           1.669   Prob(JB):                         0.00
Kurtosis:                       6.399   Cond. No.                     3.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# =============================================================================
# OLS REGRESSION WITH INTERACTION TERM: Food_Insecure × Income_Poverty_Ratio
# Testing the "buffering hypothesis" — does income moderate the effect of
# food insecurity on depression severity?
# =============================================================================

# Build feature matrix with the interaction term added
X_interact = X_reg.copy()
X_interact['FoodInsec_x_Income'] = (
    X_interact['Food_Insecure'] * X_interact['Income_Poverty_Ratio']
)
X_interact_const = sm.add_constant(X_interact)

# Fit the expanded model
ols_interact = sm.OLS(y_reg, X_interact_const).fit()

# Compare to the original additive model
print("=" * 70)
print("MODEL COMPARISON: Additive vs. Interaction")
print("=" * 70)
print(f"Original additive model R²:    {ols_model.rsquared:.4f}")
print(f"Model with interaction R²:     {ols_interact.rsquared:.4f}")
print(f"Change in R² (ΔR²):            {ols_interact.rsquared - ols_model.rsquared:+.4f}")
print(f"Interaction coefficient:       {ols_interact.params['FoodInsec_x_Income']:+.4f}")
print(f"Interaction p-value:           {ols_interact.pvalues['FoodInsec_x_Income']:.4f}")
ci = ols_interact.conf_int().loc['FoodInsec_x_Income']
print(f"Interaction 95% CI:            [{ci[0]:+.4f}, {ci[1]:+.4f}]")
print()

sig = ols_interact.pvalues['FoodInsec_x_Income'] < 0.05
print(f"Verdict: Interaction is {'SIGNIFICANT' if sig else 'NOT significant'} at α = 0.05")
print()
print(ols_interact.summary())

======================================================================
MODEL COMPARISON: Additive vs. Interaction
======================================================================
Original additive model R²:    0.1038
Model with interaction R²:     0.1042
Change in R² (ΔR²):            +0.0004
Interaction coefficient:       +0.1954
Interaction p-value:           0.1090
Interaction 95% CI:            [-0.0436, +0.4344]

Verdict: Interaction is NOT significant at α = 0.05

                            OLS Regression Results                            
==============================================================================
Dep. Variable:             PHQ9_Score   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     70.37
Date:                Sat, 02 May 2026   Prob (F-statistic):          1.14e-136
Time:                        15:27:54   Log-Likelihood:                -17487.
No. Observations:                6064   AIC:                         3.500e+04
Df Residuals:                    6053   BIC:                         3.507e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     3.8630      0.441      8.757      0.000       2.998       4.728
Age                      -0.0198      0.003     -5.932      0.000      -0.026      -0.013
BMI                       0.0313      0.008      3.931      0.000       0.016       0.047
Sleep_Hours              -0.1414      0.035     -4.003      0.000      -0.211      -0.072
Vigorous_Activity_Min  4.084e-05      0.000      0.278      0.781      -0.000       0.000
Income_Poverty_Ratio     -0.2035      0.042     -4.818      0.000      -0.286      -0.121
CRP_mgL                   0.0243      0.008      2.988      0.003       0.008       0.040
Alcohol_Drinks_Day        0.3384      0.026     12.838      0.000       0.287       0.390
Food_Insecure             1.7146      0.267      6.410      0.000       1.190       2.239
Gender_Female             0.7898      0.113      6.969      0.000       0.568       1.012
FoodInsec_x_Income        0.1954      0.122      1.603      0.109      -0.044       0.434
==============================================================================
Omnibus:                     1931.447   Durbin-Watson:                   2.031
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5720.508
Skew:                           1.668   Prob(JB):                         0.00
Kurtosis:                       6.394   Cond. No.                     3.07e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.07e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# =============================================================================
# MODEL 2: Diagnostic Plots
# =============================================================================
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Residual Plot
residuals = ols_model.resid
fitted = ols_model.fittedvalues

axes[0].scatter(fitted, residuals, alpha=0.3, s=10, color='#232D4B')
axes[0].axhline(y=0, color='#E57200', linestyle='--', linewidth=2)
axes[0].set_xlabel('Fitted Values')
axes[0].set_ylabel('Residuals')
axes[0].set_title('Graph 11: Residual Plot — OLS Regression', fontsize=12)

# Q-Q Plot
stats.probplot(residuals, dist="norm", plot=axes[1])
axes[1].set_title('Graph 12: Q-Q Plot — Residual Normality', fontsize=12)
axes[1].get_lines()[0].set_color('#232D4B')
axes[1].get_lines()[1].set_color('#E57200')

plt.tight_layout()
plt.show()

# Model performance metrics
from sklearn.metrics import r2_score, mean_squared_error
y_pred_ols = ols_model.predict(X_reg_const)
r2 = r2_score(y_reg, y_pred_ols)
rmse = np.sqrt(mean_squared_error(y_reg, y_pred_ols))
print(f"\nR-squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"Mean PHQ-9 Score: {y_reg.mean():.2f}")

R-squared: 0.1038
RMSE: 4.33
Mean PHQ-9 Score: 3.51

#	NHANES Table	File	Key Variables	Rationale
1	Demographics	`DEMO_L.xpt`	Age, Gender, Income-to-Poverty Ratio	Core demographics and socioeconomic confounder
2	Body Measures	`BMX_L.xpt`	BMI	Objective physiological marker of metabolic health
3	Sleep Disorders	`SLQ_L.xpt`	Sleep Duration (hours)	Modifiable lifestyle factor tied to mental health
4	Physical Activity	`PAQ_L.xpt`	Vigorous Activity (min/day)	Key modifiable lifestyle factor under investigation
5	Depression Screener	`DPQ_L.xpt`	PHQ-9 item scores → total score	Target variable for both models
6	hs-CRP Laboratory	`HSCRP_L.xpt`	High-sensitivity C-Reactive Protein (mg/L)	Objective biomarker of systemic inflammation (Osimo et al., 2019)
7	Alcohol Use	`ALQ_L.xpt`	Average drinks per day (past 12 months)	Behavioral risk factor with strong depression comorbidity (Boden & Fergusson, 2011)
8	Food Security	`FSQ_L.xpt`	Adult food security category (FSDAD)	Socioeconomic determinant of mental health (Pourmotabbed et al., 2020)

	ID	Age	BMI	Sleep_Hours	Vigorous_Activity_Min	Income_Poverty_Ratio	CRP_mgL	Alcohol_Drinks_Day	Food_Security_Category	PHQ9_Score	Depressed	Food_Insecure
count	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00	6064.00
mean	136346.30	53.85	29.81	7.71	41.04	2.91	3.72	1.69	1.55	3.51	0.11	0.18
std	3437.86	17.18	7.30	1.58	378.79	1.54	7.14	2.17	0.97	4.57	0.31	0.39
min	130378.00	20.00	11.10	2.00	0.00	0.00	0.11	0.00	1.00	0.00	0.00	0.00
25%	133335.50	39.00	24.70	7.00	0.00	1.62	0.87	0.00	1.00	0.00	0.00	0.00
50%	136391.00	57.00	28.50	8.00	0.00	2.82	1.83	1.00	1.00	2.00	0.00	0.00
75%	139304.25	68.00	33.50	8.50	40.00	4.56	3.90	2.00	2.00	5.00	0.00	0.00
max	142310.00	80.00	74.80	14.00	9999.00	5.00	150.92	15.00	4.00	26.00	1.00	1.00

	Not Depressed (0)	Depressed (1)
N	5394.000	670.000
Avg_Age	54.455	48.981
Avg_BMI	29.645	31.107
Avg_Sleep	7.736	7.471
Avg_Activity	37.838	66.813
Avg_CRP	3.577	4.913
Avg_Alcohol	1.614	2.307
Avg_Income_Ratio	2.984	2.285
Pct_Food_Insecure	0.154	0.401
Pct_Female	0.542	0.631

	Feature	Coefficient	Odds_Ratio
7	Food_Insecure	0.380	1.463
6	Alcohol_Drinks_Day	0.220	1.247
8	Gender_Female	0.135	1.145
1	BMI	0.116	1.123
5	CRP_mgL	0.064	1.066
3	Vigorous_Activity_Min	0.024	1.025
2	Sleep_Hours	-0.132	0.876
0	Age	-0.200	0.819
4	Income_Poverty_Ratio	-0.273	0.761

	Feature	Per_SD_OR	Natural_OR	Natural_CI_low	Natural_CI_high	Unit
0	Age	0.8188	0.9884	0.9848	0.9920	per year
1	BMI	1.1230	1.0160	1.0077	1.0243	per kg/m²
2	Sleep_Hours	0.8764	0.9198	0.8892	0.9514	per hour
3	Vigorous_Activity_Min	1.0247	1.0001	0.9999	1.0002	per minute
4	Income_Poverty_Ratio	0.7614	0.8386	0.8032	0.8756	per ratio unit
5	CRP_mgL	1.0660	1.0095	1.0004	1.0188	per mg/L
6	Alcohol_Drinks_Day	1.2466	1.1073	1.0770	1.1385	per drink/day
7	Food_Insecure	1.4626	2.6715	2.2918	3.1141	0 → 1 (food insecure)
8	Gender_Female	1.1450	1.3130	1.1589	1.4876	0 → 1 (female)

Predictive Modeling of Mental Health and Wellness: Synergies of Lifestyle, Socioeconomic, and Biomarker Factors (NHANES 2021–2023)¶

Introduction and Motivation¶

Research Questions¶

Tutorial Roadmap¶

Extraction, Transform, and Load (ETL)¶

Data Sources¶

ETL Challenges¶

Exploratory Data Analysis (EDA)¶

Discussion: Summary Statistics and Correlation Structure¶

EDA Stage 2: Distribution and Group-Comparison Visualizations¶

Discussion: Distribution and Group-Comparison Findings¶

Discussion: Alcohol, Food Security, and Sleep Findings¶

EDA Stage 3: Hypothesis Testing¶

Discussion: Statistical Tests and Cross-Tabulation¶

From EDA to Model Design¶

Machine Learning Models¶

Discussion: Model 1 — Binary Classification Results¶

Testing a Theoretically-Motivated Interaction Term¶

Discussion: Model 2 — OLS Regression Results¶

Conclusions and Recommendations - Insight and Policy Decision¶

Summary of Findings¶

Recommendations¶

Limitations¶

Future Directions¶

References and Resources¶

Data Sources¶

Clinical & Scientific References¶

Technical References¶

	ID	Gender	Age	BMI	Sleep_Hours	Vigorous_Activity_Min	Income_Poverty_Ratio	CRP_mgL	Alcohol_Drinks_Day	Food_Security_Category	PHQ9_Score
0	130378	Male	43	27.0	9.5	45.0	5.00	1.78	0.0	1.0	0
1	130379	Male	66	33.5	9.0	45.0	5.00	2.03	3.0	1.0	1
2	130380	Female	44	29.7	8.0	0.0	1.41	5.62	1.0	1.0	2
3	130386	Male	34	30.2	7.5	30.0	1.33	1.05	2.0	1.0	1
4	130387	Female	68	42.6	3.0	0.0	1.32	3.96	0.0	1.0	0

	Feature	Odds_Ratio	CI_low	CI_high	p_value	Significant
4	Income_Poverty_Ratio	0.7614	0.7122	0.8141	0.0000	True
0	Age	0.8188	0.7695	0.8713	0.0000	True
2	Sleep_Hours	0.8764	0.8310	0.9244	0.0000	True
3	Vigorous_Activity_Min	1.0247	0.9756	1.0762	0.3309	False
5	CRP_mgL	1.0660	1.0025	1.1334	0.0413	True
1	BMI	1.1230	1.0580	1.1920	0.0001	True
8	Gender_Female	1.1450	1.0761	1.2183	0.0000	True
6	Alcohol_Drinks_Day	1.2466	1.1739	1.3238	0.0000	True
7	Food_Insecure	1.4626	1.3784	1.5520	0.0000	True