4 Python
Show code cell source
import pandas as pd
# Load the Stata dataset
url = "https://github.com/muzaale/forum/raw/main/esrdRisk_t02tT.dta"
data = pd.read_stata(url)
# Save the dataset as a CSV file
data.to_csv('esrdRisk_t02tT.csv', index=False)
print("CSV file saved as 'esrdRisk_t02tT.csv'")
CSV file saved as 'esrdRisk_t02tT.csv'
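The Stata file also carries variable labels that the CSV round-trip discards. A small optional sketch (assuming the same URL is reachable) for inspecting that metadata with pandas' StataReader before converting:
import pandas as pd
# Open the Stata file lazily so the header metadata can be read without loading all rows
url = "https://github.com/muzaale/forum/raw/main/esrdRisk_t02tT.dta"
with pd.read_stata(url, iterator=True) as reader:
    variable_labels = reader.variable_labels()  # {column name: descriptive label}
# Print a few labels to document what the cryptic column names mean
for name, label in list(variable_labels.items())[:10]:
    print(f"{name}: {label}")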
import pandas as pd
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Print unique values in 'rSMGJcEdF_d'
print(data['rSMGJcEdF_d'].unique())
['Censored' 'Died' 'ESRD']
import pandas as pd
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Display basic information about the dataset
print("Dataset information:")
print(data.info())
Show code cell output
Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113657 entries, 0 to 113656
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pers_id 113657 non-null int64
1 don_id 96217 non-null object
2 don_yearcat 96217 non-null object
3 don_age 113657 non-null int64
4 don_agecat 113657 non-null object
5 don_female 113657 non-null object
6 don_race_ethn 113599 non-null object
7 don_educat 72127 non-null object
8 don_hyperten 63736 non-null object
9 don_smoke 59119 non-null object
10 don_bmi 86983 non-null float64
11 don_bmicat 86983 non-null object
12 don_bp_preop_syst 82898 non-null float64
13 don_sbpcat 82898 non-null object
14 don_bp_preop_diast 82857 non-null float64
15 don_dbpcat 82857 non-null object
16 don_egfr 90651 non-null float64
17 don_egfrcat 90651 non-null object
18 creat 90687 non-null float64
19 don_related 96042 non-null object
20 n2e_d 113657 non-null object
21 n2e_t 113657 non-null float64
22 donor 113657 non-null object
23 healthy 17440 non-null object
24 acr 112687 non-null float64
25 case 113657 non-null int64
26 id 113657 non-null float64
27 rSMGJcEdF_d 113657 non-null object
28 rSMGJcEdF_t0 113657 non-null object
29 rSMGJcEdF_tT 113657 non-null object
30 rSMGJcEdF_t 113657 non-null float64
31 rSMGJcEdF_x 113657 non-null object
32 age_t0 113657 non-null int64
33 age_tT 113657 non-null float64
34 female 113657 non-null int64
35 race 113599 non-null float64
36 _st 113657 non-null int64
37 _d 113657 non-null int64
38 _t 113657 non-null float64
39 _t0 113657 non-null int64
dtypes: float64(12), int64(8), object(20)
memory usage: 34.7+ MB
None
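To complement info(), a per-column missingness summary ranks the gaps (don_educat, don_smoke, healthy, etc.) directly. A minimal sketch, assuming data is the DataFrame loaded above:
import pandas as pd
# Count and share of missing values per column, highest first
missing = pd.DataFrame({
    'n_missing': data.isna().sum(),
    'share_missing': data.isna().mean().round(3),
}).sort_values('share_missing', ascending=False)
print(missing.head(10))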
# Display the first few rows of the dataset to understand its structure
print("\nFirst few rows of the dataset:")
print(data.head())
Show code cell output
First few rows of the dataset:
pers_id don_id don_yearcat don_age don_agecat don_female don_race_ethn \
0 5005738 AAEN566 1998-2001 44 40-49 Female White/Other
1 4973528 ZIF796 2002-2005 47 40-49 Female White/Other
2 4961779 ZFV631 2002-2005 38 18-39 Female Hispanic
3 4961603 ZFN933 2006-2009 38 18-39 Male Hispanic
4 4949318 ZCU948 1998-2001 18 18-39 Female White/Other
don_educat don_hyperten don_smoke ... rSMGJcEdF_t rSMGJcEdF_x age_t0 \
0 NaN No History Smoker ... 11.192334 rSM 44
1 <HighSchool NaN NoHistory ... 6.743327 rSM 47
2 <HighSchool No History NoHistory ... 6.086242 rSM 38
3 <HighSchool No History NoHistory ... 5.607119 rSM 38
4 NaN No History NoHistory ... 11.731690 rSM 18
age_tT female race _st _d _t _t0
0 55.192333 1 1.0 1 0 11.192334 0
1 53.743330 1 1.0 1 0 6.743327 0
2 44.086243 1 3.0 1 0 6.086242 0
3 43.607117 0 3.0 1 0 5.607119 0
4 29.731690 1 1.0 1 0 11.731690 0
[5 rows x 40 columns]
# Total number of observations and variables
num_observations = data.shape[0]
num_variables = data.shape[1]
print(f"\nTotal number of observations: {num_observations}")
print(f"Total number of variables: {num_variables}")
Total number of observations: 113657
Total number of variables: 40
# Display the outcome of interest and its format
print("\nOutcome of interest (rSMGJcEdF_d) and its format:")
print(data['rSMGJcEdF_d'].value_counts(dropna=False))
Outcome of interest (rSMGJcEdF_d) and its format:
rSMGJcEdF_d
Censored 108448
Died 4939
ESRD 270
Name: count, dtype: int64
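Because the dataset also carries a donor indicator, a cross-tabulation shows how the three outcomes split across donors and nondonors. A sketch, assuming data still holds the full DataFrame:
import pandas as pd
# Outcome by donor status, with row/column totals for context
print(pd.crosstab(data['donor'], data['rSMGJcEdF_d'], margins=True))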
Show code cell source
# Display the start time (rSMGJcEdF_t0) and end time (rSMGJcEdF_tT) for survival analysis
print("\nStart time (rSMGJcEdF_t0):")
print(data['rSMGJcEdF_t0'].head())
print("\nEnd time (rSMGJcEdF_tT):")
print(data['rSMGJcEdF_tT'].head())
Show code cell output
Start time (rSMGJcEdF_t0):
0 2000-09-20
1 2005-03-03
2 2005-10-29
3 2006-04-22
4 2000-03-07
Name: rSMGJcEdF_t0, dtype: object
End time (rSMGJcEdF_tT):
0 2014-07-31
1 2014-07-31
2 2014-07-31
3 2014-07-31
4 2014-07-31
Name: rSMGJcEdF_tT, dtype: object
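The entry and exit times are stored as date strings, so it is worth confirming the calendar span they cover before computing follow-up. A minimal sketch, again assuming data holds the loaded CSV:
import pandas as pd
# Parse the date columns and report the calendar range of entry and exit
t0 = pd.to_datetime(data['rSMGJcEdF_t0'])
tT = pd.to_datetime(data['rSMGJcEdF_tT'])
print('Entry dates:', t0.min().date(), 'to', t0.max().date())
print('Exit dates: ', tT.min().date(), 'to', tT.max().date())
print('Median raw follow-up (years):', round(((tT - t0).dt.days / 365.25).median(), 2))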
Show code cell source
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Convert date columns to datetime
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'])
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'])
# Calculate follow-up time in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Map non-numeric values to numeric
value_mapping = {'Censored': 0, 'Died': 0, 'ESRD': 1}
data['rSMGJcEdF_d'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
# Apply the same date restrictions as in Stata
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_d'] = 0
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_tT'] = pd.to_datetime('2011-12-31')
data.loc[(data['rSMGJcEdF_t0'] < '1994-01-01') & (data['rSMGJcEdF_tT'] > '1994-01-01'), 'rSMGJcEdF_t0'] = pd.to_datetime('1994-01-01')
# Recalculate duration after date adjustments
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Create the 'event' column (rSMGJcEdF_d == 1 indicates an event)
data['event'] = data['rSMGJcEdF_d'] == 1
# Filter relevant columns and drop rows with missing values
data = data[['duration', 'event', 'donor']].dropna()
# Display the processed data
print("\nProcessed data:")
print(data.info())
print(data.head())
Show code cell output
Processed data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113657 entries, 0 to 113656
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration 113657 non-null float64
1 event 113657 non-null bool
2 donor 113657 non-null object
dtypes: bool(1), float64(1), object(1)
memory usage: 1.8+ MB
None
duration event donor
0 11.277207 False Donor
1 6.828200 False Donor
2 6.171116 False Donor
3 5.691992 False Donor
4 11.816564 False Donor
/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48040/812990944.py:18: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
data['rSMGJcEdF_d'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
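The FutureWarning above comes from Series.replace silently downcasting an object column to int. One way to avoid it, shown here as a sketch applied to the freshly loaded string column (the cells below keep the original replace call), is Series.map, which returns exactly the mapped values:
# Equivalent recode without the downcasting warning; map() leaves any
# unmapped category as NaN, so a surprise value would fail at astype(int)
value_mapping = {'Censored': 0, 'Died': 0, 'ESRD': 1}
data['rSMGJcEdF_d'] = data['rSMGJcEdF_d'].map(value_mapping).astype(int)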
Show code cell source
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Convert date columns to datetime
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'])
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'])
# Calculate follow-up time in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Map non-numeric values to numeric
value_mapping = {'Censored': 0, 'Died': 0, 'ESRD': 1}
data['rSMGJcEdF_d'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
# Apply the same date restrictions as in Stata
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_d'] = 0
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_tT'] = pd.to_datetime('2011-12-31')
data.loc[(data['rSMGJcEdF_t0'] < '1994-01-01') & (data['rSMGJcEdF_tT'] > '1994-01-01'), 'rSMGJcEdF_t0'] = pd.to_datetime('1994-01-01')
# Recalculate duration after date adjustments
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Create the 'event' column (rSMGJcEdF_d == 1 indicates an event)
data['event'] = data['rSMGJcEdF_d'] == 1
# Create the 'donor' column with three categories
data['donor'] = np.where(data['don_id'].notna(), 'Living donor',
np.where(data['don_id'].isna() & (data['don_age'] <= 60), 'Healthy nondonor', 'General population'))
# Verify the donor categories
print("\nDonor categories:")
print(data['donor'].value_counts())
# Verify the event counts (ESRD)
print("\nEvent counts (ESRD):")
print(data['event'].value_counts())
# Filter relevant columns and drop rows with missing values
data = data[['duration', 'event', 'donor']].dropna()
# Ensure 'duration' and 'event' are numeric
data['duration'] = pd.to_numeric(data['duration'], errors='coerce')
data['event'] = pd.to_numeric(data['event'], errors='coerce')
# Display the processed data
print("\nProcessed data:")
print(data.info())
print(data.head())
# Fit the Kaplan-Meier estimator for each group
kmf = KaplanMeierFitter()
# Initialize the plot
plt.figure(figsize=(10, 6))
# Plot the Kaplan-Meier curves for each group
for label, group in data.groupby('donor'):
    kmf.fit(group['duration'], event_observed=group['event'], label=label)
    plt.step(kmf.timeline, 10000 * (1 - kmf.survival_function_), where="post", label=label)
# Set axis limits and labels
plt.ylim(0, 40)
plt.xlim(0, 15)
plt.xlabel('Follow-up time (years)')
plt.ylabel('Cumulative incidence (per 10,000)')
plt.title('Kaplan-Meier Failure Curve')
plt.grid(True)
# Add legend
plt.legend(loc='upper left')
# Add y-axis ticks
plt.yticks(range(0, 300, 50))
plt.tight_layout()
plt.savefig('./KM_plot.png', dpi=300, bbox_inches='tight')
plt.show()
# Print cumulative incidence at 5, 12, and 15 years
for label, group in data.groupby('donor'):
    kmf.fit(group['duration'], event_observed=group['event'])
    print(f"\n{label}:")
    for year in [5, 12, 15]:
        ci = 10000 * (1 - kmf.survival_function_at_times(year).iloc[0])
        print(f" {year} years: {ci:.2f} per 10,000")
Show code cell output
/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48040/1010938149.py:18: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
data['rSMGJcEdF_d'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
Donor categories:
donor
Living donor 96217
Healthy nondonor 12208
General population 5232
Name: count, dtype: int64
Event counts (ESRD):
event
False 113387
True 270
Name: count, dtype: int64
Processed data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113657 entries, 0 to 113656
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration 113657 non-null float64
1 event 113657 non-null bool
2 donor 113657 non-null object
dtypes: bool(1), float64(1), object(1)
memory usage: 1.8+ MB
None
duration event donor
0 11.277207 False Living donor
1 6.828200 False Living donor
2 6.171116 False Living donor
3 5.691992 False Living donor
4 11.816564 False Living donor
General population:
5 years: 89.28 per 10,000
12 years: 230.46 per 10,000
15 years: 235.67 per 10,000
Healthy nondonor:
5 years: 25.81 per 10,000
12 years: 71.25 per 10,000
15 years: 77.51 per 10,000
Living donor:
5 years: 1.58 per 10,000
12 years: 17.42 per 10,000
15 years: 32.06 per 10,000
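To attach a formal comparison to the three curves, lifelines provides a multivariate log-rank test. A sketch, assuming data still holds the duration, event, and donor columns built in the cell above (deaths remain treated as censoring, as in the Kaplan-Meier fit):
from lifelines.statistics import multivariate_logrank_test
# Log-rank test across the three donor categories
result = multivariate_logrank_test(
    data['duration'],  # follow-up time in years
    data['donor'],     # group labels
    data['event'],     # True if ESRD occurred
)
result.print_summary()  # test statistic, degrees of freedom, p-value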
Show code cell source
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Convert date columns to datetime
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'])
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'])
# Calculate follow-up time in years
data['time_to_failure'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Map non-numeric values to numeric for event_observed
value_mapping = {'Censored': 0, 'Died': 0, 'ESRD': 1}
data['status'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
# Apply the same date restrictions as in Stata, censoring the 'status' column that is actually fed to lifelines
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'status'] = 0
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_tT'] = pd.to_datetime('2011-12-31')
data.loc[(data['rSMGJcEdF_t0'] < '1994-01-01') & (data['rSMGJcEdF_tT'] > '1994-01-01'), 'rSMGJcEdF_t0'] = pd.to_datetime('1994-01-01')
# Recalculate duration after date adjustments
data['time_to_failure'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Filter relevant columns and drop rows with missing values
data = data[['time_to_failure', 'status']].dropna()
# Instantiate the KaplanMeierFitter
kmf = KaplanMeierFitter()
# Fit the data into the model
kmf.fit(data['time_to_failure'], event_observed=data['status'])
# Get the survival function
survival_function = kmf.survival_function_
# Compute 1 minus the survival function
complement_survival_function = 1 - survival_function
# Plot the complement of the Kaplan-Meier survival curve
plt.figure(figsize=(10, 6))
plt.plot(complement_survival_function, label='1 - Kaplan-Meier Estimate')
plt.title('Complement of Kaplan-Meier Survival Curve')
plt.xlabel('Time to kidney failure (years)')
plt.ylabel('1 - Survival Probability')
plt.legend()
plt.grid(True)
plt.savefig('1_min_KM.png')
plt.show()
# Check if 'don_id' and 'don_age' columns exist
# Note: the subset above kept only 'time_to_failure' and 'status', so this check
# fails here and the else branch runs (see the message in the output below).
if 'don_id' in data.columns and 'don_age' in data.columns:
    # Create the 'donor' column with three categories
    data['donor'] = np.where(data['don_id'].notna(), 'Living donor',
                             np.where(data['don_id'].isna() & (data['don_age'] <= 60), 'Healthy nondonor', 'General population'))
    # Verify the donor categories
    print("\nDonor categories:")
    print(data['donor'].value_counts())
    # Verify the event counts (ESRD)
    print("\nEvent counts (ESRD):")
    print(data['status'].value_counts())
    # Filter relevant columns and drop rows with missing values
    data = data[['time_to_failure', 'status', 'donor']].dropna()
    # Fit the Kaplan-Meier estimator for each group
    kmf = KaplanMeierFitter()
    # Initialize the plot
    plt.figure(figsize=(10, 6))
    # Plot the Kaplan-Meier curves for each group
    for label, group in data.groupby('donor'):
        kmf.fit(group['time_to_failure'], event_observed=group['status'], label=label)
        plt.step(kmf.timeline, 10000 * (1 - kmf.survival_function_), where="post", label=label)
    # Set axis limits and labels
    plt.ylim(0, 40)
    plt.xlim(0, 15)
    plt.xlabel('Follow-up time (years)')
    plt.ylabel('Cumulative incidence (per 10,000)')
    plt.title('Kaplan-Meier Failure Curve')
    plt.grid(True)
    # Add legend
    plt.legend(loc='upper left')
    # Add y-axis ticks
    plt.yticks(range(0, 300, 50))
    plt.tight_layout()
    plt.savefig('./KM_plot.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Print cumulative incidence at 5, 12, and 15 years
    for label, group in data.groupby('donor'):
        kmf.fit(group['time_to_failure'], event_observed=group['status'])
        print(f"\n{label}:")
        for year in [5, 12, 15]:
            ci = 10000 * (1 - kmf.survival_function_at_times(year).iloc[0])
            print(f" {year} years: {ci:.2f} per 10,000")
else:
    print("Columns 'don_id' or 'don_age' not found in the dataset.")
/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48040/2454471136.py:18: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
data['status'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
Columns 'don_id' or 'don_age' not found in the dataset.
Show code cell source
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Load the CSV dataset
data = pd.read_csv('./esrdRisk_t02tT.csv', low_memory=False)
# Convert date columns to datetime
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'])
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'])
# Calculate follow-up time in years
data['time_to_failure'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Map non-numeric values to numeric for event_observed
value_mapping = {'Censored': 0, 'Died': 0, 'ESRD': 1}
data['status'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
# Apply the same date restrictions as in Stata, censoring the 'status' column that is actually fed to lifelines
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'status'] = 0
data.loc[data['rSMGJcEdF_tT'] > '2011-12-31', 'rSMGJcEdF_tT'] = pd.to_datetime('2011-12-31')
data.loc[(data['rSMGJcEdF_t0'] < '1994-01-01') & (data['rSMGJcEdF_tT'] > '1994-01-01'), 'rSMGJcEdF_t0'] = pd.to_datetime('1994-01-01')
# Recalculate duration after date adjustments
data['time_to_failure'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.days / 365.25
# Create the 'donor' column with three categories
if 'don_id' in data.columns and 'don_age' in data.columns:
    data['donor'] = np.where(data['don_id'].notna(), 'Living donor',
                             np.where(data['don_id'].isna() & (data['don_age'] <= 60), 'Healthy nondonor', 'General population'))
    # Filter relevant columns and drop rows with missing values
    data = data[['time_to_failure', 'status', 'donor']].dropna()
    # Fit the Kaplan-Meier estimator for each group
    kmf = KaplanMeierFitter()
    # Initialize the plot
    plt.figure(figsize=(10, 6))
    # Plot the Kaplan-Meier curves for each group
    for label, group in data.groupby('donor'):
        kmf.fit(group['time_to_failure'], event_observed=group['status'], label=label)
        plt.step(kmf.timeline, 10000 * (1 - kmf.survival_function_), where="post", label=label)
    # Set axis limits and labels
    plt.ylim(0, 40)
    plt.xlim(0, 15)
    plt.xlabel('Follow-up time (years)')
    plt.ylabel('Cumulative incidence (per 10,000)')
    plt.title('Kaplan-Meier Failure Curve by Donor Category')
    plt.grid(True)
    # Add legend
    plt.legend(loc='upper left')
    # Add y-axis ticks
    plt.yticks(range(0, 300, 50))
    plt.tight_layout()
    plt.savefig('./KM_plot_by_donor.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("Columns 'don_id' or 'don_age' not found in the dataset.")
/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48040/4095442099.py:18: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
data['status'] = data['rSMGJcEdF_d'].replace(value_mapping).astype(int)
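lifelines can also draw the failure curve directly, which avoids computing 1 - S(t) by hand. A sketch of how the grouped plot above could be produced with plot_cumulative_density, assuming the same data with time_to_failure, status, and donor columns (note the y-axis here is a proportion, not per 10,000):
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(10, 6))
# One fitted curve per donor category, plotted as 1 - S(t) on a shared axis
for label, group in data.groupby('donor'):
    kmf.fit(group['time_to_failure'], event_observed=group['status'], label=label)
    kmf.plot_cumulative_density(ax=ax)
ax.set_xlabel('Follow-up time (years)')
ax.set_ylabel('Cumulative incidence (proportion)')
ax.set_title('Cumulative incidence of ESRD by donor category')
plt.tight_layout()
plt.show()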