6 Translation#

Below, the Stata script is broken into smaller code blocks with notes, so you can run them one at a time in a Jupyter Notebook. The blocks are translated to Python (pandas and lifelines); only Block 13 remains in Stata.

Block 1: Setup and Logging#

import logging

# Remove any existing logging handlers so a fresh log file can be opened
# (roughly the Python equivalent of Stata's "capture log close")
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

Block 2: Define Globals#

# Set global variables for the repository URL and local directory path
# (named local_dir to avoid shadowing Python's built-in dir)
repo = "https://github.com/muzaale/forum/raw/main/"
local_dir = "~/documents/melody/pystata"

Block 3: Load Data#
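
The original leaves this block empty; the actual read happens in Block 4 below. As a minimal sketch of the missing download step (assuming the CSV sits at the root of the repo defined in Block 2):

import os
import urllib.request

repo = "https://github.com/muzaale/forum/raw/main/"
local_dir = os.path.expanduser("~/documents/melody/pystata")
file_name = "esrdRisk_t02tT.csv"
file_path = os.path.join(local_dir, file_name)

# Download the file into the local directory if it is not already cached
os.makedirs(local_dir, exist_ok=True)
if not os.path.isfile(file_path):
    urllib.request.urlretrieve(repo + file_name, file_path)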

Block 4: Initial Data Exploration#

import logging
import pandas as pd
import os

# Open a new log file to record the output
logging.basicConfig(filename='jamascript.log', level=logging.INFO, format='%(asctime)s - %(message)s')

# Log the start of the logging process
logging.info('Starting log file')

# Define the local directory and file name
local_dir = os.path.expanduser("~/documents/melody/pystata")
file_name = "esrdRisk_t02tT.csv"
file_path = os.path.join(local_dir, file_name)

# Check if the file exists locally
if os.path.isfile(file_path):
    try:
        data = pd.read_csv(file_path, low_memory=False)  # low_memory=False avoids the mixed-type DtypeWarning
        logging.info('Dataset loaded successfully from local file')
    except Exception as e:
        logging.error(f"Failed to load dataset from local file: {e}")
        print(f"Failed to load dataset from local file: {e}")
        data = None
else:
    logging.error(f"File not found: {file_path}")
    print(f"File not found: {file_path}")
    data = None

if data is not None:
    # Display the distribution of the 'donor' variable
    donor_distribution = data['donor'].value_counts()
    print("Distribution of 'donor' variable:")
    print(donor_distribution)

    # Describe the 'rSMGJcEdF_d' variable
    codebook = data['rSMGJcEdF_d'].describe()
    print("\nCodebook for 'rSMGJcEdF_d' variable:")
    print(codebook)

    # Generate a new variable 'entry' based on the 'rSMGJcEdF_t0' variable
    data['entry'] = data['rSMGJcEdF_t0']

    # Logging the creation of the new variable
    logging.info("New variable 'entry' created based on 'rSMGJcEdF_t0'")
else:
    print("Dataset could not be loaded.")
Distribution of 'donor' variable:
donor
Donor                   96217
HealthyNondonor          9364
NotSoHealthyNondonor     8076
Name: count, dtype: int64

Codebook for 'rSMGJcEdF_d' variable:
count       113657
unique           3
top       Censored
freq        108448
Name: rSMGJcEdF_d, dtype: object

Block 5: Data Cleaning and Adjustment#

Caution: as originally written, this block corrupted the dataset. The donor == 1 comparisons never match, because 'donor' holds string labels such as 'Donor', and the hard-coded '%d%b%Y' date format with errors='coerce' silently turns dates stored in any other format into NaT. The version below compares against the string labels instead.

import pandas as pd
import logging

# Assuming 'data' is the DataFrame loaded previously

# Parse the date columns; '%d%b%Y' (e.g. 01jan1994) is Stata's default date
# display, but errors='coerce' silently turns any other format into NaT, so
# check for unexpected missing dates after converting
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], format='%d%b%Y', errors='coerce')
data['entry'] = pd.to_datetime(data['entry'], format='%d%b%Y', errors='coerce')

# Define the critical dates
date_2011 = pd.to_datetime('2011-12-31')
date_1994 = pd.to_datetime('1994-01-01')

# Log the start of the adjustment process
logging.info("Starting the adjustment process for linkage dates")

# Linkage for donors after 2011 is untrustworthy: censor those records at the
# end of 2011 ('donor' holds string labels, so compare against 'Donor', and
# 'rSMGJcEdF_d' holds labels like 'Censored'/'Died', not numeric codes)
late = (data['rSMGJcEdF_tT'] > date_2011) & (data['donor'] == 'Donor')
data.loc[late, 'rSMGJcEdF_d'] = 'Censored'
data.loc[late, 'rSMGJcEdF_tT'] = date_2011

# Linkage before 1994 is untrustworthy: move donor entry dates up to 1994
early = (data['entry'] < date_1994) & (data['rSMGJcEdF_tT'] > date_1994) & (data['donor'] == 'Donor')
data.loc[early, 'entry'] = date_1994

# Log the completion of the adjustment process
logging.info("Linkage for dates after 2011 and before 1994 adjusted successfully")

Block 6: Mortality Analysis Setup#

import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Assuming 'data' is already the DataFrame loaded previously

# Set the event indicator based on 'Died'
data['event'] = (data['rSMGJcEdF_d'] == 'Died').astype(int)

# Ensure date columns are in datetime format
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], errors='coerce')
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'], errors='coerce')
data['entry'] = data['rSMGJcEdF_t0']  # Use _t0 as entry (note: this discards the 1994 entry adjustment from Block 5)

# Calculate the duration from origin to the event or censoring in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)

# Calculate the entry duration
data['entry_duration'] = (data['entry'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
data['entry_duration'] = data['entry_duration'].fillna(0)  # Ensure non-donor entries are set to 0

# Ensure all durations are non-negative and valid
data = data[(data['duration'] >= 0) & (data['entry_duration'] >= 0)]

# Initialize the KaplanMeierFitter
kmf = KaplanMeierFitter()

# Plot setup
plt.figure(figsize=(10, 6))

# Unique donor values
donor_values = data['donor'].unique()

# Colors for each donor type
colors = ['blue', 'green', 'red']

# Overlay plots for different donor values
for donor, color in zip(donor_values, colors):
    kmf.fit(durations=data[data['donor'] == donor]['duration'],
            event_observed=data[data['donor'] == donor]['event'],
            entry=data[data['donor'] == donor]['entry_duration'],
            label=donor)
    ax = kmf.plot_cumulative_density(ci_show=False, color=color)

# Cut off the plot at 15 years
plt.xlim(0, 15)

# Format the y-axis as percentages (avoids the set_yticklabels warning)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

# Labels and title
plt.xlabel("Time (years)")
plt.ylabel("Failure Probability (%)")
plt.title("Failure Function Plot (Cumulative Density) by Donor Type")
plt.legend(title="Donor Type")

# Optional: Display the plot (depends on your environment)
plt.show()
[Figure: failure function (cumulative density) by donor type, truncated at 15 years]

Block 7: Generate Kaplan-Meier Estimates#

import pandas as pd
from lifelines import KaplanMeierFitter

# Assuming 'data' is already the DataFrame loaded previously; the
# preprocessing below repeats Block 6 so this cell can run on its own

# Set the event indicator based on 'Died'
data['event'] = (data['rSMGJcEdF_d'] == 'Died').astype(int)

# Ensure date columns are in datetime format
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], errors='coerce')
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'], errors='coerce')
data['entry'] = data['rSMGJcEdF_t0']  # Use _t0 as entry

# Calculate the duration from origin to the event or censoring in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)

# Calculate the entry duration
data['entry_duration'] = (data['entry'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
data['entry_duration'] = data['entry_duration'].fillna(0)  # Ensure non-donor entries are set to 0

# Ensure all durations are non-negative and valid
data = data[(data['duration'] >= 0) & (data['entry_duration'] >= 0)]

# Initialize the KaplanMeierFitter
kmf = KaplanMeierFitter()

# Time points at which we want to extract survival probabilities
time_points = [5, 12, 15]

# List to hold the results
results = []

# Fit the model for each donor type and extract the survival probabilities
for donor in data['donor'].unique():
    kmf.fit(durations=data[data['donor'] == donor]['duration'],
            event_observed=data[data['donor'] == donor]['event'],
            entry=data[data['donor'] == donor]['entry_duration'],
            label=donor)
    
    for time in time_points:
        survival_prob = kmf.survival_function_at_times(time).values[0]
        results.append({
            'donor': donor,
            'time': time,
            'survival_probability': survival_prob,
            'failure_probability': 1 - survival_prob
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df.to_csv('km_results.csv', index=False)

# Display the results
print(results_df)
                  donor  time  survival_probability  failure_probability
0                 Donor     5              0.996751             0.003249
1                 Donor    12              0.988017             0.011983
2                 Donor    15              0.983427             0.016573
3  NotSoHealthyNondonor     5              0.855844             0.144156
4  NotSoHealthyNondonor    12              0.653661             0.346339
5  NotSoHealthyNondonor    15              0.575531             0.424469
6       HealthyNondonor     5              0.981093             0.018907
7       HealthyNondonor    12              0.934715             0.065285
8       HealthyNondonor    15              0.912715             0.087285

Only the donor results match Segev et al., JAMA, 2010, p. 961. The nondonors here are unmatched, so they do not yield the same estimates as the paper.

A web app is a quick way to approximate these findings using the base case of the “typical” donor from Table 1, also on page 961: a 40-year-old white female with some college education, BMI 28, SBP 120, no hypertension, no history of cigarette smoking, creatinine 0.9 mg/dL, and eGFR 100 mL/min.

Block 8: Summarize Failure Rates#
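
This block has no code of its own; a minimal sketch that tabulates the failure rates already computed in Block 7 (assuming results_df is still in scope):

# Pivot the Block 7 results into a donor-by-time table of failure rates
failure_table = results_df.pivot(index='donor', columns='time',
                                 values='failure_probability')
failure_table.columns = [f'{t}-year failure' for t in failure_table.columns]
print(failure_table.round(4))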

Block 9: Kaplan-Meier Survival Plot#
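
Likewise empty in the original; a sketch of the survival-function counterpart to the failure plot in Block 6 (assuming data and its derived columns are still in scope):

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
plt.figure(figsize=(10, 6))

# One survival curve per donor type, with delayed entry as in Block 6
for donor in data['donor'].unique():
    mask = data['donor'] == donor
    kmf.fit(durations=data.loc[mask, 'duration'],
            event_observed=data.loc[mask, 'event'],
            entry=data.loc[mask, 'entry_duration'],
            label=donor)
    kmf.plot_survival_function(ci_show=False)

plt.xlim(0, 15)
plt.xlabel("Time (years)")
plt.ylabel("Survival Probability")
plt.title("Kaplan-Meier Survival by Donor Type")
plt.legend(title="Donor Type")
plt.show()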

Block 11: Save Processed Data#
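
Also without code in the original; a sketch, assuming data from the earlier blocks (the output file name is illustrative):

import os

# Write the processed dataset next to the source CSV
out_path = os.path.join(os.path.expanduser("~/documents/melody/pystata"),
                        "esrdRisk_processed.csv")
data.to_csv(out_path, index=False)
print(f"Saved processed data to {out_path}")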

Block 12: Cox Proportional Hazards Model#

import pandas as pd

# Assuming 'data' is already the DataFrame loaded previously

# Check for constant variables
constant_columns = [col for col in data.columns if data[col].nunique() <= 1]
print("Constant columns:", constant_columns)
Constant columns: ['_st', '_t0', 'entry_duration']

('entry_duration' is constant because Block 6 resets 'entry' to 'rSMGJcEdF_t0', making every entry duration zero; '_st' and '_t0' look like Stata system variables carried over from the original dataset.)
import pandas as pd
from lifelines import CoxPHFitter

# Ensure 'donor' is a categorical variable
data['donor'] = data['donor'].astype('category')

# Select necessary columns
data = data[['duration', 'event', 'donor']]

# Remove constant columns if any
constant_columns = [col for col in data.columns if data[col].nunique() <= 1]
data = data.drop(columns=constant_columns)

print("Constant columns removed:", constant_columns)
Constant columns removed: []
data.head()
    duration  event  donor
0  13.859001      0  Donor
1   9.409993      0  Donor
2   8.752909      0  Donor
3   8.273785      0  Donor
4  14.398357      0  Donor
import pandas as pd

# Load the data (note: this path points at .../melody/local rather than the
# .../melody/pystata directory used in the earlier blocks, which is why the
# read fails below)
file_path = '~/documents/melody/local/esrdRisk_t02tT.csv'  # Adjust the file path as needed
data = pd.read_csv(file_path)

# Check the first few rows and the column names
print(data.head())
print(data.columns)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[9], line 5
----> 5 data = pd.read_csv(file_path)

[... pandas internals elided ...]

FileNotFoundError: [Errno 2] No such file or directory: '/Users/apollo/documents/melody/local/esrdRisk_t02tT.csv'
import pandas as pd
from lifelines import CoxPHFitter

# Assuming 'data' is already the DataFrame loaded previously and necessary preprocessing is done

# Ensure 'donor' is a categorical variable
data['donor'] = data['donor'].astype('category')

# Fit a Cox proportional hazards model
cph = CoxPHFitter()
cph.fit(data, duration_col='duration', event_col='event', formula="C(donor)")

# Extract the summary and relevant coefficients
summary = cph.summary

# Filter only relevant columns
coefficients = summary.loc[:, ['coef', 'exp(coef)', 'exp(coef) lower 95%', 'exp(coef) upper 95%']]

# Rename the rows for clarity ('Donor' sorts first among the category levels,
# so patsy treatment coding uses it as the baseline, in this row order)
coefficients.index = ['HealthyNondonor vs Donor', 'NotSoHealthyNondonor vs Donor']

# Display the simplified results
print("Coefficients and Hazard Ratios Relative to Baseline (Donor):")
print(coefficients)

# Compute hazard ratios
hazard_ratios = coefficients[['exp(coef)', 'exp(coef) lower 95%', 'exp(coef) upper 95%']]
hazard_ratios.columns = ['Hazard Ratio', 'Lower 95% CI', 'Upper 95% CI']

print("\nHazard Ratios Relative to Baseline (Donor):")
print(hazard_ratios)
Coefficients and Hazard Ratios Relative to Baseline (Donor):
                                   coef  exp(coef)  exp(coef) lower 95%  \
HealthyNondonor vs Donor      -0.286770   0.750684             0.447060   
NotSoHealthyNondonor vs Donor  2.359067  10.581077             8.187331   

                               exp(coef) upper 95%  
HealthyNondonor vs Donor                  1.260518  
NotSoHealthyNondonor vs Donor            13.674688  

Hazard Ratios Relative to Baseline (Donor):
                               Hazard Ratio  Lower 95% CI  Upper 95% CI
HealthyNondonor vs Donor           0.750684      0.447060      1.260518
NotSoHealthyNondonor vs Donor     10.581077      8.187331     13.674688

Block 13: Save Baseline Survival and Coefficients#

// Define matrix 'b' with model coefficients
matrix define b = e(b)

// Keep baseline survival estimates
keep s0 _t 

// Sort and list baseline survival estimates
sort _t s0
list in 1/10

// Save baseline survival estimates
save ${dir}/s0.dta, replace
export delimited using ${dir}/s0.csv, replace

// Save model coefficients
matrix beta = e(b)
svmat beta
keep beta*
drop if missing(beta1)
list 
save ${dir}/b.dta, replace
export delimited using ${dir}/b.csv, replace
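
Block 13 above is left in the original Stata. A possible Python counterpart, sketched under the assumption that the fitted CoxPHFitter model cph from Block 12 is still in scope (the file names simply mirror the Stata script):

import os

local_dir = os.path.expanduser("~/documents/melody/pystata")

# Baseline survival estimates, analogous to Stata's basesurv() output
s0 = cph.baseline_survival_.copy()
s0.columns = ['s0']
s0.index.name = '_t'
s0 = s0.reset_index().sort_values(['_t', 's0'])
print(s0.head(10))
s0.to_csv(os.path.join(local_dir, 's0.csv'), index=False)

# Model coefficients, analogous to Stata's e(b) matrix
b = cph.params_.to_frame(name='beta').T
print(b)
b.to_csv(os.path.join(local_dir, 'b.csv'), index=False)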