6 Translation

6 Translation#

Sure, I’ll break down the Stata script into smaller code blocks with notes so you can run them separately in a Jupyter Notebook with the Stata kernel.

Block 1: Setup and Logging#

import logging

# Close any existing log files
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

Block 2: Define Globals#

# Set global variables for the repository URL and local directory path
repo = "https://github.com/muzaale/forum/raw/main/"
dir = "~/documents/melody/pystata"

Block 3: Load Data#

Block 4: Initial Data Exploration#

Distribution of 'donor' variable:
donor
Donor                   96217
HealthyNondonor          9364
NotSoHealthyNondonor     8076
Name: count, dtype: int64

Codebook for 'rSMGJcEdF_d' variable:
count       113657
unique           3
top       Censored
freq        108448
Name: rSMGJcEdF_d, dtype: object

/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48067/85659033.py:19: DtypeWarning: Columns (1,2,19,23) have mixed types. Specify dtype option on import or set low_memory=False.
  data = pd.read_csv(file_path)

Block 5: Data Cleaning and Adjustment#

This is messing up the dataset

import pandas as pd
import logging

# Assuming 'data' is the DataFrame loaded previously

# Ensure the date columns are in datetime format
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], format='%d%b%Y', errors='coerce')
data['entry'] = pd.to_datetime(data['entry'], format='%d%b%Y', errors='coerce')

# Define the critical dates
date_2011 = pd.to_datetime('2011-12-31')
date_1994 = pd.to_datetime('1994-01-01')

# Log the start of the adjustment process
logging.info("Starting the adjustment process for linkage dates")

# Linkage for donors after 2011 is untrustworthy
data.loc[(data['rSMGJcEdF_tT'] > date_2011) & (data['donor'] == 1), 'rSMGJcEdF_d'] = 0
data.loc[(data['rSMGJcEdF_tT'] > date_2011) & (data['donor'] == 1), 'rSMGJcEdF_tT'] = date_2011

# Linkage before 1994 is untrustworthy
data.loc[(data['entry'] < date_1994) & (data['rSMGJcEdF_tT'] > date_1994) & (data['donor'] == 1), 'entry'] = date_1994

# Log the completion of the replacement process
logging.info("Linkage for dates after 2011 and before 1994 adjusted successfully")

Block 6: Mortality Analysis Setup#

Show code cell source Hide code cell source

import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Assuming 'data' is already the DataFrame loaded previously

# Set the event indicator based on 'Died'
data['event'] = (data['rSMGJcEdF_d'] == 'Died').astype(int)

# Ensure date columns are in datetime format
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], errors='coerce')
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'], errors='coerce')
data['entry'] = data['rSMGJcEdF_t0']  # Use _t0 as entry

# Calculate the duration from origin to the event or censoring in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)

# Calculate the entry duration
data['entry_duration'] = (data['entry'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
data['entry_duration'] = data['entry_duration'].fillna(0)  # Ensure non-donor entries are set to 0

# Ensure all durations are non-negative and valid
data = data[(data['duration'] >= 0) & (data['entry_duration'] >= 0)]

# Initialize the KaplanMeierFitter
kmf = KaplanMeierFitter()

# Plot setup
plt.figure(figsize=(10, 6))

# Unique donor values
donor_values = data['donor'].unique()

# Colors for each donor type
colors = ['blue', 'green', 'red']

# Overlay plots for different donor values
for donor, color in zip(donor_values, colors):
    kmf.fit(durations=data[data['donor'] == donor]['duration'],
            event_observed=data[data['donor'] == donor]['event'],
            entry=data[data['donor'] == donor]['entry_duration'],
            label=donor)
    ax = kmf.plot_cumulative_density(ci_show=False, color=color)

# Cut off the plot at 15 years
plt.xlim(0, 15)

# Convert y-axis to percentage
vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0%}'.format(x) for x in vals])

# Labels and title
plt.xlabel("Time (years)")
plt.ylabel("Failure Probability (%)")
plt.title("Failure Function Plot (Cumulative Density) by Donor Type")
plt.legend(title="Donor Type")

# Optional: Display the plot (depends on your environment)
plt.show()

/var/folders/z3/nwmg00j178vfczmb9nk9487h0000gn/T/ipykernel_48067/3613857153.py:50: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(['{:,.0%}'.format(x) for x in vals])

_images/e8f19c100f7757f4315f4d770bd92f27b3f5b067066a12bb502c7fa74ce846a8.png

Block 7: Generate Kaplan-Meier Estimates#

Show code cell source Hide code cell source

import pandas as pd
from lifelines import KaplanMeierFitter

# Assuming 'data' is already the DataFrame loaded previously

# Set the event indicator based on 'Died'
data['event'] = (data['rSMGJcEdF_d'] == 'Died').astype(int)

# Ensure date columns are in datetime format
data['rSMGJcEdF_tT'] = pd.to_datetime(data['rSMGJcEdF_tT'], errors='coerce')
data['rSMGJcEdF_t0'] = pd.to_datetime(data['rSMGJcEdF_t0'], errors='coerce')
data['entry'] = data['rSMGJcEdF_t0']  # Use _t0 as entry

# Calculate the duration from origin to the event or censoring in years
data['duration'] = (data['rSMGJcEdF_tT'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)

# Calculate the entry duration
data['entry_duration'] = (data['entry'] - data['rSMGJcEdF_t0']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
data['entry_duration'] = data['entry_duration'].fillna(0)  # Ensure non-donor entries are set to 0

# Ensure all durations are non-negative and valid
data = data[(data['duration'] >= 0) & (data['entry_duration'] >= 0)]

# Initialize the KaplanMeierFitter
kmf = KaplanMeierFitter()

# Time points at which we want to extract survival probabilities
time_points = [5, 12, 15]

# Dictionary to hold the results
results = []

# Fit the model for each donor type and extract the survival probabilities
for donor in data['donor'].unique():
    kmf.fit(durations=data[data['donor'] == donor]['duration'],
            event_observed=data[data['donor'] == donor]['event'],
            entry=data[data['donor'] == donor]['entry_duration'],
            label=donor)
    
    for time in time_points:
        survival_prob = kmf.survival_function_at_times(time).values[0]
        results.append({
            'donor': donor,
            'time': time,
            'survival_probability': survival_prob,
            'failure_probability': 1 - survival_prob
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df.to_csv('km_results.csv', index=False)

# Display the results
print(results_df)

Only the donor results match Segev, JAMA, 2010 p. 961. The nondonors here are unmatched and so do not yield the same estimates from this paper

A webApp is quick way to approximate these findings using the base-case of the “typical” donor as per Table 1, also on page 961. Namely, 40yo white female with some college education, BMI=28, SBP=120, no hypertension, no history of smoking cigarettes, creatinine 0.9 mg/dL, and eGFR 100 mL/min

Block 8: Summarize Failure Rates#

Block 9: Kaplan-Meier Survival Plot#

Block 11: Save Processed Data#

Block 12: Cox Proportional Hazards Model#

import pandas as pd

# Assuming 'data' is already the DataFrame loaded previously

# Check for constant variables
constant_columns = [col for col in data.columns if data[col].nunique() <= 1]
print("Constant columns:", constant_columns)

Constant columns: ['_st', '_t0', 'entry_duration']

import pandas as pd
from lifelines import CoxPHFitter

# Ensure 'donor' is a categorical variable
data['donor'] = data['donor'].astype('category')

# Select necessary columns
data = data[['duration', 'event', 'donor']]

# Remove constant columns if any
constant_columns = [col for col in data.columns if data[col].nunique() <= 1]
data = data.drop(columns=constant_columns)

print("Constant columns removed:", constant_columns)

Constant columns removed: []

data.head()

	duration	donor
0	13.859001	Donor
1	9.409993	Donor
2	8.752909	Donor
3	8.273785	Donor
4	14.398357	Donor

import pandas as pd

# Load the data
file_path = '~/documents/melody/local/esrdRisk_t02tT.csv'  # Adjust the file path as needed
data = pd.read_csv(file_path)

# Check the first few rows and the column names
print(data.head())
print(data.columns)

Show code cell output Hide code cell output

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[9], line 5
# Load the data
file_path = '~/documents/melody/local/esrdRisk_t02tT.csv'  # Adjust the file path as needed
----> 5 data = pd.read_csv(file_path)
# Check the first few rows and the column names
print(data.head())

File ~/Documents/Melody/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
kwds_defaults = _refine_defaults_read(
   dialect,
   delimiter,
   (...)
   dtype_backend=dtype_backend,
)
kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/Documents/Melody/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
_validate_names(kwds.get("names", None))
# Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
if chunksize or iterator:
   return parser

File ~/Documents/Melody/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   self.options["has_index_names"] = kwds["has_index_names"]
self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/Documents/Melody/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   if "b" not in mode:
       mode += "b"
-> 1880 self.handles = get_handle(
   f,
   mode,
   encoding=self.options.get("encoding", None),
   compression=self.options.get("compression", None),
   memory_map=self.options.get("memory_map", False),
   is_text=is_text,
   errors=self.options.get("encoding_errors", "strict"),
   storage_options=self.options.get("storage_options", None),
)
assert self.handles is not None
f = self.handles.handle

File ~/Documents/Melody/myenv/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
elif isinstance(handle, str):
   # Check whether the filename is to be opened in binary mode.
   # Binary mode does not support 'encoding' and 'newline'.
   if ioargs.encoding and "b" not in ioargs.mode:
       # Encoding
--> 873         handle = open(
           handle,
           ioargs.mode,
           encoding=ioargs.encoding,
           errors=errors,
           newline="",
       )
   else:
       # Binary mode
       handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/apollo/documents/melody/local/esrdRisk_t02tT.csv'

Block 13: Save Baseline Survival and Coefficients#

// Define matrix 'b' with model coefficients
matrix define b = e(b)

// Keep baseline survival estimates
keep s0 _t 

// Sort and list baseline survival estimates
sort _t s0
list in 1/10

// Save baseline survival estimates
save ${dir}/s0.dta, replace
export delimited using ${dir}/s0.csv, replace

// Save model coefficients
matrix beta = e(b)
svmat beta
keep beta*
drop if missing(beta1)
list 
save ${dir}/b.dta, replace
export delimited using ${dir}/b.csv, replace