Content with notebooks

You can also create content with Jupyter Notebooks. This means that you can include code blocks and their outputs in your book.

Markdown + notebooks

As it is markdown, you can embed images, HTML, etc. into your posts!

You can also \(add_{math}\) and

\[ math^{blocks} \]

or

\[\begin{split} \begin{aligned} \mbox{mean} la_{tex} \\ \\ math blocks \end{aligned} \end{split}\]

But make sure you $Escape $your $dollar signs $you want to keep!

MyST markdown

MyST markdown works in Jupyter Notebooks as well. For more information about MyST markdown, check out the MyST guide in Jupyter Book, or see the MyST markdown documentation.

Code blocks and outputs

Jupyter Book will also embed your code blocks and output in your book. For example, here’s some sample Matplotlib code:

from matplotlib import rcParams, cycler
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd

# Switch Matplotlib to interactive mode before any figures are created
plt.ion()
<contextlib.ExitStack at 0x103c6b690>
from matplotlib.lines import Line2D

# Seed NumPy's global RNG so the noisy curves are the same on every run
np.random.seed(19680801)

# N noisy log-spaced curves, each lifted by its index; transposed so that
# every curve ends up as one column of `data`
N = 10
data = np.array(
    [np.logspace(0, 1, 100) + np.random.randn(100) + ii for ii in range(N)]
).T

# Colour the curves by stepping evenly through the coolwarm colormap
cmap = plt.cm.coolwarm
rcParams['axes.prop_cycle'] = cycler(color=cmap(np.linspace(0, 1, N)))

# Three proxy handles (both ends and the midpoint of the colormap) so the
# legend shows three entries instead of one per curve
custom_lines = [
    Line2D([0], [0], color=cmap(level), lw=4)
    for level in (0., .5, 1.)
]

fig, ax = plt.subplots(figsize=(10, 5))
lines = ax.plot(data)
# The trailing ';' presumably keeps the notebook from echoing the Legend object
ax.legend(custom_lines, ['Cold', 'Medium', 'Hot']);
_images/669ed9ca52edff2851045aeafccd6f855f4d163ea9713cb0f374ff6852774c82.png

There is a lot more that you can do with outputs (such as including interactive outputs) with your book. For more information about this, see the Jupyter Book documentation.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

# Load the donor CSV, letting pandas infer every column's dtype
df = pd.read_csv('~/documents/github/csv/donor_live.csv', low_memory=False)

# Verify the load.
# NOTE(review): info() writes its own report and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(df.info())

# Coerce the columns below to numbers (errors='coerce' turns anything that
# cannot be parsed into NaN).
# NOTE(review): don_race holds labels such as "8: White"; after coercion
# describe() reports only 838 non-null values for it, so confirm it really
# belongs in this list.
numeric_cols = ['pers_id', 'don_age_in_months', 'don_age', 'don_race']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Columns treated as continuous for the summary table and the histograms
continuous_vars = ['don_age_in_months', 'don_age', 'don_race']

# Basic statistics for the continuous variables
print(df[continuous_vars].describe())

# One 30-bin histogram per continuous variable, each in its own figure
for var in continuous_vars:
    plt.figure()
    hist_ax = df[var].dropna().hist(bins=30)
    hist_ax.set(title=f'Histogram of {var}', xlabel=var, ylabel='Frequency')
    plt.show()

# Columns summarised by counting how often each value occurs
binary_vars = ['don_gender', 'don_ethnicity_srtr']
categorical_vars = ['don_ty', 'don_home_state', 'don_race_srtr']

# One bar chart of value counts per binary/categorical variable
for var in binary_vars + categorical_vars:
    plt.figure()
    bar_ax = df[var].value_counts().plot(kind='bar')
    bar_ax.set(title=f'Distribution of {var}', xlabel=var, ylabel='Count')
    plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173929 entries, 0 to 173928
Data columns (total 36 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   don_id                           173929 non-null  object 
 1   pers_id                          173929 non-null  int64  
 2   don_ty                           173929 non-null  object 
 3   don_age_in_months                173922 non-null  float64
 4   don_age                          173922 non-null  float64
 5   don_gender                       173929 non-null  object 
 6   don_home_state                   170492 non-null  object 
 7   don_ethnicity_pre2004            131260 non-null  float64
 8   don_race_pre2004                 131260 non-null  float64
 9   don_race                         173851 non-null  object 
 10  don_race_srtr                    173815 non-null  object 
 11  don_ethnicity_srtr               173929 non-null  object 
 12  don_home_country                 0 non-null       float64
 13  don_hgt_cm                       130924 non-null  float64
 14  don_wgt_kg                       132825 non-null  float64
 15  don_recov_dt                     173929 non-null  object 
 16  don_relationship_ty              172919 non-null  object 
 17  don_diab                         101127 non-null  object 
 18  don_hyperten_diet                3283 non-null    object 
 19  don_hyperten_diuretics           3283 non-null    object 
 20  don_hyperten_other_meds          3283 non-null    object 
 21  don_ki_creat_preop               128276 non-null  float64
 22  don_bp_preop_syst                118935 non-null  float64
 23  don_bp_preop_diast               118915 non-null  float64
 24  pers_esrd_death_dt               79 non-null      float64
 25  pers_ssa_death_dt                1395 non-null    object 
 26  don_hyperten_postop              101066 non-null  object 
 27  pers_esrd_first_service_dt       706 non-null     float64
 28  pers_esrd_first_dial_dt          690 non-null     float64
 29  pers_esrd_ki_dgn                 332 non-null     float64
 30  don_race_american_indian         173929 non-null  int64  
 31  don_race_asian                   173929 non-null  int64  
 32  don_race_black_african_american  173929 non-null  int64  
 33  don_race_hispanic_latino         173929 non-null  int64  
 34  don_race_native_hawaiian         173929 non-null  int64  
 35  don_race_white                   173929 non-null  int64  
dtypes: float64(14), int64(7), object(15)
memory usage: 47.8+ MB
None
       don_age_in_months        don_age    don_race
count      173922.000000  173922.000000  838.000000
mean          493.620244      40.676142   65.250597
std           139.174427      11.600951   48.149922
min             0.000000       0.000000   24.000000
25%           385.000000      32.000000   40.000000
50%           489.000000      40.000000   40.000000
75%           595.000000      49.000000   72.000000
max          1151.000000      95.000000  248.000000
_images/684389f364195f892ed335c4439843ceb1aec7dacf2a24027f24234a2e35e226.png _images/5a55f653ded79422ced068e1776282eb2db3e24067934dd48051f557903d9891.png _images/a572bb7900e6a7d4f10f966328edb2b1da535224fe2c49682c4d9795fed2f037.png _images/c772b73e79893149962e09d8fab5f43d656537871a1b746dceb7c08ddf24bc69.png _images/1e331fbccee74bb6595891f879e805a4d014b89a457c93eb0b4db879962bca78.png _images/4e590b77575156a0535c4f08eae6fe61e50a66dfaa520fff8d65e308701a1eae.png _images/8f6a50050f403f7334731e8cb0290d3c49588a7d20fff53dc990c94d04f18d5e.png _images/a8b75e259071e820e8f4cf901e685d952ccaa9a9059ed605b22c1f4c29dfa29e.png
# Read the CSV file into a DataFrame
# (reassigning df discards the version whose columns were coerced to numeric earlier)
df = pd.read_csv('~/documents/github/csv/donor_live.csv')

# Show the DataFrame

You can find the dimensions of a DataFrame using the .shape attribute. This attribute returns a tuple representing the dimensions of the DataFrame, where the first element is the number of rows and the second is the number of columns.

Here’s how to find the dimensions of your DataFrame df:

# Get the dimensions of the DataFrame
rows, cols = df.shape  # unpack the (rows, columns) tuple

# Print the dimensions
print(f"The DataFrame has {rows} rows and {cols} columns.")
The DataFrame has 173929 rows and 36 columns.

This will give you a quick idea of how large your dataset is.


Next, you can use the .head() method to print the first 5 rows of your DataFrame. This is a good way to get a feel for what information is in your dataset. You can pass a number into the .head() method to print a different number of rows. For example, df.head(10) will print the first 10 rows of your DataFrame.

# Preview the first five rows
df.head()
don_id pers_id don_ty don_age_in_months don_age don_gender don_home_state don_ethnicity_pre2004 don_race_pre2004 don_race ... don_hyperten_postop pers_esrd_first_service_dt pers_esrd_first_dial_dt pers_esrd_ki_dgn don_race_american_indian don_race_asian don_race_black_african_american don_race_hispanic_latino don_race_native_hawaiian don_race_white
0 HRG962 289 L 490.0 40.0 F TX 2.0 8.0 8: White ... N NaN NaN NaN 0 0 0 0 0 1
1 IT5273 484 L 312.0 26.0 F OH 2.0 8.0 8: White ... N NaN NaN NaN 0 0 0 0 0 1
2 HRL860 840 L 648.0 54.0 M ND 2.0 8.0 8: White ... N NaN NaN NaN 0 0 0 0 0 1
3 ZWB847 967 L 564.0 47.0 M NY 1.0 8.0 2000: Hispanic/Latino ... NaN NaN NaN NaN 0 0 0 1 0 0
4 ALB852 1145 L 326.0 27.0 F CA 1.0 8.0 2000: Hispanic/Latino ... NaN NaN NaN NaN 0 0 0 1 0 0

5 rows × 36 columns

You can restrict the DataFrame to a subset of rows and columns using slicing. Here’s how to keep only the first 1000 rows and the first 9 columns:

# Subset the DataFrame: positional slices keep rows 0-999 and columns 0-8
subset_df = df.iloc[:1000, :9]

# Show the shape of the new DataFrame to confirm
print(f"The shape of the subset DataFrame is {subset_df.shape}")
The shape of the subset DataFrame is (1000, 9)

Explanation:

  • df.iloc[:1000, :9] uses the iloc indexer for Pandas DataFrames. The :1000 selects the first 1000 rows and the :9 selects the first 9 columns.

  • subset_df.shape will display the shape of the new DataFrame, which should be (1000, 9) to confirm that you’ve correctly subsetted it.

Now subset_df contains only the first 1000 rows and the first 9 columns of the original df.

# Preview the first five rows of the subset
subset_df.head()
don_id pers_id don_ty don_age_in_months don_age don_gender don_home_state don_ethnicity_pre2004 don_race_pre2004
0 HRG962 289 L 490.0 40.0 F TX 2.0 8.0
1 IT5273 484 L 312.0 26.0 F OH 2.0 8.0
2 HRL860 840 L 648.0 54.0 M ND 2.0 8.0
3 ZWB847 967 L 564.0 47.0 M NY 1.0 8.0
4 ALB852 1145 L 326.0 27.0 F CA 1.0 8.0
# Select the first 100 rows and cherry-pick columns by position (here, the columns at index 3, 4, and 6)
subset_csv = df.iloc[:100, [3, 4, 6]]
# Write the selection to a CSV file, leaving out the row index
subset_csv.to_csv('~/documents/github/work/subset.csv', index=False)
# Columns that describe() summarises are taken as the likely continuous variables
cont_summary = subset_df.describe()
print("Continuous Variables:")
print(list(cont_summary.columns))

# Binary variables: columns with exactly two distinct non-missing values
# NOTE(review): this can overlap with the continuous list above, e.g.
# don_ethnicity_pre2004 is reported under both headings.
unique_counts = subset_df.nunique()
binary_vars = [name for name, count in unique_counts.items() if count == 2]
print("\nBinary Variables:")
print(binary_vars)

# Categorical variables: object-dtype columns not already flagged as binary
object_cols = subset_df.select_dtypes(include=['object']).columns
cat_vars = [name for name in object_cols if name not in binary_vars]
print("\nCategorical Variables:")
print(cat_vars)
Continuous Variables:
['pers_id', 'don_age_in_months', 'don_age', 'don_ethnicity_pre2004', 'don_race_pre2004']

Binary Variables:
['don_gender', 'don_ethnicity_pre2004']

Categorical Variables:
['don_id', 'don_ty', 'don_home_state']
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Sample data (replace with your own)
ages = subset_df['don_age'].dropna()  # Remove NaN values if any

# Add jitter to x-values for visualization
np.random.seed(42)
jitter = np.random.normal(0, 0.1, len(ages))

# Scatter plot
plt.scatter(jitter, ages, color='lime', alpha=0.5, label='Data Points')

# Calculate the mean age and the 95% CI of that mean.
# The CI half-width is t * standard error, where standard error = s / sqrt(n).
# Multiplying t by the standard deviation alone would describe the spread of
# individual ages, not the uncertainty of the mean.
mean_age = np.mean(ages)
confidence = 0.95
std_dev = np.std(ages, ddof=1)  # sample standard deviation (n - 1 denominator)
std_err = std_dev / np.sqrt(len(ages))
ci = std_err * stats.t.ppf((1 + confidence) / 2, len(ages) - 1)

# Overlay mean as a horizontal line (in gray)
plt.axhline(y=mean_age, color='gray', linestyle='--', label=f'Mean Age: {mean_age:.2f}')

# Overlay mean as a blue dot (zorder keeps it on top of the other markers)
plt.scatter(0, mean_age, color='blue', zorder=5, label='Mean')

# Overlay 95% CI as a vertical error bar (in orange)
plt.errorbar(0, mean_age, yerr=ci, color='orange', fmt='o', capsize=5, label=f'95% CI: {mean_age - ci:.2f} - {mean_age + ci:.2f}')

# Remove x-axis ticks and label: the x position is only jitter and carries no meaning
plt.xticks([])
plt.xlabel('')

plt.ylabel('Age (years)')
plt.title('Jittered Scatter Plot of Age with Overlaid Mean and 95% CI')
plt.legend()
plt.tight_layout()
plt.show()
_images/b35ecc1de37f275b5299bd9e729da3e8cc20670d7ad50eff46f0f99710fe8a87.png