According to the CDC, heart disease is the leading cause of death across all ages, races, and sexes in the United States. Causing almost 700 thousand deaths over the past 20 years, it is a disease that has afflicted many families over time, and it has affected us personally as well. Jason's father passed away from a heart attack when Jason was 9. It was very unexpected, and at the time he felt almost powerless. Now, with knowledge of computer science and data science, he wanted to see if we could use the skills we learned in this class to try to make some sense of this unexpected tragedy. This brings us to our motivation: what factors most influence the chance of heart disease, and what demographic and health profiles are most prone to heart disease?
In this tutorial, we will perform some exploratory data analysis to gain insights about our dataset and draw initial conclusions, use inferential statistics to determine the health profile of individuals who are most prone to heart disease, and build several classifiers to predict whether or not someone will have heart disease based on their demographic and health profile.
In this tutorial, we use Python 3, along with several key libraries such as pandas, matplotlib, seaborn, statsmodels, and more.
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.patches
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.weightstats
import warnings
warnings.filterwarnings('ignore')
We start by collecting and displaying our data from an online dataset:
df = pd.read_csv('heart.csv')
df.head()
| | age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
We add a key here for easy reference in understanding the variable names, as well as some medical definitions for better interpretation. Please visit this resource for a more detailed description.
age: years
trtbps (resting blood pressure): mm Hg
chol (serum cholesterol): mg/dl
thalachh (maximum heart rate achieved): bpm
oldpeak (ST depression induced by exercise relative to rest): mm
sex:
0 = female
1 = male
cp (chest pain type):
0 = typical angina
1 = atypical angina
2 = non-anginal pain
3 = asymptomatic
fbs (fasting blood sugar):
0 = lower than or equal to 120 mg/dl
1 = greater than 120 mg/dl
restecg (resting ECG):
0 = normal
1 = ST-T wave abnormality
2 = left ventricular hypertrophy
exng (exercise induced angina):
0 = no
1 = yes
slp (slope of the peak exercise ST segment):
0 = upsloping
1 = flat
2 = downsloping
caa (number of major vessels):
0 = no major vessels
1 = 1 major vessel
2 = 2 major vessels
3 = 3 major vessels
thall (thalassemia):
1 = fixed defect
2 = normal
3 = reversible defect
output:
0 = no heart disease
1 = heart disease
Angina: chest pain that results from reduced blood flow to the heart
Cholesterol: a waxy, fat-like substance produced by the liver and found in the bloodstream
ECG: a test commonly done to identify heart problems that measures electrical signals in the heart
ST Segment: the flat, isoelectric part of the ECG that represents the interval between ventricular depolarization and repolarization
ST Depression: a type of ST segment abnormality
Thalassemia: a genetic blood disorder characterized by a lower level of hemoglobin than normal
In this section, we will perform some data cleaning and preprocessing to account for any missing data and adjust representations of our data so that the visualizations we create in the following sections are human-readable.
We start by checking if there are any missing values in the dataset:
df.info()
df.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    int64
 1   sex       303 non-null    int64
 2   cp        303 non-null    int64
 3   trtbps    303 non-null    int64
 4   chol      303 non-null    int64
 5   fbs       303 non-null    int64
 6   restecg   303 non-null    int64
 7   thalachh  303 non-null    int64
 8   exng      303 non-null    int64
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64
 11  caa       303 non-null    int64
 12  thall     303 non-null    int64
 13  output    303 non-null    int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64
There are no missing values in our dataset, which is good. However, there can still be invalid category codes. Let's check the number of unique values in each column:
df.nunique()
age          41
sex           2
cp            4
trtbps       49
chol        152
fbs           2
restecg       3
thalachh     91
exng          2
oldpeak      40
slp           3
caa           5
thall         4
output        2
dtype: int64
From this output, we can see that something is off with the variable 'caa'. From the definitions, 'caa' should only take values from 0-3, but the output lists 5 unique values, which means there is an extra value. Let's find out what that value is:
df.caa.value_counts()
0    175
1     65
2     38
3     20
4      5
Name: caa, dtype: int64
As we can see from the output, some patients have a value of 4, which is an invalid data point. To remedy this, we replace all values of 4 with NaN:
df.loc[df['caa'] == 4, 'caa'] = np.NaN
From the same output, we can also see that something is off with the variable 'thall'. From the definitions, 'thall' should only take values from 1-3, but the output lists 4 unique values, which means there is an extra value. In the same vein as with 'caa', we find and replace these invalid data points:
df.thall.value_counts()
2    166
3    117
1     18
0      2
Name: thall, dtype: int64
df.loc[df['thall'] == 0, 'thall'] = np.NaN
Now that we've converted these invalid codes into missing values, we need to handle them somehow. Because the dataset itself is not very large and these values are not missing at random, we chose to impute the missing values rather than simply drop those rows. To be robust to outliers, we use the median instead of the mean for the replacement:
df = df.fillna(df.median())
df.isnull().sum()
age 0 sex 0 cp 0 trtbps 0 chol 0 fbs 0 restecg 0 thalachh 0 exng 0 oldpeak 0 slp 0 caa 0 thall 0 output 0 dtype: int64
# Calculate Interquartile Range and outlier step for each column with continuous variables, and drop any values that are outside of the range of (Q1 - outlier step, Q3 + outlier step)
continuous_variables = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
for var in continuous_variables:
    var_data = df[var]
    Q1 = np.percentile(var_data, 25.)
    Q3 = np.percentile(var_data, 75.)
    IQR = Q3 - Q1
    outlier_step = IQR * 1.5
    outliers = var_data[~((var_data >= Q1 - outlier_step) & (var_data <= Q3 + outlier_step))].index.tolist()
    df.drop(outliers, inplace=True, errors='ignore')
We then run a statistics summary to check that the minimum and maximum values of the categorical variables are reasonable:
df.describe()
| | age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 |
mean | 53.992958 | 0.700704 | 0.989437 | 129.911972 | 242.077465 | 0.140845 | 0.524648 | 150.042254 | 0.316901 | 0.948592 | 1.429577 | 0.637324 | 2.309859 | 0.559859 |
std | 9.161678 | 0.458758 | 1.031258 | 15.368133 | 44.753834 | 0.348476 | 0.514206 | 22.692716 | 0.466090 | 1.026141 | 0.599150 | 0.904727 | 0.578570 | 0.497280 |
min | 29.000000 | 0.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 88.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 46.750000 | 0.000000 | 0.000000 | 120.000000 | 209.750000 | 0.000000 | 0.000000 | 133.750000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 |
50% | 55.000000 | 1.000000 | 1.000000 | 130.000000 | 239.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.600000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 |
75% | 60.000000 | 1.000000 | 2.000000 | 140.000000 | 270.250000 | 0.000000 | 1.000000 | 168.000000 | 1.000000 | 1.600000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 |
max | 77.000000 | 1.000000 | 3.000000 | 170.000000 | 360.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 4.000000 | 2.000000 | 3.000000 | 3.000000 | 1.000000 |
We see that all categorical variables now have reasonable values. Lastly, for better visualization and interpretation, we create a new DataFrame and a dictionary with verbose labels for the categorical variables, and add a column to our original DataFrame with the verbose labeling for 'output':
df_visual = pd.DataFrame()
df_visual['sex'] = df.sex.replace({0: "Female", 1: "Male"})
df_visual['cp'] = df.cp.replace({0: "Typical Angina",
1: "Atypical Angina",
2: "Non-Anginal Pain",
3: "Symptomatic"})
df_visual['fbs'] = df.fbs.replace({0: "False", 1: "True"})
df_visual['restecg'] = df.restecg.replace({0: "normal", 1: "ST-T Wave Abnormality", 2: "Left Ventricular Hypertrophy"})
df_visual['exng'] = df.exng.replace({1: "Yes", 0: "No"})
df_visual['slp'] = df.slp.replace({0: "Upsloping", 1: "Flat", 2:"Downsloping"})
df_visual['caa'] = df.caa.replace({0: "0", 1: "1", 2: "2", 3: "3"})
df_visual['thall'] = df.thall.replace({0: "Null", 1: "Fixed Defect", 2: "Reversible Defect", 3: "Normal"})
df_visual['output'] = df.output.replace({1: "Heart Disease", 0: "No Heart Disease"})
df['output_name'] = df.output.replace({1: "Heart Disease", 0: "No Heart Disease"})
df_names = {'age': 'Age', 'sex': 'Sex', 'cp': 'Chest Pain', 'trtbps': 'Resting Blood Pressure', 'chol': 'Cholesterol', 'fbs': 'Fasting Blood Sugar', 'restecg': 'Resting ECG', 'thalachh': 'Maximum Heart Rate', 'exng': 'Exercise-Induced Angina', 'oldpeak': 'ST Depression', 'slp': 'Slope', 'caa': 'Number of Major Vessels', 'thall': 'Thalassemia', 'output': 'Heart Disease'}
In this section, we will plot histograms for each variable to help us gain better insight of how our data is distributed. Following that, we will look at the correlations between variables and how they change when we divide up the data with respect to 'output', which represents whether or not the patient is likely to have heart disease.
We first visualize the distributions for each of the variables and draw some preliminary conclusions based off these plots:
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(30, 20))
for i, var in enumerate(df_names):
    ax = axes[int(i/4), i%4]
    sns.set(font_scale=.9)
    if var in df_visual:
        sns.histplot(data=df_visual, x=var, ax=ax)
        ax.set_xlabel(df_names[var] + ' Distribution')
    else:
        sns.histplot(data=df, x=var, ax=ax)
        ax.set_xlabel(df_names[var] + ' Distribution')
ax = axes[3, 2]
ax.axis('off')
ax = axes[3, 3]
ax.axis('off')
Now, let's take a look at the correlation between all these variables:
sns.set(style="white")
plt.rcParams['figure.figsize'] = (15, 10)
sns.heatmap(df.corr(), annot = True, linewidths=.5, cmap="Blues")
plt.title('Correlation Between Variables', fontsize = 20)
plt.show()
This correlation matrix can help us answer one of our motivating questions: what factors most influence the chance of heart disease? If we focus on the final row of the matrix, we can see the correlation coefficient of each variable with the 'output' variable; values in this row with a darker blue background are more strongly correlated with 'output'. The largest of these coefficients belong to chest pain, resting ECG, maximum heart rate, and the slope of the peak exercise ST segment, so we conclude that these are the factors that most influence the chance of heart disease.
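To make this concrete, here is one quick way (a minimal sketch, using the DataFrames defined above; the variable name is ours) to read off that final row directly, sorted by absolute correlation strength:
# Correlation of each variable with 'output', sorted by absolute value (sign shows direction)
corr_with_output = df[list(df_names)].corr()['output'].drop('output')
print(corr_with_output.reindex(corr_with_output.abs().sort_values(ascending=False).index))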
Next, let's visualize these factors according to whether or not each patient has heart disease and draw conclusions based on these plots. We'll start by plotting the continuous variables with regression lines to see how each variable influences heart disease:
continuous_variables = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(20, 50))
for i, var in enumerate(continuous_variables):
    ax = axes[i]
    # Prepare data for plotting
    df_copy = pd.DataFrame({var: df.sort_values(by = [var])[var].unique(), 'heart disease': np.nan, 'no heart disease': np.nan})
    for var_point in df.sort_values(by = [var])[var].unique():
        idx = df_copy.index[df_copy[var] == var_point]
        num_heart_diseases = len(df.loc[(df[var] == var_point) & (df['output'] == 1)]['output'])
        num_no_heart_diseases = len(df.loc[(df[var] == var_point) & (df['output'] == 0)]['output'])
        df_copy.loc[idx, 'heart disease'] = num_heart_diseases
        df_copy.loc[idx, 'no heart disease'] = num_no_heart_diseases
    # Plot data
    labels = df_copy[var]
    x = np.arange(len(labels))
    width = 0.4
    bar1 = ax.bar(x - width/2, df_copy['heart disease'], width, label="Heart Disease")
    bar2 = ax.bar(x + width/2, df_copy['no heart disease'], width, label="No Heart Disease")
    # Plot regression lines
    sns.regplot(x=df_copy.index, y='heart disease', data=df_copy, ax=ax, scatter_kws={'s': 0})
    sns.regplot(x=df_copy.index, y='no heart disease', data=df_copy, ax=ax, scatter_kws={'s': 0})
    ax.set_ylabel("Counts")
    ax.set_title("Heart Disease Based on " + df_names[var], weight='bold')
    ax.set_xlabel(df_names[var])
    if var == 'chol' or var == 'thalachh':
        ax.set_xticks(x, labels, fontsize=5)
    else:
        ax.set_xticks(x, labels, fontsize=10)
    ax.legend()
Next, let's visualize continuous variable distributions together according to whether or not the patient had heart disease:
continuous_variables = ["age","trtbps","chol","thalachh","oldpeak"]
sns.pairplot(df[continuous_variables + ['output_name']], hue='output_name')
Next, we'll plot the categorical variables, split with respect to 'output' to see how each of these variables influences heart disease:
categorical_variables = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(40, 30))
for i, var in enumerate(categorical_variables):
    ax = axes[int(i/3), i%3]
    name = df_visual[var]
    sns.set(font_scale=1.9)
    sns.countplot(x=var, hue='output', data=df_visual, palette='muted', ax=ax)
    ax.set_title("Heart Disease Based on " + df_names[var], fontsize = 18, weight = 'bold')
    ax.set_xlabel(df_names[var])
    # Calculate percentages
    totals = []
    for patch in ax.patches:
        totals.append(patch.get_height())
    total = sum(totals)
    for patch in ax.patches:
        if patch.get_height() > 10:
            ax.text(patch.get_x()+.1, patch.get_height()-5,
                    str(round((patch.get_height()/total)*100, 2))+'%', fontsize=14,
                    color='white', weight = 'bold')
plt.tight_layout()
ax = axes[2, 2]
ax.axis('off')
Lastly, let's look at the distributions of the categorical variables for patients with heart disease only, split between males and females. This will give us insights that will be useful in answering our motivating question in the following section:
# Visualize distributions for males with heart disease
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(40, 30))
categorical_variables = ['exng','caa','cp','fbs','restecg','slp','thall']
for i, var in enumerate(categorical_variables):
    ax = axes[int(i/3), i%3]
    sns.set(font_scale=1.25)
    sns.histplot(data=df_visual[(df_visual['output'] == 'Heart Disease') & (df_visual['sex'] == 'Male')], x=var, ax=ax)
    ax.set_xlabel(df_names[var] + " Distribution for Males with Heart Disease")
ax = axes[2, 1]
ax.axis('off')
ax = axes[2, 2]
ax.axis('off')
# Visualize distributions for females with heart disease
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(40, 30))
categorical_variables = ['exng','caa','cp','fbs','restecg','slp','thall']
for i, var in enumerate(categorical_variables):
    ax = axes[int(i/3), i%3]
    sns.set(font_scale=1.25)
    sns.histplot(data=df_visual[(df_visual['output'] == 'Heart Disease') & (df_visual['sex'] == 'Female')], x=var, ax=ax)
    ax.set_xlabel(df_names[var] + " Distribution for Females with Heart Disease")
ax = axes[2, 1]
ax.axis('off')
ax = axes[2, 2]
ax.axis('off')
In this section, we combine the insights gained about the categorical variables from our exploratory data analysis with inferential statistics on the continuous variables to build a kind of 'health profile' that answers our second motivating question: what demographic and health profiles are most prone to heart disease? For each categorical variable, we identify the category that is most common among diseased patients, separately for males and females, based on the plots above. For each continuous variable, we establish a 95% confidence interval for its average value among diseased patients. Using this information, we build a profile of the males and females most prone to heart disease.
We start by calculating and collecting the information for males and females:
male_data = []
male_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 1)]['age']).zconfint_mean())
male_data.append('Non-Anginal Pain')
male_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 1)]['trtbps']).zconfint_mean())
male_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 1)]['chol']).zconfint_mean())
male_data.append('<= 120 mg/dl')
male_data.append('ST-T Wave Abnormality')
male_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 1)]['thalachh']).zconfint_mean())
male_data.append('No')
male_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 1)]['oldpeak']).zconfint_mean())
male_data.append('Downsloping')
male_data.append('0')
male_data.append('Normal')
female_data = []
female_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 0)]['age']).zconfint_mean())
female_data.append('Non-Anginal Pain')
female_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 0)]['trtbps']).zconfint_mean())
female_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 0)]['chol']).zconfint_mean())
female_data.append('<= 120 mg/dl')
female_data.append('ST-T Wave Abnormality')
female_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 0)]['thalachh']).zconfint_mean())
female_data.append('No')
female_data.append(statsmodels.stats.weightstats.DescrStatsW(df[(df['output'] == 1) & (df['sex'] == 0)]['oldpeak']).zconfint_mean())
female_data.append('Downsloping')
female_data.append('0')
female_data.append('Normal')
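As a sanity check on what zconfint_mean() is doing: it returns, to a very close approximation, the familiar normal-theory interval of mean ± 1.96 standard errors. A minimal sketch for one of the variables above (the helper names here are ours, just for illustration):
# Reproduce, by hand, the 95% confidence interval for age among males with heart disease
ages = df[(df['output'] == 1) & (df['sex'] == 1)]['age']
se = ages.std(ddof=1) / np.sqrt(len(ages))  # standard error of the mean
print((ages.mean() - 1.96 * se, ages.mean() + 1.96 * se))  # should closely match male_data[0]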
Now that we've collected all the necessary information, we can assemble a demographic and health profile of the individuals most prone to heart disease:
rows = ['Age', 'Chest Pain', 'Resting Blood Pressure', 'Cholesterol', 'Fasting Blood Sugar', 'Resting ECG', 'Maximum Heart Rate', 'Exercise-Induced Angina', 'ST Depression Segment', 'Slope of Peak', 'Number of Major Vessels', 'Thalassemia']
cols = ['Male', 'Female']
fig, ax = plt.subplots(figsize=(11, 8))
ax.set_ylim(-1, 12)
ax.set_xlim(0, 3.5)
ax.text(x=1.5, y=11.75, s='Male', va='center', ha='center', weight='bold')
ax.text(x=2.75, y=11.75, s='Female', va='center', ha='center', weight='bold')
for i, row in enumerate(reversed(rows)):
    ax.text(x=.15, y=i, s=row, va='center', ha='left', fontsize=8, weight='bold')
for i, row in enumerate(reversed(male_data)):
    ax.text(x=1.5, y=i, s=row, va='center', ha='center', fontsize=8, weight='bold')
for i, row in enumerate(reversed(female_data)):
    ax.text(x=2.75, y=i, s=row, va='center', ha='center', fontsize=8, weight='bold')
for i, row in enumerate(rows):
    ax.plot(
        [0, 4],
        [i - .5, i - .5],
        ls=':',
        lw='.5',
        c='grey'
    )
ax.plot([0, 4], [11.5, 11.5], lw='.5', c='black')
male_bg = matplotlib.patches.Rectangle(
(.875, -.5), 1.25, 12, ec='none', fc='lightblue', alpha=.2, zorder=-1
)
female_bg = matplotlib.patches.Rectangle(
(2.125, -.5), 1.25, 12, ec='none', fc='lightgreen', alpha=.2, zorder=-1
)
ax.add_patch(male_bg)
ax.add_patch(female_bg)
ax.axis('off')
From this table, we can read off the characteristics of the males who are most prone to heart disease, and likewise the characteristics of the females who are most prone to heart disease.
Below, we create bins to highlight certain variables and the differences between groups. Binning groups a continuous variable's values into discrete ranges ('bins'); here we start by binning age into three major age groups.
min_value = df['age'].min()
max_value = df['age'].max()
bins = np.linspace(min_value,max_value,4)
labels = ['29-45', '45-61','61-77']
df['age_bins'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)
sns.histplot(df['age_bins'], bins=3)
Angina is a type of chest pain that is caused by reduced blood flow to the heart.
Asymptomatic refers to chest pain that doesn't fall under the usual anginal categories.
min_value = df['cp'].min()
max_value = df['cp'].max()
bins = np.linspace(min_value,max_value,5)
labels = ['asymptomatic', 'typical_angina','atypical_angina', 'non_anginal']
df['cp_bins'] = pd.cut(df['cp'], bins=bins, labels=labels, include_lowest=True)
sns.histplot(df['cp_bins'], bins=3)
Here, we're creating classifiers based on the following characteristics: sex (male), chest pain type (asymptomatic), age, and cholesterol. These are some of the factors that can contribute to heart attacks; in particular, they're some of the variables known to Jason that could have played a role in his father's heart attack. We want to investigate how accurately a generic male with this profile can be classified as having or not having had a heart attack.
We create a copy of our original DataFrame that we're going to use and expand upon to build our classifiers.
First, we assign a descriptive name to each numeric chest pain value. This will make the numbers easier to interpret later when we decide on classifier variables. The assignment of what each number represents can be found in the link description at the top.
# putting a name to chest pain to get dummy columns
classify = df.copy()
for index, row in classify.iterrows():
    if row['cp'] == 1:
        classify.at[index,'cp'] = 'typical_angina'
    elif row['cp'] == 2:
        classify.at[index, 'cp'] = 'atypical_angina'
    elif row['cp'] == 3:
        classify.at[index, 'cp'] = 'non_anginal'
    elif row['cp'] == 0:
        classify.at[index, 'cp'] = 'asymptomatic'
Next, we take the numeric sex values and turn them into data that makes sense to us. A male is designated by a 1, so we assign it the label 'male'; similarly, 0 becomes 'female'.
# putting a name to sex to get dummy columns
for index, row in classify.iterrows():
    if row['sex'] == 1:
        classify.at[index,'sex'] = 'male'
    elif row['sex'] == 0:
        classify.at[index, 'sex'] = 'female'
classify
| | age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | output_name | age_bins | cp_bins |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | male | non_anginal | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0.0 | 1.0 | 1 | Heart Disease | 61-77 | non_anginal |
1 | 37 | male | atypical_angina | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0.0 | 2.0 | 1 | Heart Disease | 29-45 | atypical_angina |
2 | 41 | female | typical_angina | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0.0 | 2.0 | 1 | Heart Disease | 29-45 | typical_angina |
3 | 56 | male | typical_angina | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0.0 | 2.0 | 1 | Heart Disease | 45-61 | typical_angina |
4 | 57 | female | asymptomatic | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0.0 | 2.0 | 1 | Heart Disease | 45-61 | asymptomatic |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | female | asymptomatic | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0.0 | 3.0 | 0 | No Heart Disease | 45-61 | asymptomatic |
299 | 45 | male | non_anginal | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0.0 | 3.0 | 0 | No Heart Disease | 29-45 | non_anginal |
300 | 68 | male | asymptomatic | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2.0 | 3.0 | 0 | No Heart Disease | 61-77 | asymptomatic |
301 | 57 | male | asymptomatic | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1.0 | 3.0 | 0 | No Heart Disease | 45-61 | asymptomatic |
302 | 57 | female | typical_angina | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1.0 | 2.0 | 0 | No Heart Disease | 45-61 | typical_angina |
284 rows × 17 columns
Next, we import the libraries that will be used for our classifiers; they cover logistic regression, train/test splitting, decision trees, and several evaluation metrics:
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, accuracy_score)
Here, we create binary columns for each of the characteristics we mentioned above. Now 'male' will be its own column, with a 1 or 0 symbolizing true or false depending on whether the person is male. Similarly, we break up the four different types of chest pain, along with the binary variable of being either 'Heart Disease' or 'No Heart Disease'.
dummies = pd.get_dummies(classify['sex'])
classify = pd.concat([classify, dummies], axis = 1)
dummies = pd.get_dummies(classify['cp'])
classify = pd.concat([classify, dummies], axis = 1)
dummies = pd.get_dummies(classify['output_name'])
classify = pd.concat([classify, dummies], axis = 1)
classify.columns
Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output', 'output_name', 'age_bins', 'cp_bins', 'female', 'male', 'asymptomatic', 'atypical_angina', 'non_anginal', 'typical_angina', 'Heart Disease', 'No Heart Disease'], dtype='object')
We're going to use an 80/20 split, which is fairly common: we train our classifier on 80% of our data and reserve the remaining 20% to test how well it was trained. The random state is essentially a 'seed'; if you want to reproduce these results while still having a randomness factor, use this seed. The number 42 is a bit of an inside joke from The Hitchhiker's Guide to the Galaxy by Douglas Adams, but it is still just a seed.
train, test = train_test_split(classify, test_size = 0.2, stratify = classify['output'], random_state = 42)
These are the feature names we're going to look at. We build the classifier around a person who is male with asymptomatic chest pain, while letting their age and cholesterol values help shape the classifier. We compare these features against our class name, 'output', which is simply whether or not the person had heart disease, and is what we're trying to predict.
# looking at males within a certain age
fn = ['male', 'age', 'asymptomatic', 'chol']
cn = ['output']
Our X variables hold the features we're investigating, and our y variables hold the class label; we split each into training and test sets:
X_train = train[fn]
y_train = train[cn]
X_test = test[fn]
y_test = test[cn]
Let's take a look at the logistic regression model based on the training data. Logistic regression passes a weighted sum of the features through the sigmoid function to produce a probability of heart disease, and its coefficients are fit by maximizing the (natural) log-likelihood of the actual responses.
We can see a summary of our model below that highlights key aspects to consider when assessing our classifier. For example, our pseudo R-squared value is 0.1590; this is a number between 0 and 1 that helps indicate whether these variables create a good fit, though R-squared-style statistics can be artificially inflated by adding more independent variables to the model. We can also see the coefficient of each variable, which is ultimately what defines the fit.
LogisticModel = sm.Logit(y_train, X_train).fit()
print(LogisticModel.summary())
Optimization terminated successfully.
         Current function value: 0.576975
         Iterations 5
                           Logit Regression Results
==============================================================================
Dep. Variable:                 output   No. Observations:                  227
Model:                          Logit   Df Residuals:                      223
Method:                           MLE   Df Model:                            3
Date:                Sat, 17 Dec 2022   Pseudo R-squ.:                  0.1590
Time:                        03:48:21   Log-Likelihood:                -130.97
converged:                       True   LL-Null:                       -155.73
Covariance Type:            nonrobust   LLR p-value:                 1.009e-10
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
male            -0.8026      0.330     -2.430      0.015      -1.450      -0.155
age              0.0021      0.013      0.162      0.871      -0.024       0.028
asymptomatic    -1.8254      0.307     -5.936      0.000      -2.428      -1.223
chol             0.0062      0.003      2.197      0.028       0.001       0.012
================================================================================
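To make the mechanics concrete, here is a minimal sketch (using the fitted model above; the helper names are ours) of how the predicted probability for a single patient comes from the coefficients, along with McFadden's pseudo R-squared recovered from the reported log-likelihoods:
# Manually apply the sigmoid to the weighted sum of one test row's features
example_row = X_test.iloc[0]
linear_score = np.dot(example_row, LogisticModel.params)  # male*b1 + age*b2 + asymptomatic*b3 + chol*b4
manual_probability = 1 / (1 + np.exp(-linear_score))  # sigmoid
print(manual_probability)
print(LogisticModel.predict(X_test.iloc[[0]]))  # the model's prediction for the same row should match
# McFadden's pseudo R-squared = 1 - (model log-likelihood / null log-likelihood)
print(1 - LogisticModel.llf / LogisticModel.llnull)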
Below, we create a confusion matrix. Reading left to right, top to bottom, it counts the true negatives, false positives, false negatives, and true positives among the test data, computed from the predictions our model produced on the test set. Given how many test patients are classified correctly, we can say these variables produce a good classifier so far.
y_hat = LogisticModel.predict(X_test)
compare = pd.concat([y_test, y_hat], axis=1)
compare = compare.rename(columns={0: 'prediction'})
compare['difference'] = compare['output'] - compare['prediction']
compare.sort_values(by=['difference'])
prediction = list(map(round, y_hat))
# confusion matrix
cm = confusion_matrix(y_test, prediction)
print ("Confusion Matrix : \n", cm)
# accuracy score of the model
print('Test accuracy = ', accuracy_score(y_test, prediction))
Confusion Matrix :
 [[21  4]
 [ 3 29]]
Test accuracy =  0.8771929824561403
Again, to better present the data to the audience, we show what these values in the confusion matrix really mean.
# go through the residuals to generate a confusion matrix
def rating(difference):
    if difference > 0.5:
        return "false negative"
    elif difference >= 0:
        return "true positive"
    elif difference >= -0.5:
        return "true negative"
    else:
        return "false positive"
compare['type'] = compare.apply(lambda x: rating(x['difference']),axis=1)
true_positives = compare[compare['type'] == "true positive"].count()["type"]
true_negatives = compare[compare['type'] == "true negative"].count()["type"]
type_1_errors = compare[compare['type'] == "false positive"].count()["type"]
type_2_errors = compare[compare['type'] == "false negative"].count()["type"]
print("True positives: ", true_positives)
print("True negatives: ", true_negatives)
print("False positives: ", type_1_errors)
print("False negatives: ", type_2_errors)
True positives:  29
True negatives:  21
False positives:  4
False negatives:  3
Next, we compute the area under the receiver operating characteristic curve (ROC AUC), a performance measure of how well the model's prediction scores separate the two classes. We see that this score is also good.
roc_auc_score(y_test, y_hat)
0.925
# we then take a look at the ROC curve
fpr, tpr, threshold = roc_curve(y_test, y_hat)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Feature and class names for plotting; the feature order must match the columns of X_train,
# and the class order must follow the classifier's classes_ (0 = No Heart Disease, 1 = Heart Disease)
fn = ['male', 'age', 'asymptomatic', 'chol']
cn = ['No Heart Disease', 'Heart Disease']
Next, we create a decision tree classifier with a max depth of 2, meaning the tree can make at most two levels of splits below the root. It's important to note that decision trees tend to overfit. We see that this decision tree did a good job of classifying the test data, and we can visualize the tree as well.
dt = DecisionTreeClassifier(max_depth = 2, random_state = 43)
dt.fit(X_train,y_train)
prediction = dt.predict(X_test)
print("The accuracy of the Decision Tree is","{:.3f}".format(metrics.accuracy_score(prediction,y_test)))
The accuracy of the Decision Tree is 0.860
plt.figure(figsize = (10,8))
plot_tree(dt, feature_names = fn, class_names = cn, filled = True)
Moving from the decision tree to the random forest classifier, we follow similar steps, but what happens under the hood is different: a random forest trains many decision trees on random subsets of the data and features, and combines their votes to make the final prediction.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
rf = RandomForestClassifier(max_depth=2, random_state=43)
rf.fit(X_train, y_train)
prediction = rf.predict(X_test)
print("The accuracy of the Random Forest is","{:.3f}".format(metrics.accuracy_score(prediction,y_test)))
The accuracy of the Random Forest is 0.912
plt.figure(figsize=(10,8))
plot_tree(rf.estimators_[0], feature_names=fn, class_names=cn, filled=True)
Looking now at the K-Nearest Neighbors (KNN) classifier, we notice that it does not do as well as the tree-based models. KNN classifies each test point by finding the k closest training points in feature space and taking a (distance-weighted) majority vote of their labels, so it is sensitive to the scale of the features; see the sketch after the accuracy below.
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 17
# create an instance of neighbours classifier and fit data
clf = KNeighborsClassifier(n_neighbors, weights="distance")
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print("The accuracy of the K-Nearest Neighbors is","{:.3f}".format(metrics.accuracy_score(prediction,y_test)))
The accuracy of the K-Nearest Neighbors is 0.544
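Because KNN depends on raw distances, a feature like cholesterol (which ranges in the hundreds) can dominate the binary features. As an optional check that is not part of the original analysis, one could standardize the features before fitting KNN; a minimal sketch:
# Standardize features before KNN (a sketch, not a tuned model; results may vary)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors, weights="distance"))
scaled_knn.fit(X_train, y_train.values.ravel())
print("The accuracy of the scaled K-Nearest Neighbors is", "{:.3f}".format(metrics.accuracy_score(scaled_knn.predict(X_test), y_test)))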
Taking a look at the results of these classifiers, we can see that our random forest did the best job of classifying based on our features. Our features are the most relevant available information about Jason's father that we wanted to dive deeper into: we chose the categories most common among diseased patients in the dataset (asymptomatic chest pain and male sex), together with two key variables often examined when assessing heart disease risk (age and cholesterol). Based on these features, we were able to create fairly accurate classifiers.
After performing exploratory data analysis to gain insights about our overall dataset, we found patterns and relationships among the patients most represented in our data, the group Jason's father was a part of. It is critical to learn from datasets like this one to help prevent future heart attacks and to better understand which populations and which distinct health profiles are at the most risk. We were able to build classifiers that performed well on our dataset, find and build common profiles for people who tend to have heart attacks, and overall learn more about the factors that play into heart disease.
Through the use of computer science and data science, we were able to draw valuable conclusions and gain insight into what contributed to the passing of Jason's father.