import numpy as np
import pandas as pd
# Loading Dataset
# Titanic survival data is divided into two datasets: Train and Test.
# train.csv carries the 'Survived' label; test.csv is the unlabeled set
# the model is asked to predict.
DATA_DIR = "/Users/gabrielguzman/School/Research/Data Science/titanic"
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
# NOTE: test_data previously also read train.csv, which is why the recorded
# outputs further down show 891 rows and a 'Survived' column. Loading
# test.csv here means those recorded numbers will differ on a re-run.
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
Intro to Data Science Presentation
Data Science
Data Analytics
Machine Learning
Data Science Presentation for Association of Computing Machinery
Description: Introductory data science presentation and project walkthrough with Gettysburg College students
The Association for Computing Machinery (ACM) at Gettysburg College is a student-led organization where students from different disciplines and fields with an interest in computer science gather to learn from each other and to form a community on campus. For this week’s meeting, I gave a presentation on “Intro to Data Science”, demonstrating the data science and data analytics workflow, from data exploration and hypothesis testing to the modelling processes used to answer data-driven questions. This presentation is adapted from Kaggle’s “Titanic - Machine Learning from Disaster” machine learning competition.
Step 1: Loading the Data
Step 2: Exploratory Data Analysis
# Tabular Output of 'test_data'
# describe() returns summary statistics (count, mean, std, min, quartiles,
# max) for the numeric columns; the table that follows is its output.
test_data.describe()
| | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Women survived in train data
# 'Survived' is 0/1, so its mean over a group is that group's survival
# rate, expressed as a fraction between 0 and 1.
women = train_data.loc[train_data.Sex == 'female', "Survived"]
rate_women = women.mean()
print("% of women who survived:", rate_women)

# Men survived in train data
men = train_data.loc[train_data.Sex == 'male', "Survived"]
rate_men = men.mean()
print("% of men who survived:", rate_men)
% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924
# Machine Learning Model (Random Forest Algorithm)
from sklearn.ensemble import RandomForestClassifier

# Target: the 0/1 survival label from the training set.
y = train_data["Survived"]

# Predictors. 'Sex' is categorical, so get_dummies one-hot encodes it into
# indicator columns; the remaining columns pass through unchanged.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
# The two frames are encoded independently, so make the test columns match
# the training columns (same set, same order) before predicting. A category
# missing from the test data becomes an all-zero column.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# 'Sex' is kept next to each prediction so predicted survival can be
# broken down by sex afterwards.
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Survived': predictions,
    'Sex': test_data.Sex,
})
output.to_csv('submission.csv', index=False)
# Female survived = 0.910828025
# Male survived = 0.010398614
# % of women who survived: 0.7420382165605095
# % of men who survived: 0.18890814558058924