Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

Sign up for an account on the Kaggle site and go to https://www.kaggle.com/c/titanic

ID: 2246831 • Letter: S

Question

Sign up for an account on the Kaggle site and go to https://www.kaggle.com/c/titanic. Use the data and problem description at that link to complete an analysis of what sorts of people were likely to survive. In particular, use R to predict which passengers survived the tragedy.

Provide the R code, screenshots of your implementation and results, and a clear description of the steps you took to solve this exercise. Feel free to use https://www.kaggle.com/c/titanic/forums for reference.

Explanation / Answer

Answer: See the R analysis script below:

----------------------------------------------------

# Titanic survival analysis: load and explore the Kaggle training set.
# Run this script from the directory that contains train.csv and test.csv
# (the working directory is deliberately not changed here).

# Load train data. Blank fields are read as NA too, so that empty Cabin and
# Embarked entries are counted as missing instead of as the string "".
data1 <- read.csv("train.csv", header = TRUE, sep = ",",
                  na.strings = c("NA", ""))

# Exploring data
class(data1)
names(data1)
dim(data1)
head(data1)

# Summary of data
summary(data1)
# Note: Age has missing values; with blank fields read as NA, Cabin and
# Embarked show up as incomplete as well.
str(data1)

# Check relationships of data
# Passenger class
class_table <- table(data1$Pclass)
class_table
prop.table(class_table) * 100
survived_by_class <- table(data1$Pclass, data1$Survived)
survived_by_class
prop.table(survived_by_class) * 100              # share of all passengers
prop.table(survived_by_class, margin = 1) * 100  # survival rate within class

# Gender
gender_table <- table(data1$Sex)
gender_table
prop.table(gender_table) * 100
survived_by_gender <- table(data1$Sex, data1$Survived)
survived_by_gender
prop.table(survived_by_gender) * 100              # share of all passengers
prop.table(survived_by_gender, margin = 1) * 100  # survival rate within gender

# Age (table() drops the NA ages, so percentages are of known ages only)
age_table <- table(data1$Age)
age_table
prop.table(age_table) * 100
mean(data1$Age, na.rm = TRUE)

# Gender and class together
gender_class_table <- table(data1$Sex, data1$Pclass)
gender_class_table
survived_by_gender_class <- table(data1$Sex, data1$Pclass, data1$Survived)
survived_by_gender_class
# Note: gender and class, taken together, were the dominating factors for
# survival.

# Number of missing values per column
colSums(is.na(data1))

# Model fitting with logistic regression, using passenger class and gender
# as the predictors of survival.
model <- glm(factor(Survived) ~ Pclass + Sex, data = data1,
             family = binomial(logit))
# Summary of model: coefficients, standard errors and deviances
summary(model)
# Analysis of deviance, terms added sequentially. The test is requested
# explicitly: without `test`, the table can come back with deviances only
# and no p-values.
anova(model, test = "Chisq")

# Load test data (blank fields are treated as missing, like "NA")
test_data <- read.csv("test.csv", header = TRUE, sep = ",",
                      na.strings = c("NA", ""))
# Test data size
dim(test_data)
names(test_data)

# Predict using model. The argument has to be `newdata`: predict() for a glm
# has no `data` argument, so `data = test_data` is silently ignored and the
# fitted values of the training rows are returned instead.
predicted_data <- predict(model, newdata = test_data, type = "response")
# Guard against scoring the wrong data set: one probability per test row
stopifnot(length(predicted_data) == nrow(test_data))
class(predicted_data)
head(predicted_data)

# Turn the survival probabilities into 0/1 predictions (cutoff 0.5)
predicted_survived <- as.integer(predicted_data > 0.5)
table(predicted_survived)

# Model fitting with a random forest on the same two predictors
library(randomForest)
# The forest is grown from random resamples; fix the seed so reruns match
set.seed(1)
model1 <- randomForest(factor(Survived) ~ Pclass + Sex, data = data1,
                       importance = TRUE)
# Summary of model: printing the forest reports the out-of-bag error estimate
# and confusion matrix, whereas summary() only lists the object's components.
model1
# Analysis of model: variable importance
importance(model1)

# Predict using model. As with the glm, the argument has to be `newdata`;
# `data = test_data` is ignored and predictions for the training rows come
# back instead.
predicted_data1 <- predict(model1, newdata = test_data, type = "response")
# Guard against scoring the wrong data set: one class label per test row
stopifnot(length(predicted_data1) == nrow(test_data))
class(predicted_data1)
head(predicted_data1)

-------------------------------------------------------

Analysis run and output:

------------------------------------------------

  > #load train data  > data1<-read.csv("train.csv",header = TRUE,sep = ",",na.strings = NA)  > #exploring data  > class(data1)  [1] "data.frame"  > names(data1)   [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"         "Age"         "SibSp"         [8] "Parch"       "Ticket"      "Fare"        "Cabin"       "Embarked"     > dim(data1)  [1] 891  12  > head(data1)    PassengerId Survived Pclass                                                Name    Sex Age SibSp Parch  1           1        0      3                             Braund, Mr. Owen Harris   male  22     1     0  2           2        1      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0  3           3        1      3                              Heikkinen, Miss. Laina female  26     0     0  4           4        1      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0  5           5        0      3                            Allen, Mr. William Henry   male  35     0     0  6           6        0      3                                    Moran, Mr. James   male  NA     0     0              Ticket    Fare Cabin Embarked  1        A/5 21171  7.2500              S  2         PC 17599 71.2833   C85        C  3 STON/O2. 3101282  7.9250              S  4           113803 53.1000  C123        S  5           373450  8.0500              S  6           330877  8.4583              Q  > #summary of data  > summary(data1)    PassengerId       Survived          Pclass                                         Name         Sex        Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Abbing, Mr. Anthony                  :  1   female:314     1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Abbott, Mr. Rossmore Edward          :  1   male  :577     Median :446.0   Median :0.0000   Median :3.000   Abbott, Mrs. Stanton (Rosa Hunt)     :  1                  Mean   :446.0   Mean   :0.3838   Mean   :2.309   Abelson, Mr. 
Samuel                  :  1                  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000   Abelson, Mrs. Samuel (Hannah Wizosky):  1                  Max.   :891.0   Max.   :1.0000   Max.   :3.000   Adahl, Mr. Mauritz Nils Martin       :  1                                                                   (Other)                              :885                       Age            SibSp           Parch             Ticket         Fare                Cabin     Embarked   Min.   : 0.42   Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00              :687    :  2      1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91   B96 B98    :  4   C:168      Median :28.00   Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45   C23 C25 C27:  4   Q: 77      Mean   :29.70   Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20   G6         :  4   S:644      3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00   C22 C26    :  3              Max.   :80.00   Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33   D          :  3              NA's   :177                                      (Other) :852                    (Other)    :186             > #Note: Missing values are only in Age column.  > str(data1)  'data.frame':   891 obs. of  12 variables:   $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...   $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...   $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...   $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...   $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...   $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...   $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...   $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...   $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...   $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...   
$ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...   $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...  > #check relationships of data  > class_table<-table(data1$Pclass)  > class_table      1   2   3   216 184 491   > prop.table(class_table)*100           1        2        3   24.24242 20.65095 55.10662   > survived_by_class<-table(data1$Pclass,data1$Survived)  > survived_by_class             0   1    1  80 136    2  97  87    3 372 119  > prop.table(survived_by_class)*100                   0         1    1  8.978676 15.263749    2 10.886644  9.764310    3 41.750842 13.355780  > gender_table<-table(data1$Sex)  > gender_table    female   male      314    577   > prop.table(gender_table)*100     female    male   35.2413 64.7587   > survived_by_gender<-table(data1$Sex,data1$Survived)  > survived_by_gender                       0   1    female  81 233    male   468 109  > prop.table(survived_by_gender)*100                             0         1    female  9.090909 26.150393    male   52.525253 12.233446  > age_table<-table(data1$Age)  > age_table    0.42 0.67 0.75 0.83 0.92    1    2    3    4    5    6    7    8    9   10   11   12   13   14 14.5   15   16      1    1    2    2    1    7   10    6   10    4    3    3    4    8    2    4    1    2    6    1    5   17     17   18   19   20 20.5   21   22   23 23.5   24 24.5   25   26   27   28 28.5   29   30 30.5   31   32 32.5     13   26   25   15    1   24   27   15    1   30    1   23   18   18   25    2   20   25    2   17   18    2     33   34 34.5   35   36 36.5   37   38   39   40 40.5   41   42   43   44   45 45.5   46   47   48   49   50     15   15    1   18   22    1    6   11   14   13    2    6   13    5    9   12    2    3    9    9    6   10     51   52   53   54   55 55.5   56   57   58   59   60   61   62   63   64   65   66   70 70.5   71   74   80      7    6    1    8    2    1    4    2    5    2    4    3    4    2    2    3    1    2    1  
  2    1    1   > prop.table(age_table)*100         0.42      0.67      0.75      0.83      0.92         1         2         3         4         5         6   0.1400560 0.1400560 0.2801120 0.2801120 0.1400560 0.9803922 1.4005602 0.8403361 1.4005602 0.5602241 0.4201681           7         8         9        10        11        12        13        14      14.5        15        16   0.4201681 0.5602241 1.1204482 0.2801120 0.5602241 0.1400560 0.2801120 0.8403361 0.1400560 0.7002801 2.3809524          17        18        19        20      20.5        21        22        23      23.5        24      24.5   1.8207283 3.6414566 3.5014006 2.1008403 0.1400560 3.3613445 3.7815126 2.1008403 0.1400560 4.2016807 0.1400560          25        26        27        28      28.5        29        30      30.5        31        32      32.5   3.2212885 2.5210084 2.5210084 3.5014006 0.2801120 2.8011204 3.5014006 0.2801120 2.3809524 2.5210084 0.2801120          33        34      34.5        35        36      36.5        37        38        39        40      40.5   2.1008403 2.1008403 0.1400560 2.5210084 3.0812325 0.1400560 0.8403361 1.5406162 1.9607843 1.8207283 0.2801120          41        42        43        44        45      45.5        46        47        48        49        50   0.8403361 1.8207283 0.7002801 1.2605042 1.6806723 0.2801120 0.4201681 1.2605042 1.2605042 0.8403361 1.4005602          51        52        53        54        55      55.5        56        57        58        59        60   0.9803922 0.8403361 0.1400560 1.1204482 0.2801120 0.1400560 0.5602241 0.2801120 0.7002801 0.2801120 0.5602241          61        62        63        64        65        66        70      70.5        71        74        80   0.4201681 0.5602241 0.2801120 0.2801120 0.4201681 0.1400560 0.2801120 0.1400560 0.2801120 0.1400560 0.1400560   > mean(data1$Age,na.rm=TRUE)  [1] 29.69912  > gender_class_table<-table(data1$Sex,data1$Pclass)  > gender_class_table                       1   2   3    female  
94  76 144    male   122 108 347  > survided_by_gender_class<-table(data1$Sex,data1$Pclass,data1$Survived)  > survided_by_gender_class  , ,  = 0                         1   2   3    female   3   6  72    male    77  91 300    , ,  = 1                         1   2   3    female  91  70  72    male    45  17  47    > #Note: class and gender and gender and class taken together were the dominating factors for survival  > sapply(data1,function(x) sum(is.na(x))) #number of missing values.  PassengerId    Survived      Pclass        Name         Sex         Age       SibSp       Parch      Ticket             0           0           0           0           0         177           0           0           0          Fare       Cabin    Embarked             0           0           0   > str(data1)  'data.frame':   891 obs. of  12 variables:   $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...   $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...   $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...   $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...   $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...   $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...   $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...   $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...   $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...   $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...   $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...   $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...  
>   > #model fitting by taking class and gender columns  > #by using logistic regression  > model<-glm(factor(Survived)~Pclass+Sex,data=data1, family = binomial(logit))  > #summary of model  > summary(model)    Call:  glm(formula = factor(Survived) ~ Pclass + Sex, family = binomial(logit),       data = data1)    Deviance Residuals:       Min       1Q   Median       3Q      Max    -2.2030  -0.7036  -0.4519   0.6719   2.1599      Coefficients:              Estimate Std. Error z value Pr(>|z|)      (Intercept)   3.2946     0.2974  11.077   <2e-16 ***  Pclass       -0.9606     0.1061  -9.057   <2e-16 ***  Sexmale      -2.6434     0.1838 -14.380   <2e-16 ***  ---  Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1    (Dispersion parameter for binomial family taken to be 1)        Null deviance: 1186.7  on 890  degrees of freedom  Residual deviance:  827.2  on 888  degrees of freedom  AIC: 833.2    Number of Fisher Scoring iterations: 4    > #analysis of model  > anova(model)  Analysis of Deviance Table    Model: binomial, link: logit    Response: factor(Survived)    Terms added sequentially (first to last)             Df Deviance Resid. Df Resid. 
Dev  NULL                     890     1186.7  Pclass  1   102.25       889     1084.4  Sex     1   257.21       888      827.2  >   > #load test data  > test_data<-read.csv("test.csv",header = TRUE,sep = ",",na.strings = NA)  > #test data size  > dim(test_data)  [1] 418  11  > names(test_data)   [1] "PassengerId" "Pclass"      "Name"        "Sex"         "Age"         "SibSp"       "Parch"         [8] "Ticket"      "Fare"        "Cabin"       "Embarked"     > #predict using model  > predicted_data<-predict(model,data=test_data,type="response")  > class(predicted_data)  [1] "numeric"  > head(predicted_data)           1          2          3          4          5          6   0.09705221 0.91166115 0.60180274 0.91166115 0.09705221 0.09705221   >   > #model fitting by using random forest  > library(randomForest)  > model1<-randomForest(factor(Survived)~Pclass+Sex,data=data1,importance=TRUE)  > #summary of model  > summary(model1)                  Length Class  Mode       call               4   -none- call       type               1   -none- character  predicted        891   factor numeric    err.rate        1500   -none- numeric    confusion          6   -none- numeric    votes           1782   matrix numeric    oob.times        891   -none- numeric    classes            2   -none- character  importance         8   -none- numeric    importanceSD       6   -none- numeric    localImportance    0   -none- NULL       proximity          0   -none- NULL       ntree              1   -none- numeric    mtry               1   -none- numeric    forest            14   -none- list       y                891   factor numeric    test               0   -none- NULL       inbag              0   -none- NULL       terms              3   terms  call       > #analysis of model  > importance(model1)                0        1 MeanDecreaseAccuracy MeanDecreaseGini  Pclass 18.35609 19.43456             20.01697         34.59309  Sex    47.96910 62.58943             57.62603        104.97207  > 
#predict using model  > predicted_data1<-predict(model1,data=test_data,type="class")  > class(predicted_data1)  [1] "factor"  > head(predicted_data1)  1 2 3 4 5 6   0 1 1 1 0 0   Levels: 0 1  
------------------------------------------------------------------------------------------------------------------
Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote