-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_analysis.R
53 lines (46 loc) · 2.44 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
library(dplyr)
library(data.table)
### 1.MERGES THE TRAINING AND THE TEST SETS TO CREATE ONE DATA SET. ###
feature_names <- read.table(file='UCI HAR Dataset\\features.txt')[,2]
# 1.1. Create function to create 1 or 2 data tables (cbind|rbind if needed)
create_table <- function(X,var_names,y=FALSE, var_names_2,rbin=FALSE,...){
table <- fread(file=X,col.names = var_names,...)
if (y != FALSE){
table2 <- fread(file=y,col.names = var_names_2)
if (rbin != FALSE){
table <- rbind(table,table2)
return(table)}
table <- cbind(table,table2)
return(table)}
}
# 1.2. Call the function and combining tables into 1 table
train_set<-create_table('UCI HAR Dataset\\train\\X_train.txt',feature_names,
'UCI HAR Dataset\\train\\y_train.txt','activities')
test_set<-create_table('UCI HAR Dataset\\test\\X_test.txt',feature_names,
'UCI HAR Dataset\\test\\y_test.txt','activities')
subject <- create_table('UCI HAR Dataset\\train\\subject_train.txt','subject',
'UCI HAR Dataset\\test\\subject_test.txt','subject',
rbin=TRUE)
full_data <- rbind(train_set,test_set)
full_data <- cbind(subject,full_data)
### 2. EXTRACTS ONLY THE MEASUREMENTS ###
### ON THE MEAN AND STANDARD DEVIATION FOR EACH MEASUREMENT. ###
# Search through names with regex to subset the full_data
mean_sd_col <- grep('subject|mean|std|activities',names(full_data),value=TRUE)
mean_sd_data <- select(full_data,mean_sd_col)
### 3. USES DESCRIPTIVE ACTIVITY NAMES TO NAME THE ACTIVITIES IN THE DATA SET ###
activity_label <- read.table(file='UCI HAR Dataset\\activity_labels.txt')[,2]
# Replace values of mean_sd_data$activities with above list
for (i in c(1:6)){
mean_sd_data <- mutate(mean_sd_data,
activities = replace(activities, activities == i,
activity_label[i]))
}
### 4.APPROPRIATELY LABELS THE DATA SET WITH DESCRIPTIVE VARIABLE NAMES. ###
# Already done in 1.1
### 5. FROM THE DATA SET IN STEP 4, CREATES A SECOND, INDEPENDENT TIDY DATA SET ###
### WITH THE AVERAGE OF EACH VARIABLE FOR EACH ACTIVITY AND EACH SUBJECT. ###
# 5.1. Create a group of activities and subject,
# then calculate the mean of all column except the group. Then write result
tidy_dataset <- mean_sd_data[,lapply(.SD,mean), by = .(activities, subject)]
write.table(tidy_dataset,'tidy_dataset.txt',row.name=FALSE)