-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrse-sheffield.R
67 lines (53 loc) · 2.01 KB
/
rse-sheffield.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# text cleaning script using textclean library
# see https://github.com/trinker/textclean#functions for instructions
# Tina Keil, [email protected], February 2022
# Blog data can be big, so the Java heap limit is raised here - adjust the
# -Xmx value to the RAM available on your machine!
options(
  java.parameters = "-Xmx8000m", # 8GB ram
  scipen = 999                   # turn off scientific notation
)
#load required libraries
library(dplyr)
library(filesstrings)
library(stringi)
library(lubridate) #for converting dates
library(data.table) #fread is much faster for reading csv
library(readr)
library(textclean)
library(beepr)
# set working directory to directory of script
# In an RStudio session the path of the active editor document is used.
# Without RStudio (e.g. when started with Rscript) the --file= command line
# argument is used instead; if neither is available the current working
# directory is kept, so the relative paths below resolve against it.
path <- if (requireNamespace("rstudioapi", quietly = TRUE) &&
            rstudioapi::isAvailable()) {
  dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  script_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE),
                     value = TRUE)
  if (length(script_arg) > 0) {
    dirname(normalizePath(sub("^--file=", "", script_arg[1])))
  } else {
    getwd()
  }
}
setwd(path)
# load the project helper functions used further down
source("functions.R")
############ settings ##############
# All locations are relative to the working directory set above.
in_file <- "rse-sheffield.csv" # name of file to import
out_name <- tools::file_path_sans_ext(in_file) # input name without extension
# name of file after cleaning, i.e. "cleaned/clean_<out_name>.csv"
out_file <- file.path("cleaned", paste0("clean_", out_name, ".csv"))
# out_file already carries the ".csv" extension; appending it a second time
# produced "...csv.csv", so out_csv simply mirrors out_file
out_csv <- out_file
# the raw export is read from the originals/ folder
infilepath <- file.path("originals", in_file)
############## process ##############
# remember when processing started (start_time() is not defined in this
# file - presumably it comes from functions.R)
now <- start_time()
# get data from csv file; stop right away when the file is missing
if (!file.exists(infilepath)) {
  stop("Can't find input file. Please check.")
}
raw_data <- read.csv(infilepath, sep = ",")
# article link of each row, kept unchanged in the output
url <- raw_data$article.href
cat("* Converting date\n")
# The stringr functions are called with an explicit namespace because this
# script never attaches stringr itself (there is no library(stringr) above).
# Replace the first " - " in the published string with a blank, then parse it
# as day-month-year hour:minute,
# e.g. 22 January 2020 14:43 -> 2020-01-22 14:43:00
pubdate <- dmy_hm(stringr::str_replace(raw_data$published, " - ", " "))
# reformat author: put a blank after every dot, convert to title case and
# trim surrounding whitespace
author <- stringr::str_replace_all(raw_data$author, "\\.", ". ")
author <- trimws(stringr::str_to_title(author))
# clean text
cat("* Starting to process title and body\n")
title <- cleantext(raw_data$title)
content <- cleantext(raw_data$content)
# remove "<title>. ", "<author>. " and "<published>. " from the content of
# the same row, in that order, using fixed-string (non-regex) matching
for (repeated in list(title, author, raw_data$published)) {
  content <- stri_replace_all_fixed(content, paste0(repeated, ". "), "")
}
# add the processed columns to a new data frame
clean_data <- data.frame(
  url = url,
  title = title,
  content = content,
  author = author,
  pubdate = pubdate
)
# hand the result to the project helpers (not defined in this file;
# presumably they write the output file and report the elapsed time)
save2file(clean_data, out_name, out_file)
show_alltime(now, out_name)