-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtweet_subsets.R
57 lines (42 loc) · 1.72 KB
/
tweet_subsets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Find all tweets with certain search terms (using OR) and save as new subset,
# full set as well as grouped by twitter account name
# Tina Keil, [email protected], April 2022

options(java.parameters = "-Xmx8000m") # allow Java-backed packages up to 8 GB RAM
options(scipen = 999)                  # turn off scientific notation

# Load required libraries
library(dplyr)
library(stringr)       # str_detect()/regex() used below (was missing: filter step errored)
library(filesstrings)  # create_dir()
library(data.table)    # fread is much faster for reading csv
library(readr)

# Set working directory to directory of script (requires an RStudio session)
path <- dirname(rstudioapi::getSourceEditorContext()$path)
setwd(path)

############# SET THIS ###################
# Search terms (combined with OR below)
search_term1 <- "RSE"
search_term2 <- "research software"
############ END OF SETTINGS #############

# Define some dirs (trailing slash required)
dir <- "cleaned/"  # path to input file
file <- "all_tweets_en_only.csv"
filepath <- paste0(dir, file)
out_dir <- "cleaned/subsets/"

# Search regex using OR. The terms are wrapped in a shared group so the
# non-word-character boundaries apply to BOTH sides of BOTH terms.
# (The original "(?:^|\\W)t1|t2(?:$|\\W)" attached the left boundary only
# to term1 and the right boundary only to term2, because "|" has the
# lowest precedence in a regex — e.g. "RSEConf" matched unintentionally.)
search_regex <- paste0(
  "(?:^|\\W)(?:", search_term1, "|", search_term2, ")(?:$|\\W)"
)

# Create output dir if it does not already exist
create_dir(out_dir)

############## process ##############
if (file.exists(filepath)) {
  data <- fread(filepath)
} else {
  stop("Can't find input file(s). Please check.")
}

# Replace runs of multiple double quotes in tweet text with a single one
data$content <- gsub('\\"+', '\\"', data$content, perl = TRUE)

# Find all content matching a search term (case insensitive) and keep as subset
subset_data <- data %>%
  filter(str_detect(content, regex(search_regex, ignore_case = TRUE)))

# Save complete subset as csv
write.csv(subset_data, paste0(out_dir, "clean_subset_all_tweets.csv"),
          row.names = FALSE)

# Save one csv per twitter account
by(subset_data, subset_data$account, FUN = function(i)
  write.csv(i, paste0(out_dir, "clean_subset_", i$account[1], ".csv"),
            row.names = FALSE))