Merge pull request #8 from aviiciii/v2

Another Dataset added
aviiciii · Jun 27, 2023 · b959be7 · b959be7
2 parents a2e65d9 + 7144135
commit b959be7
Show file tree

Hide file tree

Showing 12 changed files with 17,007 additions and 5 deletions.
diff --git a/add.py b/add.py
@@ -0,0 +1,41 @@
+import csv
+from collections import defaultdict
+
+# Paths
+input1 = 'data/output/1.csv'  # smaller file
+input2 = 'data/output/2.csv'  # larger file
+output_path = 'data/output/2.csv'  # output file
+
+word_freq = defaultdict(int)  # Dictionary to store word frequencies
+
+# Read input1
+print('Reading input1... ', end='')
+with open(input1, 'r') as csv_file:
+    reader = csv.DictReader(csv_file)
+    for row in reader:
+        word = row['word'].strip()
+        freq = int(row['frequency'])
+        word_freq[word] += freq
+print('Completed.')
+print(len(word_freq))
+
+# Read input2
+print('Reading input2... ', end='')
+with open(input2, 'r') as csv_file:
+    reader = csv.DictReader(csv_file)
+    for row in reader:
+        word = row['word'].strip()
+        freq = int(row['frequency'])
+        word_freq[word] += freq
+print('Completed.')
+print(len(word_freq))
+
+# Write to file
+print('Writing to file... ', end='')
+with open(output_path, 'w', newline='') as csv_file:
+    fieldnames = ['word', 'frequency']
+    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+    writer.writeheader()
+    for word, freq in word_freq.items():
+        writer.writerow({'word': word, 'frequency': freq})
+print('Done!')
diff --git a/clean_output.py b/clean_output.py
@@ -10,7 +10,7 @@
 filtered_length = 0
 
 # Open the CSV file
-with open('data/filtered_punc.csv', 'r+') as file:
+with open('data/output/2.csv', 'r') as file:
     reader = csv.reader(file)
 
     header = next(reader)  # Skip the header row
@@ -58,7 +58,7 @@
 
 
 # save the filtered words and frequencies to a new csv file
-with open('data/filtered_punc.csv', 'w') as file:
+with open('data/output/filtered/2.csv', 'w') as file:
     writer = csv.writer(file)
     # header
     writer.writerow(['word', 'frequency'])

diff --git a/output/v2/summary.md b/output/v2/summary.md
@@ -0,0 +1,9 @@
+# Summary
+
+Total count of words: 3080012
+
+Frequency > 5: 464197
+
+Frequency > 100: 55496
+
+Frequency > 1000: 8724
diff --git a/output/v2/top_100.csv b/output/v2/top_100.csv
@@ -0,0 +1,100 @@
+word,frequency
+இதழ்,2780383
+ஒரு,1071872
+கதை,471419
+என்று,468139
+சிறுகதை,451262
+இந்த,385687
+கதைகள்,378715
+எஸ்,349964
+மே,315915
+தமிழ்,266342
+நான்,249104
+வேண்டும்,245605
+பற்றி,238999
+இது,230037
+மற்றும்,224747
+காதல்,206431
+என,204954
+என்ன,192112
+மேலும்,189485
+முதல்,187897
+சிறுகதைப்,186466
+என்ற,183976
+அந்த,181185
+கதையாசிரியர்கள்,181077
+சிறுகதைகள்,179806
+என்றால்,177176
+என்,173869
+குமார்,161690
+அவர்,156373
+தான்,155981
+ஆர்,150391
+அறிவியல்,145152
+ஆனால்,144875
+இந்திய,139510
+என்பது,139461
+இப்படி,138256
+குடும்பம்,137388
+கடிதம்,137219
+எம்,136438
+கண்ணன்,136291
+மார்ச்,134913
+ஏப்ரல்,133806
+ஜூலை,133140
+ஜூன்,132995
+ஜனவரி,132626
+அக்டோபர்,132315
+ஆகஸ்ட்,131694
+கட்டுரை,131351
+பிப்ரவரி,129360
+படிக்க,125659
+சிறப்பு,125278
+அப்பா,125250
+கதைகளில்,122961
+சிறப்பிதழ்,122599
+டிசம்பர்,122236
+நவம்பர்,121826
+செப்டம்பர்,120933
+உள்ள,120300
+இரா,118310
+மகள்,116916
+அது,115392
+சொல்வனம்,114796
+தி,111244
+சி,108550
+பேய்,108213
+உலக,107236
+தத்துவ,106874
+நா,106764
+இருக்கும்,106443
+அடுத்தநாள்,105980
+அப்பாவா,105873
+இருந்து,105642
+முதலிரவுக்கு,105641
+எப்படி,105605
+கொண்டு,105130
+பல,104625
+அரசியல்,104421
+இல்லை,103893
+சொன்ன,100235
+எழுத்துக்கலைபற்றி,96919
+சில,96693
+கி,96340
+நகைச்சுவை,96075
+சுஜாதா,94649
+அரவிந்தன்,93399
+கதைப்பதிவு,91375
+காமம்,91351
+பெரிய,90727
+கே,90128
+ஜெயமோகன்,90117
+செய்து,89243
+கடந்த,88917
+சு,88818
+அவர்கள்,88763
+மக்கள்,88345
+அதன்,86353
+தேதி,85197
+கொத்தனார்,84275
+வரும்,84238