-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdynamic_words_duplicates.py
142 lines (112 loc) · 5.68 KB
/
dynamic_words_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import csv
from collections import defaultdict
import pandas as pd
def tell_me_about(s):
    """Return a ``(type, value)`` pair describing *s* (debugging aid)."""
    kind = type(s)
    return kind, s
def find_non_utf8_characters(input_string):
    """Repair text whose UTF-8 bytes were mistakenly decoded as Latin-1.

    Despite the name, nothing is searched for: the string is encoded back
    to bytes with Latin-1 and those bytes are then decoded as UTF-8, which
    undoes the classic "Ã©" style mojibake.

    Args:
        input_string: The text to re-interpret.

    Returns:
        The string obtained by decoding the Latin-1 bytes of
        *input_string* as UTF-8.

    Raises:
        UnicodeEncodeError: If *input_string* contains a character that
            does not exist in Latin-1.
        UnicodeDecodeError: If the resulting bytes are not valid UTF-8.
    """
    return input_string.encode('latin-1').decode('utf-8')
def find_invalid_utf8_bytes(input_bytes):
    """Locate every byte range in *input_bytes* that is not valid UTF-8.

    Decoding is restarted just past each failure so that all invalid
    ranges are reported, not only the first one.

    Args:
        input_bytes: A ``bytes``-like object supporting ``len``, slicing
            and ``.decode``.

    Returns:
        A list of ``(start, end)`` tuples (end exclusive), as offsets into
        *input_bytes*, in ascending order. Empty when the data is valid
        UTF-8. If *input_bytes* is not decodable at all (for example a
        ``str`` or ``None``), "oops" is printed and the ranges found so far
        (normally none) are returned.
    """
    invalid_bytes = []
    offset = 0
    try:
        total = len(input_bytes)
        while offset < total:
            try:
                input_bytes[offset:].decode('utf-8')
            except UnicodeDecodeError as error:
                # error.start/end are relative to the slice; shift them back.
                invalid_bytes.append((offset + error.start, offset + error.end))
                # Always advance by at least one byte so the loop terminates.
                offset += max(error.end, error.start + 1)
            else:
                # The remaining tail decoded cleanly: nothing more to find.
                break
    except (AttributeError, TypeError):
        # Input was not a bytes-like object; keep the best-effort behaviour.
        print("oops")
    return invalid_bytes
def _decode_wildcard_line(raw_line):
    """Decode one raw line as UTF-8, falling back to Latin-1 if invalid."""
    try:
        return raw_line.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every possible byte to a character, so this
        # fallback cannot fail and the line is never silently dropped.
        return raw_line.decode('latin-1')


def find_duplicate_words_in_directory(root_folder):
    """Find entries that occur more than once across the ``.txt`` files.

    Every ``.txt`` file below *root_folder* is read line by line. Each
    line, stripped of surrounding whitespace and lower-cased, counts as one
    "word". Blank lines are ignored.

    Files are read as bytes and decoded per line, so a single line that is
    not valid UTF-8 neither aborts the rest of the file nor causes a
    neighbouring line to be recorded twice.

    Args:
        root_folder: Directory to walk recursively.

    Returns:
        A dict mapping each word seen more than once to the list of file
        paths it was found in. A path appears once per occurrence, so a
        word repeated inside a single file is reported as well.
    """
    word_occurrences = defaultdict(list)

    for folder_name, _, filenames in os.walk(root_folder):
        for filename in filenames:
            print("processing ", filename)
            # Match on the extension; a substring test would also pick up
            # names such as "txt_notes.csv".
            if not filename.lower().endswith(".txt"):
                continue

            file_path = os.path.join(folder_name, filename)
            with open(file_path, 'rb') as file:
                raw_lines = file.read().splitlines()

            for raw_line in raw_lines:
                word = _decode_wildcard_line(raw_line).strip().lower()
                if word:
                    word_occurrences[word].append(file_path)

    # Keep only the words that were seen more than once.
    return {
        word: files
        for word, files in word_occurrences.items()
        if len(files) > 1
    }
def remove_duplicate_lines_interactively(file_word_mapping):
    """Interactively remove duplicated words from the files in the mapping.

    For every file, each word that appears more than once in that file's
    word list is shown to the user, who is asked which file should retain
    it. When the answer names a different file, every line of the current
    file that contains the word is removed. A blank answer, or the current
    file's own path, leaves the file as it is.

    A file is read and rewritten at most once, and only when at least one
    word has to be removed; files without duplicates are never touched.

    Args:
        file_word_mapping: Mutable mapping of file path -> list of words.
            When the user picks another file that is a key of this mapping,
            the word is removed from that file's list so it is not asked
            about again when that file is processed.

    Returns:
        None. Files on disk and *file_word_mapping* are modified in place.
    """
    for file_path, words in file_word_mapping.items():
        print(f"Processing file: {file_path}")

        # Count once up front instead of calling list.count() per word.
        occurrences = defaultdict(int)
        for word in words:
            occurrences[word] += 1

        words_to_drop = []
        for word, count in occurrences.items():
            if count <= 1:
                continue

            print(f"\nDuplicate word found: {word}")
            print(f"Original lines: {words}")

            # Prompt the user to choose which file retains the word
            choice = input(f"Choose the file to retain the word ('{file_path}' or another file): ").strip()
            if not choice or choice == file_path:
                # Nothing chosen, or the current file keeps the word.
                continue

            if choice in file_word_mapping:
                # The chosen file keeps the word: take it out of that file's
                # list. Re-assigning an existing key is safe while iterating.
                file_word_mapping[choice] = [
                    w for w in file_word_mapping[choice] if w != word
                ]
            words_to_drop.append(word)

        if not words_to_drop:
            continue

        # NOTE(review): matching is by substring, as in the original filter,
        # so dropping "cat" also drops a line reading "category" - confirm
        # whether whole-line matching is wanted.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            lines_to_write = [
                line for line in file
                if not any(word in line for word in words_to_drop)
            ]

        # Update the actual file with the modified content
        with open(file_path, 'w', encoding='utf-8', errors='ignore') as file:
            file.writelines(lines_to_write)
def sort_csv(input_file, output_file, sort_column):
    """Write a copy of a CSV file with its data rows sorted by one column.

    Rows are handled as plain lists rather than dicts, so rows that carry
    more cells than the header (as produced by ``write_csv``, which emits
    one cell per file) are preserved intact. The sort is stable and
    compares the cell text as strings; a row too short to contain the sort
    column sorts as if that cell were empty. Blank rows are skipped.

    Args:
        input_file: Path of the CSV file to read (UTF-8, first non-blank
            row is the header).
        output_file: Path of the CSV file to write. May equal
            *input_file*, since the input is read completely first.
        sort_column: Header name of the column to sort by.

    Raises:
        KeyError: If *sort_column* is not present in the header.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # csv.reader yields [] for blank lines; skip those.
        header = next((row for row in reader if row), None)
        rows = [row for row in reader if row]

    if header is not None:
        try:
            column_index = header.index(sort_column)
        except ValueError:
            raise KeyError(sort_column) from None
        rows.sort(
            key=lambda row: row[column_index] if column_index < len(row) else ""
        )

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        # An empty input yields an empty output file.
        if header is not None:
            writer = csv.writer(outfile)
            writer.writerow(header)
            writer.writerows(rows)
def create_file_word_mapping(word_occurrences):
    """Build a mapping of file path -> whitespace-separated tokens it holds.

    Each file referenced by *word_occurrences* is read exactly once, however
    many words or occurrences point at it, so a file's token list reflects
    the file's real content and is not repeated.

    Args:
        word_occurrences: Mapping of word -> iterable of file paths. Only
            the file paths are used.

    Returns:
        A ``defaultdict(list)`` mapping each distinct file path to the list
        of tokens obtained by splitting every line of that file on
        whitespace, in file order.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
        OSError: If a file cannot be opened.
    """
    file_word_mapping = defaultdict(list)

    for files in word_occurrences.values():
        for file_path in files:
            if file_path in file_word_mapping:
                # Already loaded; reading again would duplicate its tokens.
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                file_word_mapping[file_path] = [
                    token for line in file for token in line.split()
                ]

    return file_word_mapping
def write_csv(output_file, word_occurrences):
    """Write the word -> files report as a CSV file.

    The first row is the header ``Word,Files``. Every following row holds a
    word in the first cell followed by one cell per distinct file it was
    found in, so rows may be wider than the header. File paths are
    de-duplicated and sorted, so the same input always produces the same
    file.

    Args:
        output_file: Path of the CSV file to create or overwrite (UTF-8).
        word_occurrences: Mapping of word -> iterable of file paths; paths
            may repeat.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Word', 'Files'])

        for word, files in word_occurrences.items():
            # Remove duplicates and fix the order of the file columns.
            csv_writer.writerow([word, *sorted(set(files))])
if __name__ == "__main__":
    # Folder that is scanned; both CSV reports are written into it as well.
    root_folder = "X:/dif/stable-diffusion-webui-docker/data/config/auto/extensions/sd-dynamic-prompts/wildcards"
    # Derive the report paths from the folder instead of repeating it.
    output_file = f"{root_folder}/duplicate_words.csv"
    sorted_output_file = f"{root_folder}/duplicate_words2.csv"

    word_occurrences = find_duplicate_words_in_directory(root_folder)
    write_csv(output_file, word_occurrences)
    # Second report: the same rows, ordered by the 'Word' column.
    sort_csv(output_file, sorted_output_file, 'Word')

    print(f"CSV file '{output_file}' generated.")
    print(f"CSV file '{sorted_output_file}' generated.")