import argparse
import csv
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.cluster import SpectralClustering

# Build the argument parser
parser = argparse.ArgumentParser(
    prog="DSM Builder", description="Builds elements of a DSM from an OPL text file."
)
parser.add_argument("input", help="Input OPL file", type=open)
parser.add_argument("output", help="Output file")
parser.add_argument("matrix", choices=["PO", "PP", "OO"], help="Desired output matrix")
parser.add_argument(
    "-s",
    "--seed",
    help="RNG seed for deterministic clustering, ignored if -n is not set",
    type=int,
    dest="seed",
    default=None,
)
parser.add_argument(
    "-c",
    "--clusters",
    help="Write cluster labels to this location, ignored if -n is not set",
    dest="clusters",
)
exclusive = parser.add_mutually_exclusive_group()
exclusive.add_argument(
    "-o",
    "--order",
    help="A file with preferred element ordering. Elements can be separated with newlines or commas",
    dest="order",
)
exclusive.add_argument(
    "-n",
    "--n_clusters",
    help="The number of clusters to form",
    type=int,
    dest="n_clusters",
    default=0,
)
# Parse arguments
args = parser.parse_args()
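# Example invocation (hypothetical file names):
#   python create_DSM.py model.opl dsm.csv PP -n 3 -s 42 -c clusters.csv
# This builds the Process-Process matrix, clusters it into 3 groups with a
# fixed seed, and writes the cluster labels to clusters.csv.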


@dataclass
class Relation:
    object: str
    keyword: str
    process: str


# Essentially the OPL contains the following information:
# 1. Objects (lines ending in "object")
# 2. Processes (lines ending in "process")
# 3. Relations between processes and objects
#    These can be in the form Process-Keyword-Object(s),
#    where the keywords are: requires, affects, consumes, yields,
#    or Object(s)-"handles"-Process for agent relations.
#    Many-to-one relationships are comma-separated on one line.
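# For illustration, a hypothetical numbered OPL fragment in this format:
#   1. Engine is a physical object.
#   2. Fuel is a physical object.
#   3. Combusting is a physical process.
#   4. Combusting requires Engine.
#   5. Combusting consumes Fuel and Air.
#   6. Driver handles Combusting.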
# Build lists of each item
objects = []
processes = []
relations = []
# Parse the input file
for line in args.input:
    # In case the input has empty lines
    if line.find(".") == -1:
        continue
    # Remove pesky newlines and trim the leading numbers
    line = line.split(".")[1].strip()
    # Check if the line is an object
    if line.endswith("object"):
        objects.append(line.split(" is ")[0].strip())
        continue
    # Check if the line is a process
    if line.endswith("process"):
        processes.append(line.split(" is ")[0].strip())
        continue
    # Check if this line is a "handles" relationship
    # Object(s)-"handles"-Process
    if " handles " in line:
        # Split on the keyword
        parts = line.split(" handles ")
        process = parts[1].strip()
        # Check for multiple objects and create a relationship for each one
        for obj in [
            x.strip() for x in parts[0].replace(" and ", ",").split(",") if x.strip()
        ]:
            relations.append(Relation(obj, "handles", process))
        continue
    # If none of these, we assume the line is a relationship
    # Process-Keyword-Object(s)
    keywords = ["affects", "requires", "yields", "consumes"]
    for keyword in keywords:
        # Match the keyword with surrounding spaces so it cannot match
        # inside an element name
        if f" {keyword} " in line:
            # Split on the keyword
            parts = line.split(f" {keyword} ")
            process = parts[0].strip()
            for obj in [
                x.strip()
                for x in parts[1].replace(" and ", ",").split(",")
                if x.strip()
            ]:
                relations.append(Relation(obj, keyword, process))
            # A line contains at most one relation keyword
            break
# Remove any duplicates and sort
objects = np.unique(np.array(objects, dtype="object"))
processes = np.unique(np.array(processes, dtype="object"))
# If there is a preferred ordering
if args.order:
    # Read the ordering file
    with open(args.order, "r", newline="") as ordering_file:
        ordering = np.array(
            [
                x.strip()
                for x in ordering_file.read().replace("\n", ",").split(",")
                if x.strip()
            ],
            dtype="object",
        )
    if args.matrix == "OO":
        # Check that it has exactly the same objects (length check first to
        # avoid a broadcasting error in the element-wise comparison)
        if len(ordering) == len(objects) and (objects == np.sort(ordering)).all():
            objects = ordering
        else:
            print("Provided ordering does not have the same Objects as the OPL")
            exit()
    if args.matrix == "PP":
        if len(ordering) == len(processes) and (processes == np.sort(ordering)).all():
            processes = ordering
        else:
            print("Provided ordering does not have the same Processes as the OPL")
            exit()
# Check to make sure there is at least one object and process
if len(objects) == 0 or len(processes) == 0:
    print("Invalid input, exiting.")
    exit()
# Create matrices
po = pd.DataFrame(
    np.empty((len(processes), len(objects)), dtype="object"), processes, objects
)
po_num = pd.DataFrame(np.zeros((len(processes), len(objects))), processes, objects)
# Iterate over all the relations and build the Process-Object matrix
for r in relations:
    # Store the keyword's first letter to note the relationship type
    po.loc[r.process, r.object] = r.keyword[0]
    # Write a one for doing matrix multiplication later
    po_num.loc[r.process, r.object] = 1
# Compute the matrices
pp = pd.DataFrame(np.dot(po_num, po_num.transpose()), processes, processes)
oo = pd.DataFrame(np.dot(po_num.transpose(), po_num), objects, objects)
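# Why the products work: po_num[i, j] is 1 when process i relates to object j,
# so (po_num @ po_num.T)[i, k] counts the objects that processes i and k share,
# and (po_num.T @ po_num)[j, l] counts the processes that objects j and l
# share. The diagonal holds each element's total number of relations.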
# If we want to cluster
if args.n_clusters > 0:
    # Cluster matrices
    o_cluster = SpectralClustering(
        random_state=args.seed, n_clusters=args.n_clusters, affinity="precomputed"
    ).fit(oo)
    p_cluster = SpectralClustering(
        random_state=args.seed, n_clusters=args.n_clusters, affinity="precomputed"
    ).fit(pp)
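    # Note: affinity="precomputed" makes SpectralClustering treat the DSM
    # itself as a similarity (adjacency) matrix rather than computing a
    # kernel over feature vectors. This is valid here because the
    # shared-relation counts in pp/oo are non-negative similarities.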
    # If desired, write the clusters to a file
    if args.clusters:
        with open(args.clusters, "w", newline="") as cluster_file:
            out = csv.writer(cluster_file)
            out.writerow(np.concatenate((["Objects"], np.sort(o_cluster.labels_))))
            out.writerow(np.concatenate((["Processes"], np.sort(p_cluster.labels_))))
    # Rearrange the matrices so members of the same cluster are adjacent
    processes = processes[p_cluster.labels_.argsort()]
    objects = objects[o_cluster.labels_.argsort()]
    po = pd.DataFrame(
        po.to_numpy()[
            np.ix_(p_cluster.labels_.argsort(), o_cluster.labels_.argsort())
        ],
        processes,
        objects,
    )
    po_num = pd.DataFrame(
        po_num.to_numpy()[
            np.ix_(p_cluster.labels_.argsort(), o_cluster.labels_.argsort())
        ],
        processes,
        objects,
    )
    # Recompute the matrices
    pp = pd.DataFrame(np.dot(po_num, po_num.transpose()), processes, processes)
    oo = pd.DataFrame(np.dot(po_num.transpose(), po_num), objects, objects)
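    # After this permutation, members of the same cluster sit next to each
    # other, so intra-cluster relations appear as blocks along the diagonal
    # of the recomputed pp/oo matrices.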
print("Processed:")
print(f"{len(objects)} objects")
print(f"{len(processes)} processes")
print(f"{len(relations)} relationships")
# Save the results as a CSV
with open(args.output, "w", newline="") as output_file:
    print(f"Writing {args.matrix} matrix to {args.output}")
    if args.matrix == "PO":
        po.to_csv(path_or_buf=output_file)
    if args.matrix == "PP":
        pp.to_csv(path_or_buf=output_file)
    if args.matrix == "OO":
        oo.to_csv(path_or_buf=output_file)