Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distance measure implementation #1527

Merged
merged 32 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
75e48a8
Added distance measure calculation
trKhaled Nov 28, 2023
a6f22ab
code refactor
trKhaled Jan 6, 2024
e7fc35e
minor fixes
trKhaled Jan 8, 2024
3e6cb9b
removed database data from constant.py
trKhaled Jan 8, 2024
45c0ba6
bug fixes
trKhaled Jan 11, 2024
4444750
refactor
trKhaled Jan 11, 2024
b342dcb
improved code logic and added comments
trKhaled Jan 17, 2024
4153f82
integrated distance measurement to the sql checker
trKhaled May 13, 2024
1079ebb
improved duplicate flagging logic
trKhaled May 14, 2024
e5eb883
chore(sql-checker): remove __pycache__
Zitrone44 May 28, 2024
25bfca3
only the closest distance gets saved on the database now
chrastlet Sep 30, 2024
e4a10b7
Subqueries and "AND","OR" operators work
chrastlet Oct 29, 2024
bae0fcd
Fixed lint problems and started implementing LIKE, BETWEEN, IN support
chrastlet Nov 13, 2024
674299a
style(sql-distance-checker): fix style
Zitrone44 Nov 7, 2024
3daeeef
chore(sql-checker): fix pylint errors
Zitrone44 Nov 7, 2024
5529757
refactor(sql-checker): apply ioc and add tests
Zitrone44 Dec 4, 2024
85aa981
Merge branch 'dev' into sql-distance-checker
Zitrone44 Dec 4, 2024
3afeaa4
feat(sql-checker): add distances to checker output
Zitrone44 Dec 4, 2024
32e566b
feat(sql-checker): reimplement query comparision
Zitrone44 Dec 6, 2024
99f48b6
fix(sql-checker): fix query learning
Zitrone44 Dec 7, 2024
f85f8bf
fix(sql-parser): fix different query lengths
Zitrone44 Dec 7, 2024
3efc0a3
feat(sql-checker): add error depth
Zitrone44 Dec 9, 2024
13f4d3e
style(sql-checker): fix style
Zitrone44 Dec 9, 2024
5e79525
chore(sql-checker): remove print
Zitrone44 Dec 9, 2024
a1cbece
feat(sql-checker): divide distance by 50 and round
Zitrone44 Dec 9, 2024
3957561
feat(sql-checker): improve feedback
Zitrone44 Dec 10, 2024
403d713
feat(sql-checker): make distance disableable
Zitrone44 Dec 11, 2024
aaae168
fix(sql-checker): move mongomock into dev dependencies
Zitrone44 Dec 11, 2024
4a4e6f1
ci(python): install all dependencies for testing
Zitrone44 Dec 11, 2024
95074e0
ci(python): update to python 3.11
Zitrone44 Dec 11, 2024
e9399c5
ci(python): update other to python 3.11
Zitrone44 Dec 11, 2024
b0785c1
ci(python): fix unittests
Zitrone44 Dec 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions modules/fbs-sql-checker/distance/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__/
102 changes: 102 additions & 0 deletions modules/fbs-sql-checker/distance/attribute_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import sqlparse
import constants as c
import attribute_distance as att_dist
import format as f

# Module-level collectors filled by extract_attributes() through
# _token_iteration(); the "ref" ones describe the reference statement and the
# "query" ones the statement being compared against it.

# Projection attributes in the order they were encountered.
ref_pro_att: list[str] = []
query_pro_att: list[str] = []

# Names of the SELECT functions found in the projection.
ref_cmd_list: list[str] = []
query_cmd_list: list[str] = []

# Two-level maps written by _add_to_map(); the outer key is c.ALIAS or
# c.KEYWORD.
ref_map: dict[str, dict[str, str]] = {}
query_map: dict[str, dict[str, str]] = {}


def extract_attributes(ref, query):
    """Return the projection distance between two tokenized statements.

    Both token sequences are scanned up to their FROM keyword. The projection
    attributes, SELECT functions and alias/DISTINCT information found there
    are stored in the module-level collectors and then compared.

    Args:
        ref: indexable sequence of tokens of the reference statement
            (presumably sqlparse tokens; see _token_iteration).
        query: indexable sequence of tokens of the statement to compare.

    Returns:
        The sum of the attribute, command and keyword distances.
    """
    # The collectors live at module level. Empty them in place so that data
    # from an earlier comparison cannot leak into this one, while any other
    # holder of a reference keeps seeing the same objects.
    for collector in (
        ref_pro_att, query_pro_att,
        ref_cmd_list, query_cmd_list,
        ref_map, query_map,
    ):
        collector.clear()

    _token_iteration(ref, ref_map, ref_pro_att, ref_cmd_list)
    _token_iteration(query, query_map, query_pro_att, query_cmd_list)

    print(f"REF MAP: {ref_map}\nQuery Map: {query_map}\n")
    print("Projection attributes before order: ", ref_pro_att, query_pro_att)

    print(f"COMMAND LIST HERE {ref_cmd_list}, QUERY {query_cmd_list}")

    # get_attributes_distance() may reorder query_pro_att in place, which is
    # why the lists are printed again afterwards.
    attribute_distance = att_dist.get_attributes_distance(ref_pro_att, query_pro_att)

    print("Projection attributes after order: ", ref_pro_att, query_pro_att, "\n")

    command_distance = att_dist.get_command_distance(ref_cmd_list, query_cmd_list)

    keyword_distance = att_dist.get_keyword_distance(ref_map, query_map)

    print(f"attributes: {attribute_distance}, command: {command_distance}, keywordw: {keyword_distance}")

    return attribute_distance + command_distance + keyword_distance


def _token_iteration(tokens, map_dict, pro_att_list, cmd_list):
    """Collect projection information from the tokens that precede FROM.

    Args:
        tokens: indexable sequence of sqlparse tokens of one statement.
        map_dict: map receiving alias and DISTINCT entries.
        pro_att_list: list receiving the projection attributes.
        cmd_list: list receiving the names of SELECT functions.

    The scan stops at the first wildcard (after recording it) or at the FROM
    keyword. Whitespace and newline tokens are skipped.
    """
    for i, token in enumerate(tokens):
        if not isinstance(token, sqlparse.sql.Token):
            continue
        if token.ttype in (sqlparse.tokens.Whitespace, sqlparse.tokens.Newline):
            continue
        if token.ttype == sqlparse.tokens.Keyword and token.value == c.DISTINCT:
            # The DISTINCT operand is looked up two positions further on
            # (keyword, separator, operand). A statement that ends right
            # after DISTINCT has no such token, so skip instead of raising
            # IndexError.
            if i + 2 < len(tokens):
                _extract_keyword(tokens[i + 2], map_dict)
        if token.ttype == sqlparse.tokens.Wildcard:
            pro_att_list.append(token.value)
            break
        if isinstance(token, sqlparse.sql.IdentifierList):
            for identifier in token.get_identifiers():
                _extract_att_and_cmds(identifier, map_dict, pro_att_list, cmd_list)
        if isinstance(token, (
                sqlparse.sql.Identifier, sqlparse.sql.Function, sqlparse.sql.Operation,
                sqlparse.sql.Parenthesis)):
            _extract_att_and_cmds(token, map_dict, pro_att_list, cmd_list)
        if token.ttype == sqlparse.tokens.Keyword and token.value == c.FROM:
            break


def _extract_att_and_cmds(token, map_dict, pro_att_list, cmd_list):
    """Record what a single projection token contributes.

    Args:
        token: one projection element (Identifier, Function, Operation or
            Parenthesis).
        map_dict: map receiving alias and DISTINCT entries.
        pro_att_list: list receiving the projection attribute.
        cmd_list: list receiving the SELECT function name, if any.
    """
    _extract_alias(token, map_dict)
    if isinstance(token, (sqlparse.sql.Operation, sqlparse.sql.Parenthesis)):
        pro_att_list.append(token.value)
    if isinstance(token, sqlparse.sql.Function):
        params = [p.value for p in token.get_parameters()]
        cmd_list.append(token.get_real_name())
        # get_parameters() can come back empty (e.g. a call without
        # arguments); there is then no attribute to record, and indexing
        # would raise IndexError.
        if params:
            pro_att_list.append(params[0])
        if c.DISTINCT in token.value:
            _extract_keyword(f.format_command(token), map_dict)
    if isinstance(token, sqlparse.sql.Identifier):
        if str(token.get_real_name()).upper() in c.SELECT_CMDS:
            # An Identifier whose real name is a known SELECT function:
            # strip the function name, qualifier, DISTINCT and alias to get
            # the bare attribute.
            cmd_list.append(token.get_real_name())
            pro_att_list.append(f.format_alias(f.format_distinct(f.format_db_name(f.format_command(token)))))
            if c.DISTINCT in token.value:
                _extract_keyword(f.format_command(token), map_dict)
        else:
            pro_att_list.append(f.format_distinct(f.format_db_name(f.format_alias(token.value))))


def _extract_alias(ident: sqlparse.sql.Identifier, map_dict):
    """Store the alias of ``ident`` in ``map_dict`` under ``c.ALIAS``.

    The inner key is the token text with its qualifier and alias clause
    removed; the value is the alias itself. Tokens without an alias leave
    ``map_dict`` untouched.
    """
    if not ident.has_alias():
        return
    unqualified = f.format_db_name(ident.value)
    attribute = f.format_alias(unqualified)
    _add_to_map(map_dict, c.ALIAS, attribute, ident.get_alias())


def _extract_keyword(ident, map_dict):
    """Record the operand of DISTINCT in ``map_dict`` under ``c.KEYWORD``.

    Args:
        ident: an IdentifierList, a single sqlparse token, or an already
            extracted string.
        map_dict: map receiving the ``c.DISTINCT`` entry.
    """
    if isinstance(ident, sqlparse.sql.IdentifierList):
        # Every identifier of the list belongs to the DISTINCT keyword; the
        # alias and qualifier are removed from each before joining.
        result = ", ".join(
            f.format_db_name(f.format_alias(identifier.value))
            for identifier in ident.get_identifiers()
        )
        _add_to_map(map_dict, c.KEYWORD, c.DISTINCT, result)
    else:
        # Callers pass either a string or a token. The format helpers apply
        # regular expressions and therefore need the text, otherwise a token
        # argument raises TypeError.
        text = ident if isinstance(ident, str) else ident.value
        # Remove a trailing alias, the qualifier and the DISTINCT keyword so
        # only the attribute is stored.
        updated_value = f.format_distinct(f.format_db_name(f.format_alias(text)))
        _add_to_map(map_dict, c.KEYWORD, c.DISTINCT, updated_value)


def _add_to_map(map_dict, key, inner_key, value):
if key not in map_dict:
map_dict[key] = {}
map_dict[key][inner_key] = value
91 changes: 91 additions & 0 deletions modules/fbs-sql-checker/distance/attribute_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import re
import constants as c
import uuid

# Log of the expression comparisons made by _calculate_expression_similarity():
# short random key -> {"ref": ..., "query": ..., "difference": ...}.
operation_map: dict[str, dict[str, object]] = {}


def get_attributes_distance(ref: list[str], query: list[str]):
    """Return the distance between two projection attribute lists.

    * A wildcard in ``ref`` adds no attribute penalty.
    * If both lists hold the same set of attributes, every position where
      they differ costs ``c.ORDER_MULT`` and ``query`` is reordered in place
      towards the order of ``ref``.
    * Otherwise every attribute of ``ref`` that is absent from ``query``
      costs ``c.OBJECT_MULT``.

    The distance of the arithmetic expressions found in both lists is added
    in every case.
    """
    distance = 0
    if "*" in ref:
        # wildcard projection: attribute differences are ignored
        pass
    elif set(ref) == set(query):
        position = 0
        # The lengths are re-read on every step because query is modified
        # while it is being walked.
        while position < len(ref) and position < len(query):
            expected = ref[position]
            if expected != query[position]:
                distance += c.ORDER_MULT
                # move the attribute to the slot it occupies in the reference
                query.remove(expected)
                query.insert(ref.index(expected), expected)
            position += 1
    else:
        distance += sum(c.OBJECT_MULT for attribute in ref if attribute not in query)

    distance += _get_operation_distance(ref, query)
    print(f"\nOP MAP: {operation_map}\n")
    return distance


def get_command_distance(ref: list[str], query: list[str]):
    """Return the distance between two lists of SELECT function names.

    A different number of functions costs ``c.OBJECT_MULT``; the same number
    but a different set costs ``c.STRUCT_MULT``; otherwise the distance is 0.
    """
    if len(ref) != len(query):
        return c.OBJECT_MULT
    if set(ref) != set(query):
        return c.STRUCT_MULT
    return 0


def get_keyword_distance(ref_map: dict, query_map: dict):
    """Return the distance between the keyword entries of two attribute maps.

    Args:
        ref_map: attribute map of the reference statement.
        query_map: attribute map of the compared statement.

    Returns:
        0 when both maps hold the same keyword operands (including the case
        where neither holds any), otherwise ``c.OBJECT_MULT``.
    """
    # A statement without keywords has no c.KEYWORD entry. Treat that as an
    # empty mapping so that "keyword present on one side only" is detected
    # as a difference instead of being skipped.
    ref_kws: dict = ref_map.get(c.KEYWORD) or {}
    query_kws: dict = query_map.get(c.KEYWORD) or {}

    if set(ref_kws.values()) == set(query_kws.values()):
        return 0
    return c.OBJECT_MULT


def _get_operation_distance(ref_list: list[str], query_list: list[str]):
    """Return the distance between the arithmetic expressions of both lists.

    An entry counts as an expression when ``c.MATH_EXP_REGEX`` matches
    anywhere in it; the selected entries are compared pairwise by
    ``_calculate_expression_similarity``.
    """
    ref_expressions = [
        entry for entry in ref_list if re.search(c.MATH_EXP_REGEX, entry)
    ]
    query_expressions = [
        entry for entry in query_list if re.search(c.MATH_EXP_REGEX, entry)
    ]
    return _calculate_expression_similarity(ref_expressions, query_expressions)

# NOTE: the Jaccard index may not be the best way to measure how far apart two
# mathematical expressions are.
def _calculate_expression_similarity(ref_exp: list[str], query_exp: list[str]):
    """Return the summed Jaccard distance of pairwise expressions.

    Expressions are paired by position (surplus entries of the longer list
    are ignored) and compared as sets of characters with parentheses left
    out. Every comparison is also logged in ``operation_map``.
    """
    total = 0
    for ref_text, query_text in zip(ref_exp, query_exp):
        ref_chars = set(ref_text) - {"(", ")"}
        query_chars = set(query_text) - {"(", ")"}
        all_chars = ref_chars | query_chars
        if not all_chars:
            # nothing left to compare once the parentheses are removed
            continue
        shared = len(ref_chars & query_chars)
        # Jaccard distance = 1 - |intersection| / |union|
        distance = 1 - (shared / len(all_chars))
        _add_to_op_map(operation_map, ref_text, query_text, distance)
        total += distance
    return total


def _add_to_op_map(op_map, ref, query, sim):
generated_uuid = str(uuid.uuid4())
short_id = generated_uuid[:4] # Take the first 8 characters as the short ID
new_entry_key = f"{short_id}"
op_map[new_entry_key] = {
"ref": ref,
"query": query,
"difference": sim
}
60 changes: 60 additions & 0 deletions modules/fbs-sql-checker/distance/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# BASIC SQL KEYWORDS
# Plain strings compared against token text. ALIAS and KEYWORD are also used
# as outer keys of the attribute maps built in attribute_check.py.
DISTINCT = "DISTINCT"
FROM = "FROM"
TABLE = "TABLE"
ALIAS = "ALIAS"
ON = "ON"
WHERE = "WHERE"
GROUP_BY = "GROUP BY"
HAVING = "HAVING"
ORDER_BY = "ORDER BY"
# DESC and ASC are regex patterns, not plain keywords: the inline (?i) flag
# makes them case-insensitive.
DESC = r'(?i)DESC'
ASC = r'(?i)ASC'
KEYWORD = "KEYWORD"
COMMAND = "COMMAND"

# SELECT COMMANDS
# Function names that attribute_check.py recognises in the projection; the
# upper-cased real name of an identifier is looked up in this list.
SELECT_CMDS = [
"SUM",
"COUNT",
"ROUND",
"SEC_TO_TIME",
"AVG",
"MAX",
"MIN",
"ABS",
"TIME_TO_SEC",
"YEAR",
"UPPER",
"LOWER",
"LENGTH"
]

# JOIN TYPES
JOIN_TYPES = [
"INNER JOIN",
"LEFT JOIN",
"RIGHT JOIN",
"FULL JOIN",
"SELF JOIN",
"JOIN"
]

# REGULAR EXPRESSIONS
# An alias clause: whitespace, "as", then a double-quoted name or a single word.
ALIAS_REGEX = r"\sas\s+\"(.+?)\"|\sas\s+(\w+)"
# A qualified name; group 1 is everything after the first dot.
DB_NAME_REGEX = r"^[^.]+\.(.*)$"
# A comparison operator with its surrounding whitespace, captured so that
# re.split() keeps it in the result.
DB_COMP_REGEX = r'(\s*(?:<=|>=|!=|=|<|>)\s*)'
# A single digit, parenthesis or arithmetic operator. NOTE(review): when used
# with search/findall, any text containing one such character matches (for
# example a column name that ends in a digit).
MATH_EXP_REGEX = r"[\d()+\-*\/]"
# "word = word" with optional whitespace around both sides.
EQ_COMP_REGEX = r'\s*\w+\s*=\s*\w+\s*'

# DATABASE DATA
# NOTE(review): connection settings, including the password, are hard-coded
# here; they equal the values in distance/docker-compose.yml. Consider loading
# them from the environment.
HOSTNAME = 'localhost'
DB_NAME = 'postgres'
USERNAME = 'postgres'
PASSWORD = 'admin'
PORT_ID = 5432
Zitrone44 marked this conversation as resolved.
Show resolved Hide resolved

# MULTIPLIERS
# Penalty weights added to the distance in attribute_distance.py and
# equation_checker.py.
ORDER_MULT = 5  # per projection attribute at a different position than in the reference
STRUCT_MULT = 20  # same number of SELECT commands, but a different set of them
OBJECT_MULT = 50  # missing attribute, differing command count, keyword or equation mismatch
46 changes: 46 additions & 0 deletions modules/fbs-sql-checker/distance/db_connection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import psycopg2
import constants as c

connection = None


def setup_db(att_list):
    """Create and populate one single-column table per entry of ``att_list``.

    Tables are named ``A``, ``B``, ... by position (the entries themselves
    are not used) and hold a unique integer column ``x``. With ``n`` tables,
    every integer below ``2 ** n`` is inserted into table number ``j``
    whenever bit ``j`` of that integer is set and the row does not exist yet.

    Returns:
        The psycopg2 connection that was used, after committing.
    """
    with psycopg2.connect(
        host=c.HOSTNAME,
        dbname=c.DB_NAME,
        user=c.USERNAME,
        password=c.PASSWORD,
        port=c.PORT_ID,
    ) as connection:
        cursor = connection.cursor()

        for index, _ in enumerate(att_list):
            table = chr(65 + index)
            constraint = f"{chr(97 + index)}_unique_x"
            cursor.execute(
                f"CREATE TABLE IF NOT EXISTS {table} (x INTEGER, CONSTRAINT {constraint} UNIQUE (x));"
            )

        table_count = len(att_list)
        for number in range(2 ** table_count):
            for bit in range(table_count):
                if not (number >> bit) & 1:
                    continue
                table = chr(65 + bit)
                cursor.execute(f"SELECT * FROM {table} WHERE x = {number};")
                # insert only when the value is not stored yet
                if cursor.fetchone() is None:
                    cursor.execute(f"INSERT INTO {table} (x) VALUES ({number});")
        connection.commit()
    return connection


def execute_query(query, connection: "psycopg2.extensions.connection"):
    """Run ``query`` and return its rows as a set of frozensets.

    Args:
        query: SQL text to execute.
        connection: open psycopg2 connection.

    Returns:
        A set with one frozenset per fetched row, so neither the row order
        nor the column order (nor duplicate values within a row) is kept.
    """
    # The cursor context manager closes the cursor even when execute() or
    # fetchall() raises, so the cursor does not leak.
    with connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
    res_set = {frozenset(row) for row in rows}
    connection.commit()
    return res_set
16 changes: 16 additions & 0 deletions modules/fbs-sql-checker/distance/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
version: '3.8'

services:
postgres:
image: postgres:latest
environment:
POSTGRES_DB: postgres
POSTGRES_USER: postgres
POSTGRES_PASSWORD: admin
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data

volumes:
postgres_data:
21 changes: 21 additions & 0 deletions modules/fbs-sql-checker/distance/equation_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import re
import constants as c
import sympy as s


def check_equation(ref: list[str], query: list[str]):
    """Return the distance between two lists of expressions, paired by position.

    When both entries of a pair start with a ``word = word`` comparison
    (``c.EQ_COMP_REGEX``) they are compared as sets of characters; any other
    pair is compared with ``_check_if_equal``. Every differing pair costs
    ``c.OBJECT_MULT``.
    """
    total = 0
    for ref_exp, query_exp in zip(ref, query):
        both_comparisons = (
            re.match(c.EQ_COMP_REGEX, ref_exp)
            and re.match(c.EQ_COMP_REGEX, query_exp)
        )
        if both_comparisons:
            differs = set(ref_exp) != set(query_exp)
        else:
            differs = not _check_if_equal(ref_exp, query_exp)
        if differs:
            total += c.OBJECT_MULT
    return total


def _check_if_equal(eq1, eq2):
    """Return True when both expressions are mathematically equal.

    Args:
        eq1: first expression as text.
        eq2: second expression as text.

    Returns:
        ``True`` or ``False``. Expressions that SymPy cannot parse are
        compared as plain text instead of raising.
    """
    # NOTE(security): sympify() evaluates its argument with eval(). These
    # strings presumably originate from submitted SQL, so they should be
    # sanitised (or parsed with a restricted parser) before reaching this
    # point. TODO confirm the origin of the callers' input.
    try:
        left = s.simplify(s.sympify(eq1))
        right = s.simplify(s.sympify(eq2))
    except (s.SympifyError, SyntaxError, TypeError):
        # Not a parsable mathematical expression: fall back to comparing the
        # text so that one odd expression does not abort the whole check.
        return str(eq1).strip() == str(eq2).strip()

    # equals() may return None when SymPy cannot decide; report that as
    # "not equal" with a real boolean.
    return bool(left.equals(right))
54 changes: 54 additions & 0 deletions modules/fbs-sql-checker/distance/format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import re
import sqlparse
import constants as c


def format_alias(ident: str):
    """Return ``ident`` without its alias clause(s).

    Every case-insensitive match of ``c.ALIAS_REGEX`` is removed and the
    result is stripped; text without an alias is returned unchanged.
    """
    alias_pattern = re.compile(c.ALIAS_REGEX, re.IGNORECASE)
    without_alias, removed = alias_pattern.subn("", ident)
    if removed == 0:
        return ident
    return without_alias.strip()


def format_db_name(ident: str):
    """Return ``ident`` without its leading qualifier.

    When ``c.DB_NAME_REGEX`` matches, the part captured after the first dot
    is returned; otherwise ``ident`` is returned unchanged.
    """
    qualified = re.search(c.DB_NAME_REGEX, ident)
    return qualified.group(1) if qualified else ident


def format_distinct(ident: str):
    """Return ``ident`` with every ``c.DISTINCT`` occurrence removed and stripped.

    The match is case-sensitive; text without the keyword is returned
    unchanged (and unstripped).
    """
    if c.DISTINCT not in ident:
        return ident
    return ident.replace(c.DISTINCT, "").strip()


def format_command(ident: sqlparse.sql.Identifier):
    """Return the text of ``ident`` without its real name and without parentheses.

    ``get_real_name()`` yields the SELECT command in the callers' use case,
    so what remains is the text that surrounded it.
    """
    without_name = ident.value.replace(ident.get_real_name(), "")
    # drop both kinds of parenthesis in a single pass
    return without_name.translate(str.maketrans("", "", "()"))


def format_comp_db_name(ident: str):
    """Return a comparison with the qualifiers removed from both operands.

    Args:
        ident: text such as ``a.x = b.y``.

    Returns:
        ``"<left> <operator> <right>"`` where each operand keeps only the
        part after its last dot. Text without a comparison operator is
        returned unchanged.
    """
    # The operator group is captured, so the split yields
    # [left, operator, right, ...].
    parts = re.split(c.DB_COMP_REGEX, ident)
    if len(parts) < 3:
        # No comparison operator found: there is no right-hand side to
        # format, and indexing parts[2] would raise IndexError.
        return ident

    # Keep only the text after the last dot on either side.
    left_substring = parts[0].rsplit('.', 1)[-1].strip()
    operator = parts[1].strip()
    right_substring = parts[2].rsplit('.', 1)[-1].strip()

    return f"{left_substring} {operator} {right_substring}"
Loading