Skip to content

Commit

Permalink
robust json parsing & entity extraction progress log
Browse files Browse the repository at this point in the history
  • Loading branch information
rangehow committed Oct 18, 2024
1 parent a96899b commit 7a87465
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 16 deletions.
2 changes: 1 addition & 1 deletion nano_graphrag/_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
already_processed % len(PROMPTS["process_tickers"])
]
print(
f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
end="",
flush=True,
)
Expand Down
98 changes: 83 additions & 15 deletions nano_graphrag/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
logger = logging.getLogger("nano-graphrag")
ENCODER = None


def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
try:
# If there is already an event loop, use it.
Expand All @@ -29,24 +28,93 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
return loop


def locate_json_string_body_from_string(content: str) -> Union[str, None]:
"""Locate the JSON string body from a string"""
maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
if maybe_json_str is not None:
return maybe_json_str.group(0)
else:
def extract_first_complete_json(s: str):
"""Extract the first complete JSON object from the string using a stack to track braces."""
stack = []
first_json_start = None

for i, char in enumerate(s):
if char == '{':
stack.append(i)
if first_json_start is None:
first_json_start = i
elif char == '}':
if stack:
start = stack.pop()
if not stack:
first_json_str = s[first_json_start:i+1]
try:
# Attempt to parse the JSON string
return json.loads(first_json_str.replace("\n", ""))
except json.JSONDecodeError as e:
logger.error(f"JSON decoding failed: {e}. Attempted string: {first_json_str[:50]}...")
return None
finally:
first_json_start = None
logger.warning("No complete JSON object found in the input string.")
return None

def parse_value(value: str):
"""Convert a string value to its appropriate type (int, float, bool, None, or keep as string). Work as a more broad 'eval()'"""
value = value.strip()

if value == "null":
return None
elif value == "true":
return True
elif value == "false":
return False
else:
# Try to convert to int or float
try:
if '.' in value: # If there's a dot, it might be a float
return float(value)
else:
return int(value)
except ValueError:
# If conversion fails, return the value as-is (likely a string)
return value.strip('"') # Remove surrounding quotes if they exist

def extract_values_from_json(json_string, keys=["reasoning", "answer", "data"], allow_no_quotes=False):
"""Extract key values from a non-standard or malformed JSON string, handling nested objects."""
extracted_values = {}

# Enhanced pattern to match both quoted and unquoted values, as well as nested objects
regex_pattern = r'(?P<key>"?\w+"?)\s*:\s*(?P<value>{[^}]*}|".*?"|[^,}]+)'

for match in re.finditer(regex_pattern, json_string, re.DOTALL):
key = match.group('key').strip('"') # Strip quotes from key
value = match.group('value').strip()

# If the value is another nested JSON (starts with '{' and ends with '}'), recursively parse it
if value.startswith('{') and value.endswith('}'):
extracted_values[key] = extract_values_from_json(value)
else:
# Parse the value into the appropriate type (int, float, bool, etc.)
extracted_values[key] = parse_value(value)

if not extracted_values:
logger.warning("No values could be extracted from the string.")

return extracted_values


def convert_response_to_json(response: str) -> dict:
json_str = locate_json_string_body_from_string(response)
assert json_str is not None, f"Unable to parse JSON from response: {response}"
try:
data = json.loads(json_str)
return data
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON: {json_str}")
raise e from None
"""Convert response string to JSON, with error handling and fallback to non-standard JSON extraction."""
prediction_json = extract_first_complete_json(response)

if prediction_json is None:
logger.info("Attempting to extract values from a non-standard JSON string...")
prediction_json = extract_values_from_json(response, allow_no_quotes=True)

if not prediction_json:
logger.error("Unable to extract meaningful data from the response.")
else:
logger.info("JSON data successfully extracted.")

return prediction_json




def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"):
Expand Down
132 changes: 132 additions & 0 deletions tests/test_json_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import unittest
from loguru import logger
from nano_graphrag._utils import convert_response_to_json

class TestJSONExtraction(unittest.TestCase):

def setUp(self):
"""Set up runs before each test case."""
logger.remove()
logger.add(lambda msg: None) # disallow output

def test_standard_json(self):
"""Test standard JSON extraction."""
response = '''
{
"reasoning": "This is a test.",
"answer": 42,
"data": {"key1": "value1", "key2": "value2"}
}
'''
expected = {
"reasoning": "This is a test.",
"answer": 42,
"data": {"key1": "value1", "key2": "value2"}
}
self.assertEqual(convert_response_to_json(response), expected)

def test_non_standard_json_without_quotes(self):
"""Test non-standard JSON without quotes on numbers and booleans."""
response = '''
{
"reasoning": "Boolean and numbers test.",
"answer": 42,
"isCorrect": true,
"data": {key1: value1}
}
'''
expected = {
"reasoning": "Boolean and numbers test.",
"answer": 42,
"isCorrect": True,
"data": {"key1": "value1"}
}
self.assertEqual(convert_response_to_json(response), expected)

def test_nested_json(self):
"""Test extraction of nested JSON objects."""
response = '''
{
"reasoning": "Nested structure.",
"answer": 42,
"data": {"nested": {"key": "value"}}
}
'''
expected = {
"reasoning": "Nested structure.",
"answer": 42,
"data": {
"nested": {"key": "value"}
}
}
self.assertEqual(convert_response_to_json(response), expected)

def test_malformed_json(self):
"""Test handling of malformed JSON."""
response = '''
Some text before JSON
{
"reasoning": "This is malformed.",
"answer": 42,
"data": {"key": "value"}
}
Some text after JSON
'''
expected = {
"reasoning": "This is malformed.",
"answer": 42,
"data": {"key": "value"}
}
self.assertEqual(convert_response_to_json(response), expected)

def test_incomplete_json(self):
"""Test handling of incomplete JSON."""
response = '''
{
"reasoning": "Incomplete structure",
"answer": 42
'''
expected = {
"reasoning": "Incomplete structure",
"answer": 42
}
self.assertEqual(convert_response_to_json(response), expected)

def test_value_with_special_characters(self):
"""Test JSON with special characters in values."""
response = '''
{
"reasoning": "Special characters !@#$%^&*()",
"answer": 42,
"data": {"key": "value with special characters !@#$%^&*()"}
}
'''
expected = {
"reasoning": "Special characters !@#$%^&*()",
"answer": 42,
"data": {"key": "value with special characters !@#$%^&*()"}
}
self.assertEqual(convert_response_to_json(response), expected)

def test_boolean_and_null_values(self):
"""Test JSON with boolean and null values."""
response = '''
{
"reasoning": "Boolean and null test.",
"isCorrect": true,
"isWrong": false,
"unknown": null,
"answer": 42
}
'''
expected = {
"reasoning": "Boolean and null test.",
"isCorrect": True,
"isWrong": False,
"unknown": None,
"answer": 42
}
self.assertEqual(convert_response_to_json(response), expected)

if __name__ == "__main__":
unittest.main()

0 comments on commit 7a87465

Please sign in to comment.