diff --git a/templates/components/googleaistudio.jsonnet b/templates/components/googleaistudio.jsonnet index 04249c16..e4f45ae9 100644 --- a/templates/components/googleaistudio.jsonnet +++ b/templates/components/googleaistudio.jsonnet @@ -13,7 +13,7 @@ local prompts = import "prompts/mixtral.jsonnet"; create:: function(engine) local envSecrets = engine.envSecrets("bedrock-credentials") - .with_env_var("GOOGLEAISTUDIO_KEY", "googleaistudio-key"); + .with_env_var("GOOGLE_AI_STUDIO_KEY", "googleaistudio-key"); local container = engine.container("text-completion") diff --git a/templates/components/prompt-template.jsonnet b/templates/components/prompt-template.jsonnet index 8ba0d17f..cc4195aa 100644 --- a/templates/components/prompt-template.jsonnet +++ b/templates/components/prompt-template.jsonnet @@ -17,22 +17,38 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "prompt-template", "-p", url.pulsar, + "--text-completion-request-queue", "non-persistent://tg/request/text-completion", "--text-completion-response-queue", "non-persistent://tg/response/text-completion-response", - "--definition-template", + + "--system-prompt", + $["system-template"], + + "--prompt", + "question={{question}}", + "extract-definitions=" + $["prompt-definition-template"], - "--relationship-template", + "extract-relationships=" + $["prompt-relationship-template"], - "--topic-template", + "extract-topics=" + $["prompt-topic-template"], - "--knowledge-query-template", + "kg-prompt=" + $["prompt-knowledge-query-template"], - "--document-query-template", + "document-prompt=" + $["prompt-document-query-template"], - "--rows-template", + "extract-rows=" + $["prompt-rows-template"], + + "--prompt-response-type", + "extract-definitions=json", + "extract-relationships=json", + "extract-topics=json", + "kg-prompt=text", + "document-prompt=text", + "extract-rows=json", + ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); @@ -71,18 +87,33 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "non-persistent://tg/request/text-completion-rag", "--text-completion-response-queue", "non-persistent://tg/response/text-completion-rag-response", - "--definition-template", + + "--system-prompt", + $["system-template"], + + "--prompt", + "question={{question}}", + "extract-definitions=" + $["prompt-definition-template"], - "--relationship-template", + "extract-relationships=" + $["prompt-relationship-template"], - "--topic-template", + "extract-topics=" + $["prompt-topic-template"], - "--knowledge-query-template", + "kg-prompt=" + $["prompt-knowledge-query-template"], - "--document-query-template", + "document-prompt=" + $["prompt-document-query-template"], - "--rows-template", + "extract-rows=" + $["prompt-rows-template"], + + "--prompt-response-type", + "extract-definitions=json", + "extract-relationships=json", + "extract-topics=json", + "kg-prompt=text", + "document-prompt=text", + "extract-rows=json", + ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/prompts/default-prompts.jsonnet b/templates/prompts/default-prompts.jsonnet index 6f8c7b7b..aa48412b 100644 --- a/templates/prompts/default-prompts.jsonnet +++ b/templates/prompts/default-prompts.jsonnet @@ -4,16 +4,18 @@ { - "prompt-definition-template":: "\nStudy the following text and derive definitions for any discovered entities.\nDo not provide definitions for entities whose definitions are incomplete\nor unknown.\nOutput relationships in JSON format as an arary of objects with fields:\n- entity: the name of the entity\n- definition: English text which defines the entity\n\n\n\n{text}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract will be written as plain text. Do not add markdown formatting\nor headers or prefixes. Do not include null or unknown definitions.\n", + "system-template":: "You are a helpful assistant.", - "prompt-relationship-template":: "\nStudy the following text and derive entity relationships. For each\nrelationship, derive the subject, predicate and object of the relationship.\nOutput relationships in JSON format as an arary of objects with fields:\n- subject: the subject of the relationship\n- predicate: the predicate\n- object: the object of the relationship\n- object-entity: false if the object is a simple data type: name, value or date. true if it is an entity.\n\n\n\n{text}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract must be written as plain text. Do not add markdown formatting\nor headers or prefixes.\n", + "prompt-definition-template":: "\nStudy the following text and derive definitions for any discovered entities.\nDo not provide definitions for entities whose definitions are incomplete\nor unknown.\nOutput relationships in JSON format as an arary of objects with fields:\n- entity: the name of the entity\n- definition: English text which defines the entity\n\n\n\n{{text}}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract will be written as plain text. Do not add markdown formatting\nor headers or prefixes. Do not include null or unknown definitions.\n", - "prompt-topic-template":: "You are a helpful assistant that performs information extraction tasks for a provided text.\nRead the provided text. You will identify topics and their definitions in JSON.\n\nReading Instructions:\n- Ignore document formatting in the provided text.\n- Study the provided text carefully.\n\nHere is the text:\n{text}\n\nResponse Instructions: \n- Do not respond with special characters.\n- Return only topics that are concepts and unique to the provided text.\n- Respond only with well-formed JSON.\n- The JSON response shall be an array of objects with keys \"topic\" and \"definition\". \n- The JSON response shall use the following structure:\n\n```json\n[{{\"topic\": string, \"definition\": string}}]\n```\n\n- Do not write any additional text or explanations.", + "prompt-relationship-template":: "\nStudy the following text and derive entity relationships. For each\nrelationship, derive the subject, predicate and object of the relationship.\nOutput relationships in JSON format as an arary of objects with fields:\n- subject: the subject of the relationship\n- predicate: the predicate\n- object: the object of the relationship\n- object-entity: false if the object is a simple data type: name, value or date. true if it is an entity.\n\n\n\n{{text}}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract must be written as plain text. Do not add markdown formatting\nor headers or prefixes.\n", - "prompt-knowledge-query-template":: "Study the following set of knowledge statements. The statements are written in Cypher format that has been extracted from a knowledge graph. Use only the provided set of knowledge statements in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere's the knowledge statements:\n{graph}\n\nUse only the provided knowledge statements to respond to the following:\n{query}\n", + "prompt-topic-template":: "You are a helpful assistant that performs information extraction tasks for a provided text.\nRead the provided text. You will identify topics and their definitions in JSON.\n\nReading Instructions:\n- Ignore document formatting in the provided text.\n- Study the provided text carefully.\n\nHere is the text:\n{{text}}\n\nResponse Instructions: \n- Do not respond with special characters.\n- Return only topics that are concepts and unique to the provided text.\n- Respond only with well-formed JSON.\n- The JSON response shall be an array of objects with keys \"topic\" and \"definition\". \n- The JSON response shall use the following structure:\n\n```json\n[{\"topic\": string, \"definition\": string}]\n```\n\n- Do not write any additional text or explanations.", - "prompt-document-query-template":: "Study the following context. Use only the information provided in the context in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere is the context:\n{documents}\n\nUse only the provided knowledge statements to respond to the following:\n{query}\n", + "prompt-knowledge-query-template":: "Study the following set of knowledge statements. The statements are written in Cypher format that has been extracted from a knowledge graph. Use only the provided set of knowledge statements in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere's the knowledge statements:\n{% for edge in knowledge %}({{edge.s}})-[{{edge.p}}]->({{edge.o}})\n{%endfor%}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n", - "prompt-rows-template":: "\nStudy the following text and derive objects which match the schema provided.\n\nYou must output an array of JSON objects for each object you discover\nwhich matches the schema. For each object, output a JSON object whose fields\ncarry the name field specified in the schema.\n\n\n\n{schema}\n\n\n\n{text}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not add markdown formatting or headers or prefixes.\n", + "prompt-document-query-template":: "Study the following context. Use only the information provided in the context in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere is the context:\n{{documents}}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n", + + "prompt-rows-template":: "\nStudy the following text and derive objects which match the schema provided.\n\nYou must output an array of JSON objects for each object you discover\nwhich matches the schema. For each object, output a JSON object whose fields\ncarry the name field specified in the schema.\n\n\n\n{{schema}}\n\n\n\n{{text}}\n\n\n\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not add markdown formatting or headers or prefixes.\n", } \ No newline at end of file diff --git a/tests/README.prompts b/tests/README.prompts new file mode 100644 index 00000000..7a17affe --- /dev/null +++ b/tests/README.prompts @@ -0,0 +1,27 @@ + +test-prompt-... is tested with this prompt set... + +prompt-template \ + -p pulsar://localhost:6650 \ + --system-prompt 'You are a {{attitude}}, you are called {{name}}' \ + --global-term \ + 'name=Craig' \ + 'attitude=LOUD, SHOUTY ANNOYING BOT' \ + --prompt \ + 'question={{question}}' \ + 'french-question={{question}}' \ + "analyze=Find the name and age in this text, and output a JSON structure containing just the name and age fields: {{description}}. Don't add markup, just output the raw JSON object." \ + "graph-query=Study the following knowledge graph, and then answer the question.\\n\nGraph:\\n{% for edge in knowledge %}({{edge.0}})-[{{edge.1}}]->({{edge.2}})\\n{%endfor%}\\nQuestion:\\n{{question}}" \ + "extract-definition=Analyse the text provided, and then return a list of terms and definitions. The output should be a JSON array, each item in the array is an object with fields 'term' and 'definition'.Don't add markup, just output the raw JSON object. Here is the text:\\n{{text}}" \ + --prompt-response-type \ + 'question=text' \ + 'analyze=json' \ + 'graph-query=text' \ + 'extract-definition=json' \ + --prompt-term \ + 'question=name:Bonny' \ + 'french-question=attitude:French-speaking bot' \ + --prompt-schema \ + 'analyze={ "type" : "object", "properties" : { "age": { "type" : "number" }, "name": { "type" : "string" } } }' \ + 'extract-definition={ "type": "array", "items": { "type": "object", "properties": { "term": { "type": "string" }, "definition": { "type": "string" } }, "required": [ "term", "definition" ] } }' + diff --git a/tests/test-lang-definition b/tests/test-lang-definition index c6e593fd..67342779 100755 --- a/tests/test-lang-definition +++ b/tests/test-lang-definition @@ -7,7 +7,13 @@ p = PromptClient(pulsar_host="pulsar://localhost:6650") chunk = """I noticed a cat in my garden. It is a four-legged animal which is a mammal and can be tame or wild. I wonder if it will be friends -with me. I think the cat's name is Fred and it has 4 legs""" +with me. I think the cat's name is Fred and it has 4 legs. + +A cat is a small mammal. + +A grapefruit is a citrus fruit. + +""" resp = p.request_definitions( chunk=chunk, diff --git a/tests/test-lang-topics b/tests/test-lang-topics new file mode 100755 index 00000000..2b668524 --- /dev/null +++ b/tests/test-lang-topics @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +import pulsar +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +chunk = """I noticed a cat in my garden. It is a four-legged animal +which is a mammal and can be tame or wild. I wonder if it will be friends +with me. I think the cat's name is Fred and it has 4 legs""" + +resp = p.request_topics( + chunk=chunk, +) + +for d in resp: + print(d.topic) + print(" ", d.definition) + diff --git a/tests/test-prompt-analyze b/tests/test-prompt-analyze new file mode 100755 index 00000000..53c1d76f --- /dev/null +++ b/tests/test-prompt-analyze @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import json +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +description = """Fred is a 4-legged cat who is 12 years old""" + +resp = p.request( + id="analyze", + terms = { + "description": description, + } +) + +print(json.dumps(resp, indent=4)) + diff --git a/tests/test-prompt-extraction b/tests/test-prompt-extraction new file mode 100755 index 00000000..c73bd2e2 --- /dev/null +++ b/tests/test-prompt-extraction @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import json +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +chunk=""" + The Space Shuttle was a reusable spacecraft that transported astronauts and cargo to and from Earth's orbit. It was designed to launch like a rocket, maneuver in orbit like a spacecraft, and land like an airplane. The Space Shuttle was NASA's space transportation system and was used for many purposes, including: + + Carrying astronauts + The Space Shuttle could carry up to seven astronauts at a time. + +Launching, recovering, and repairing satellites +The Space Shuttle could launch satellites into orbit, recover them, and repair them. +Building the International Space Station +The Space Shuttle carried large parts into space to build the International Space Station. +Conducting research +Astronauts conducted experiments in the Space Shuttle, which was like a science lab in space. + +The Space Shuttle was retired in 2011 after the Columbia accident in 2003. The Columbia Accident Investigation Board report found that the Space Shuttle was unsafe and expensive to make safe. +Here are some other facts about the Space Shuttle: + + The Space Shuttle was 184 ft tall and had a diameter of 29 ft. + +The Space Shuttle had a mass of 4,480,000 lb. +The Space Shuttle's first flight was on April 12, 1981. +The Space Shuttle's last mission was in 2011. +""" + +q = "Tell me some facts in the knowledge graph" + +resp = p.request( + id="extract-definition", + terms = { + "text": chunk, + } +) + +print(resp) + +for fact in resp: + print(fact["term"], "::") + print(fact["definition"]) + print() + diff --git a/tests/test-prompt-french-question b/tests/test-prompt-french-question new file mode 100755 index 00000000..4417cf41 --- /dev/null +++ b/tests/test-prompt-french-question @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import pulsar +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +question = """What is the square root of 16?""" + +resp = p.request( + id="french-question", + terms = { + "question": question + } +) + +print(resp) + diff --git a/tests/test-prompt-knowledge b/tests/test-prompt-knowledge new file mode 100755 index 00000000..b1b94983 --- /dev/null +++ b/tests/test-prompt-knowledge @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import json +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +knowledge = [ + ("accident", "evoked", "a wide range of deeply felt public responses"), + ("Space Shuttle concept", "had", "genesis"), + ("Commission", "had", "a mandate to develop recommendations for corrective or other action based upon the Commission's findings and determinations"), + ("Commission", "established", "teams of persons"), + ("Space Shuttle Challenger", "http://www.w3.org/2004/02/skos/core#definition", "A space shuttle that was destroyed in an accident during mission 51-L."), + ("The mid fuselage", "contains", "the payload bay"), + ("Volume I", "contains", "Chapter IX"), + ("accident", "resulted in", "firm national resolve that those men and women be forever enshrined in the annals of American heroes"), + ("Volume I", "contains", "Chapter VII"), + ("Volume I", "contains", "Chapter II"), + ("Volume I", "contains", "Chapter V"), + ("Commission", "believes", "its investigation and report have been responsive to the request of the President and hopes that they will serve the best interests of the nation in restoring the United States space program to its preeminent position in the world"), + ("Commission", "construe", "mandate"), + ("accident", "became", "a milestone on the way to achieving the full potential that space offers to mankind"), + ("Volume I", "contains", "The Commission"), + ("Commission", "http://www.w3.org/2004/02/skos/core#definition", "A group established to investigate the space shuttle accident"), + ("Volume I", "contains", "Appendix D"), + ("Commission", "had", "a mandate to review the circumstances surrounding the accident to establish the probable cause or causes of the accident"), + ("Volume I", "contains", "Recommendations") +] + +q = "Tell me some facts in the knowledge graph" + +resp = p.request( + id="graph-query", + terms = { + "name": "Jayney", + "knowledge": knowledge, + "question": q + } +) + +print(resp) + + + diff --git a/tests/test-prompt-question b/tests/test-prompt-question new file mode 100755 index 00000000..50660965 --- /dev/null +++ b/tests/test-prompt-question @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import pulsar +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +question = """What is the square root of 16?""" + +resp = p.request( + id="question", + terms = { + "question": question + } +) + +print(resp) + diff --git a/tests/test-prompt-spanish-question b/tests/test-prompt-spanish-question new file mode 100755 index 00000000..e55a174b --- /dev/null +++ b/tests/test-prompt-spanish-question @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +import pulsar +from trustgraph.clients.prompt_client import PromptClient + +p = PromptClient(pulsar_host="pulsar://localhost:6650") + +question = """What is the square root of 16?""" + +resp = p.request( + id="question", + terms = { + "question": question, + "attitude": "Spanish-speaking bot" + } +) + +print(resp) + diff --git a/trustgraph-base/trustgraph/clients/prompt_client.py b/trustgraph-base/trustgraph/clients/prompt_client.py index f7f5a3ef..b43a854d 100644 --- a/trustgraph-base/trustgraph/clients/prompt_client.py +++ b/trustgraph-base/trustgraph/clients/prompt_client.py @@ -1,7 +1,9 @@ import _pulsar +import json +import dataclasses -from .. schema import PromptRequest, PromptResponse, Fact, RowSchema, Field +from .. schema import PromptRequest, PromptResponse from .. schema import prompt_request_queue from .. schema import prompt_response_queue from . base import BaseClient @@ -12,6 +14,23 @@ INFO=_pulsar.LoggerLevel.Info DEBUG=_pulsar.LoggerLevel.Debug +@dataclasses.dataclass +class Definition: + name: str + definition: str + +@dataclasses.dataclass +class Relationship: + s: str + p: str + o: str + o_entity: str + +@dataclasses.dataclass +class Topic: + topic: str + definition: str + class PromptClient(BaseClient): def __init__( @@ -38,63 +57,116 @@ def __init__( output_schema=PromptResponse, ) - def request_definitions(self, chunk, timeout=300): + def request(self, id, terms, timeout=300): - return self.call( - kind="extract-definitions", chunk=chunk, + resp = self.call( + id=id, + terms={ + k: json.dumps(v) + for k, v in terms.items() + }, timeout=timeout - ).definitions - - def request_topics(self, chunk, timeout=300): + ) + + if resp.text: return resp.text - return self.call( - kind="extract-topics", chunk=chunk, + return json.loads(resp.object) + + def request_definitions(self, chunk, timeout=300): + + defs = self.request( + id="extract-definitions", + terms={ + "text": chunk + }, timeout=timeout - ).topics + ) + + return [ + Definition(name=d["entity"], definition=d["definition"]) + for d in defs + ] def request_relationships(self, chunk, timeout=300): - return self.call( - kind="extract-relationships", chunk=chunk, + rels = self.request( + id="extract-relationships", + terms={ + "text": chunk + }, timeout=timeout - ).relationships + ) + + return [ + Relationship( + s=d["subject"], + p=d["predicate"], + o=d["object"], + o_entity=d["object-entity"] + ) + for d in rels + ] + + def request_topics(self, chunk, timeout=300): + + topics = self.request( + id="extract-topics", + terms={ + "text": chunk + }, + timeout=timeout + ) + + return [ + Topic(topic=d["topic"], definition=d["definition"]) + for d in topics + ] def request_rows(self, schema, chunk, timeout=300): - return self.call( - kind="extract-rows", chunk=chunk, - row_schema=RowSchema( - name=schema.name, - description=schema.description, - fields=[ - Field( - name=f.name, type=str(f.type), size=f.size, - primary=f.primary, description=f.description, - ) - for f in schema.fields - ] - ), + return self.request( + id="extract-rows", + terms={ + "chunk": chunk, + "row-schema": { + "name": schema.name, + "description": schema.description, + "fields": [ + { + "name": f.name, "type": str(f.type), + "size": f.size, "primary": f.primary, + "description": f.description, + } + for f in schema.fields + ] + } + }, timeout=timeout - ).rows + ) def request_kg_prompt(self, query, kg, timeout=300): - return self.call( - kind="kg-prompt", - query=query, - kg=[ - Fact(s=v[0], p=v[1], o=v[2]) - for v in kg - ], + return self.request( + id="kg-prompt", + terms={ + "query": query, + "knowledge": [ + { "s": v[0], "p": v[1], "o": v[2] } + for v in kg + ] + }, timeout=timeout - ).answer + ) def request_document_prompt(self, query, documents, timeout=300): - return self.call( - kind="document-prompt", - query=query, - documents=documents, + return self.request( + id="document-prompt", + terms={ + "query": query, + "documents": documents, + }, timeout=timeout - ).answer + ) + diff --git a/trustgraph-base/trustgraph/schema/prompt.py b/trustgraph-base/trustgraph/schema/prompt.py index c7dbfd43..9bcdf117 100644 --- a/trustgraph-base/trustgraph/schema/prompt.py +++ b/trustgraph-base/trustgraph/schema/prompt.py @@ -39,20 +39,21 @@ class Fact(Record): # schema, chunk -> rows class PromptRequest(Record): - kind = String() - chunk = String() - query = String() - kg = Array(Fact()) - documents = Array(Bytes()) - row_schema = RowSchema() + id = String() + + # JSON encoded values + terms = Map(String()) class PromptResponse(Record): + + # Error case error = Error() - answer = String() - definitions = Array(Definition()) - topics = Array(Topic()) - relationships = Array(Relationship()) - rows = Array(Map(String())) + + # Just plain text + text = String() + + # JSON encoded + object = String() prompt_request_queue = topic( 'prompt', kind='non-persistent', namespace='request' diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 9b8da4af..b8cf40b4 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -56,6 +56,8 @@ "neo4j", "tiktoken", "google-generativeai", + "ibis", + "jsonschema", ], scripts=[ "scripts/chunker-recursive", diff --git a/trustgraph-flow/trustgraph/model/prompt/template/README.md b/trustgraph-flow/trustgraph/model/prompt/template/README.md new file mode 100644 index 00000000..0b98e906 --- /dev/null +++ b/trustgraph-flow/trustgraph/model/prompt/template/README.md @@ -0,0 +1,25 @@ + +prompt-template \ + -p pulsar://localhost:6650 \ + --system-prompt 'You are a {{attitude}}, you are called {{name}}' \ + --global-term \ + 'name=Craig' \ + 'attitude=LOUD, SHOUTY ANNOYING BOT' \ + --prompt \ + 'question={{question}}' \ + 'french-question={{question}}' \ + "analyze=Find the name and age in this text, and output a JSON structure containing just the name and age fields: {{description}}. Don't add markup, just output the raw JSON object." \ + "graph-query=Study the following knowledge graph, and then answer the question.\\n\nGraph:\\n{% for edge in knowledge %}({{edge.0}})-[{{edge.1}}]->({{edge.2}})\\n{%endfor%}\\nQuestion:\\n{{question}}" \ + "extract-definition=Analyse the text provided, and then return a list of terms and definitions. The output should be a JSON array, each item in the array is an object with fields 'term' and 'definition'.Don't add markup, just output the raw JSON object. Here is the text:\\n{{text}}" \ + --prompt-response-type \ + 'question=text' \ + 'analyze=json' \ + 'graph-query=text' \ + 'extract-definition=json' \ + --prompt-term \ + 'question=name:Bonny' \ + 'french-question=attitude:French-speaking bot' \ + --prompt-schema \ + 'analyze={ "type" : "object", "properties" : { "age": { "type" : "number" }, "name": { "type" : "string" } } }' \ + 'extract-definition={ "type": "array", "items": { "type": "object", "properties": { "term": { "type": "string" }, "definition": { "type": "string" } }, "required": [ "term", "definition" ] } }' + diff --git a/trustgraph-flow/trustgraph/model/prompt/template/prompt_manager.py b/trustgraph-flow/trustgraph/model/prompt/template/prompt_manager.py new file mode 100644 index 00000000..d8a032ca --- /dev/null +++ b/trustgraph-flow/trustgraph/model/prompt/template/prompt_manager.py @@ -0,0 +1,95 @@ + +import ibis +import json +from jsonschema import validate +import re + +from trustgraph.clients.llm_client import LlmClient + +class PromptConfiguration: + def __init__(self, system_template, global_terms={}, prompts={}): + self.system_template = system_template + self.global_terms = global_terms + self.prompts = prompts + +class Prompt: + def __init__(self, template, response_type = "text", terms=None, schema=None): + self.template = template + self.response_type = response_type + self.terms = terms + self.schema = schema + +class PromptManager: + + def __init__(self, llm, config): + self.llm = llm + self.config = config + self.terms = config.global_terms + + self.prompts = config.prompts + + try: + self.system_template = ibis.Template(config.system_template) + except: + raise RuntimeError("Error in system template") + + self.templates = {} + for k, v in self.prompts.items(): + try: + self.templates[k] = ibis.Template(v.template) + except: + raise RuntimeError(f"Error in template: {k}") + + if v.terms is None: + v.terms = {} + + def parse_json(self, text): + json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL) + + if json_match: + json_str = json_match.group(1).strip() + else: + # If no delimiters, assume the entire output is JSON + json_str = text.strip() + + return json.loads(json_str) + + def invoke(self, id, input): + + if id not in self.prompts: + raise RuntimeError("ID invalid") + + terms = self.terms | self.prompts[id].terms | input + + resp_type = self.prompts[id].response_type + + prompt = { + "system": self.system_template.render(terms), + "prompt": self.templates[id].render(terms) + } + + resp = self.llm.request(**prompt) + + print(resp, flush=True) + + if resp_type == "text": + return resp + + if resp_type != "json": + raise RuntimeError(f"Response type {resp_type} not known") + + try: + obj = self.parse_json(resp) + except: + raise RuntimeError("JSON parse fail") + + print(obj, flush=True) + if self.prompts[id].schema: + try: + print(self.prompts[id].schema) + validate(instance=obj, schema=self.prompts[id].schema) + except Exception as e: + raise RuntimeError(f"Schema validation fail: {e}") + + return obj + diff --git a/trustgraph-flow/trustgraph/model/prompt/template/prompts.py b/trustgraph-flow/trustgraph/model/prompt/template/prompts.py deleted file mode 100644 index e3148157..00000000 --- a/trustgraph-flow/trustgraph/model/prompt/template/prompts.py +++ /dev/null @@ -1,47 +0,0 @@ - -def to_relationships(template, text): - return template.format(text=text) - -def to_definitions(template, text): - return template.format(text=text) - -def to_topics(template, text): - return template.format(text=text) - -def to_rows(template, schema, text): - - field_schema = [ - f"- Name: {f.name}\n Type: {f.type}\n Definition: {f.description}" - for f in schema.fields - ] - - field_schema = "\n".join(field_schema) - - return template.format(schema=schema, text=text) - - schema = f"""Object name: {schema.name} -Description: {schema.description} - -Fields: -{schema}""" - - prompt = f"""""" - - return prompt - -def get_cypher(kg): - sg2 = [] - for f in kg: - sg2.append(f"({f.s})-[{f.p}]->({f.o})") - kg = "\n".join(sg2) - kg = kg.replace("\\", "-") - return kg - -def to_kg_query(template, query, kg): - cypher = get_cypher(kg) - return template.format(query=query, graph=cypher) - -def to_document_query(template, query, docs): - docs = "\n\n".join(docs) - return template.format(query=query, documents=docs) - diff --git a/trustgraph-flow/trustgraph/model/prompt/template/service.py b/trustgraph-flow/trustgraph/model/prompt/template/service.py index 14b65d5a..eea6b8c4 100755 --- a/trustgraph-flow/trustgraph/model/prompt/template/service.py +++ b/trustgraph-flow/trustgraph/model/prompt/template/service.py @@ -16,8 +16,7 @@ from .... base import ConsumerProducer from .... clients.llm_client import LlmClient -from . prompts import to_definitions, to_relationships, to_rows -from . prompts import to_kg_query, to_document_query, to_topics +from . prompt_manager import PromptConfiguration, Prompt, PromptManager module = ".".join(__name__.split(".")[1:-1]) @@ -29,6 +28,82 @@ class Processor(ConsumerProducer): def __init__(self, **params): + prompt_base = {} + + # Parsing the prompt information to the prompt configuration + # structure + prompt_arg = params.get("prompt", []) + if prompt_arg: + for p in prompt_arg: + toks = p.split("=", 1) + if len(toks) < 2: + raise RuntimeError(f"Prompt string not well-formed: {p}") + prompt_base[toks[0]] = { + "template": toks[1] + } + + prompt_response_type_arg = params.get("prompt_response_type", []) + if prompt_response_type_arg: + for p in prompt_response_type_arg: + toks = p.split("=", 1) + if len(toks) < 2: + raise RuntimeError(f"Response type not well-formed: {p}") + if toks[0] not in prompt_base: + raise RuntimeError(f"Response-type, {toks[0]} not known") + prompt_base[toks[0]]["response_type"] = toks[1] + + prompt_schema_arg = params.get("prompt_schema", []) + if prompt_schema_arg: + for p in prompt_schema_arg: + toks = p.split("=", 1) + if len(toks) < 2: + raise RuntimeError(f"Schema arg not well-formed: {p}") + if toks[0] not in prompt_base: + raise RuntimeError(f"Schema, {toks[0]} not known") + try: + prompt_base[toks[0]]["schema"] = json.loads(toks[1]) + except: + raise RuntimeError(f"Failed to parse JSON schema: {p}") + + prompt_term_arg = params.get("prompt_term", []) + if prompt_term_arg: + for p in prompt_term_arg: + toks = p.split("=", 1) + if len(toks) < 2: + raise RuntimeError(f"Term arg not well-formed: {p}") + if toks[0] not in prompt_base: + raise RuntimeError(f"Term, {toks[0]} not known") + kvtoks = toks[1].split(":", 1) + if len(kvtoks) < 2: + raise RuntimeError(f"Term not well-formed: {toks[1]}") + k, v = kvtoks + if "terms" not in prompt_base[toks[0]]: + prompt_base[toks[0]]["terms"] = {} + prompt_base[toks[0]]["terms"][k] = v + + global_terms = {} + + global_term_arg = params.get("global_term", []) + if global_term_arg: + for t in global_term_arg: + toks = t.split("=", 1) + if len(toks) < 2: + raise RuntimeError(f"Global term arg not well-formed: {t}") + global_terms[toks[0]] = toks[1] + + print(global_terms) + + prompts = { + k: Prompt(**v) + for k, v in prompt_base.items() + } + + prompt_configuration = PromptConfiguration( + system_template = params.get("system_prompt", ""), + global_terms = global_terms, + prompts = prompts + ) + input_queue = params.get("input_queue", default_input_queue) output_queue = params.get("output_queue", default_output_queue) subscriber = params.get("subscriber", default_subscriber) @@ -64,23 +139,21 @@ def __init__(self, **params): pulsar_host = self.pulsar_host ) - self.definition_template = definition_template - self.topic_template = topic_template - self.relationship_template = relationship_template - self.rows_template = rows_template - self.knowledge_query_template = knowledge_query_template - self.document_query_template = document_query_template + # System prompt hack + class Llm: + def __init__(self, llm): + self.llm = llm + def request(self, system, prompt): + print(system) + print(prompt, flush=True) + return self.llm.request(system + "\n\n" + prompt) - def parse_json(self, text): - json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL) - - if json_match: - json_str = json_match.group(1).strip() - else: - # If no delimiters, assume the entire output is JSON - json_str = text.strip() + self.llm = Llm(self.llm) - return json.loads(json_str) + self.manager = PromptManager( + llm = self.llm, + config = prompt_configuration, + ) def handle(self, msg): @@ -90,291 +163,52 @@ def handle(self, msg): id = msg.properties()["id"] - kind = v.kind - - print(f"Handling kind {kind}...", flush=True) - - if kind == "extract-definitions": - - self.handle_extract_definitions(id, v) - return - - elif kind == "extract-topics": - - self.handle_extract_topics(id, v) - return - - elif kind == "extract-relationships": - - self.handle_extract_relationships(id, v) - return - - elif kind == "extract-rows": - - self.handle_extract_rows(id, v) - return - - elif kind == "kg-prompt": - - self.handle_kg_prompt(id, v) - return - - elif kind == "document-prompt": - - self.handle_document_prompt(id, v) - return - - else: - - print("Invalid kind.", flush=True) - return - - def handle_extract_definitions(self, id, v): - - try: - - prompt = to_definitions(self.definition_template, v.chunk) - - ans = self.llm.request(prompt) - - # Silently ignore JSON parse error - try: - defs = self.parse_json(ans) - except: - print("JSON parse error, ignored", flush=True) - defs = [] - - output = [] - - for defn in defs: - - try: - e = defn["entity"] - d = defn["definition"] - - if e == "": continue - if e is None: continue - if d == "": continue - if d is None: continue - - output.append( - Definition( - name=e, definition=d - ) - ) - - except: - print("definition fields missing, ignored", flush=True) - - print("Send response...", flush=True) - r = PromptResponse(definitions=output, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) - - except Exception as e: - - print(f"Exception: {e}") - - print("Send error response...", flush=True) - - r = PromptResponse( - error=Error( - type = "llm-error", - message = str(e), - ), - response=None, - ) - - self.producer.send(r, properties={"id": id}) - - def handle_extract_topics(self, id, v): - - try: - - prompt = to_topics(self.topic_template, v.chunk) - - ans = self.llm.request(prompt) - - # Silently ignore JSON parse error - try: - defs = self.parse_json(ans) - except: - print("JSON parse error, ignored", flush=True) - defs = [] - - output = [] - - for defn in defs: - - try: - e = defn["topic"] - d = defn["definition"] - - if e == "": continue - if e is None: continue - if d == "": continue - if d is None: continue - - output.append( - Topic( - name=e, definition=d - ) - ) - - except: - print("definition fields missing, ignored", flush=True) - - print("Send response...", flush=True) - r = PromptResponse(topics=output, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) - - except Exception as e: - - print(f"Exception: {e}") - - print("Send error response...", flush=True) - - r = PromptResponse( - error=Error( - type = "llm-error", - message = str(e), - ), - response=None, - ) - - self.producer.send(r, properties={"id": id}) - - - def handle_extract_relationships(self, id, v): - - try: - - prompt = to_relationships(self.relationship_template, v.chunk) - - ans = self.llm.request(prompt) - - # Silently ignore JSON parse error - try: - defs = self.parse_json(ans) - except: - print("JSON parse error, ignored", flush=True) - defs = [] - - output = [] - - for defn in defs: - - try: - - s = defn["subject"] - p = defn["predicate"] - o = defn["object"] - o_entity = defn["object-entity"] - - if s == "": continue - if s is None: continue - - if p == "": continue - if p is None: continue - - if o == "": continue - if o is None: continue - - if o_entity == "" or o_entity is None: - o_entity = False - - output.append( - Relationship( - s = s, - p = p, - o = o, - o_entity = o_entity, - ) - ) - - except Exception as e: - print("relationship fields missing, ignored", flush=True) - - print("Send response...", flush=True) - r = PromptResponse(relationships=output, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) - - except Exception as e: - - print(f"Exception: {e}") - - print("Send error response...", flush=True) - - r = PromptResponse( - error=Error( - type = "llm-error", - message = str(e), - ), - response=None, - ) - - self.producer.send(r, properties={"id": id}) - - def handle_extract_rows(self, id, v): + kind = v.id try: - fields = v.row_schema.fields - - prompt = to_rows(self.rows_template, v.row_schema, v.chunk) - - print(prompt) - - ans = self.llm.request(prompt) + print(v.terms) - print(ans) - - # Silently ignore JSON parse error - try: - objs = self.parse_json(ans) - except: - print("JSON parse error, ignored", flush=True) - objs = [] - - output = [] - - for obj in objs: + input = { + k: json.loads(v) + for k, v in v.terms.items() + } + + print(f"Handling kind {kind}...", flush=True) + print(input, flush=True) - try: + resp = self.manager.invoke(kind, input) - row = {} + if isinstance(resp, str): - for f in fields: + print("Send text response...", flush=True) + print(resp, flush=True) - if f.name not in obj: - print(f"Object ignored, missing field {f.name}") - row = {} - break + r = PromptResponse( + text=resp, + object=None, + error=None, + ) - row[f.name] = obj[f.name] + self.producer.send(r, properties={"id": id}) - if row == {}: - continue + return - output.append(row) + else: - except Exception as e: - print("row fields missing, ignored", flush=True) + print("Send object response...", flush=True) + print(json.dumps(resp, indent=4), flush=True) - for row in output: - print(row) + r = PromptResponse( + text=None, + object=json.dumps(resp), + error=None, + ) - print("Send response...", flush=True) - r = PromptResponse(rows=output, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) + self.producer.send(r, properties={"id": id}) + return + except Exception as e: print(f"Exception: {e}") @@ -390,24 +224,6 @@ def handle_extract_rows(self, id, v): ) self.producer.send(r, properties={"id": id}) - - def handle_kg_prompt(self, id, v): - - try: - - prompt = to_kg_query(self.knowledge_query_template, v.query, v.kg) - - print(prompt) - - ans = self.llm.request(prompt) - - print(ans) - - print("Send response...", flush=True) - r = PromptResponse(answer=ans, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) except Exception as e: @@ -424,43 +240,7 @@ def handle_kg_prompt(self, id, v): ) self.producer.send(r, properties={"id": id}) - - def handle_document_prompt(self, id, v): - - try: - - prompt = to_document_query( - self.document_query_template, v.query, v.documents - ) - - print(prompt) - ans = self.llm.request(prompt) - - print(ans) - - print("Send response...", flush=True) - r = PromptResponse(answer=ans, error=None) - self.producer.send(r, properties={"id": id}) - - print("Done.", flush=True) - - except Exception as e: - - print(f"Exception: {e}") - - print("Send error response...", flush=True) - - r = PromptResponse( - error=Error( - type = "llm-error", - message = str(e), - ), - response=None, - ) - - self.producer.send(r, properties={"id": id}) - @staticmethod def add_args(parser): @@ -482,39 +262,33 @@ def add_args(parser): ) parser.add_argument( - '--definition-template', - required=True, - help=f'Definition extraction template', + '--prompt', nargs='*', + help=f'Prompt template form id=template', ) parser.add_argument( - '--topic-template', - required=True, - help=f'Topic extraction template', + '--prompt-response-type', nargs='*', + help=f'Prompt response type, form id=json|text', ) parser.add_argument( - '--rows-template', - required=True, - help=f'Rows extraction template', + '--prompt-term', nargs='*', + help=f'Prompt response type, form id=key:value', ) parser.add_argument( - '--relationship-template', - required=True, - help=f'Relationship extraction template', + '--prompt-schema', nargs='*', + help=f'Prompt response schema, form id=schema', ) parser.add_argument( - '--knowledge-query-template', - required=True, - help=f'Knowledge query template', + '--system-prompt', + help=f'System prompt template', ) parser.add_argument( - '--document-query-template', - required=True, - help=f'Document query template', + '--global-term', nargs='+', + help=f'Global term, form key:value' ) def run():