From b5f17fdc625f1ac5855311a637af9c457b7140d4 Mon Sep 17 00:00:00 2001
From: Kavitha Srinivas
Date: Mon, 11 May 2020 16:20:48 -0400
Subject: [PATCH] example usage queries

---
 usage_queries/debug_stackoverflow.sparql | 45 ++++++++++
 usage_queries/ensure_multiple_models.sparql | 54 ++++++++++++
 .../find_hyperparameter_distributions.sparql | 42 ++++++++++
 .../must_not_set_hyperparameters.sparql | 47 +++++++++++
 .../train_test_different_data.sparql | 82 +++++++++++++++++++
 usage_queries/type_inference.sparql | 71 ++++++++++++++++
 6 files changed, 341 insertions(+)
 create mode 100644 usage_queries/debug_stackoverflow.sparql
 create mode 100644 usage_queries/ensure_multiple_models.sparql
 create mode 100644 usage_queries/find_hyperparameter_distributions.sparql
 create mode 100644 usage_queries/must_not_set_hyperparameters.sparql
 create mode 100644 usage_queries/train_test_different_data.sparql
 create mode 100644 usage_queries/type_inference.sparql

diff --git a/usage_queries/debug_stackoverflow.sparql b/usage_queries/debug_stackoverflow.sparql
new file mode 100644
index 0000000..22be1b9
--- /dev/null
+++ b/usage_queries/debug_stackoverflow.sparql
@@ -0,0 +1,45 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+PREFIX sioc:
+PREFIX rdf:
+PREFIX python:
+
+# This query assumes that the user has a context in the program from which they are launching their search.
+# ?f ranges over the list of functions (given in the VALUES clause below) that represent the calling context.
+# The graph queried by the outer `graph` clause below is the RDF graph that contains all Stack Overflow posts and questions.
+# Other forums are in the following graphs:
+# Stats StackExchange: https://stats.stackexchange.com/,
+# Datascience StackExchange: https://datascience.stackexchange.com/,
+# Math StackExchange: https://math.stackexchange.com/
+# AI StackExchange: https://ai.stackexchange.com/
+#
+select ?q ?t ?q_content ?a_content ?c where {
+  graph {
+    {
+      # Gather the questions that are about any of the listed functions, counting the number of
+      # function hits per question (?c). VALUES is used to bind that list to ?f.
+      select ?q (count(?q) as ?c) {
+        values (?f) {
+          (python:sklearn.model_selection.train_test_split)
+          (python:sklearn.svm.SVC.fit)
+        }
+        ?q rdf:type schema:Question;
+           schema:about ?f ;
+
+      } group by ?q
+    }
+    # Gather the title and content of each question, its suggested answers and their content,
+    # keeping only answers whose content contains the phrase given in the filter below.
+    ?q schema:suggestedAnswer ?a ;
+       sioc:content ?q_content ;
+       schema:name ?t.
+    ?a rdf:type schema:Answer ;
+       sioc:content ?a_content .
+    filter(contains(?a_content, "memory issue"))
+  }
+} order by desc(?c) # questions with the highest hit count ?c come first
diff --git a/usage_queries/ensure_multiple_models.sparql b/usage_queries/ensure_multiple_models.sparql
new file mode 100644
index 0000000..cd0e4ec
--- /dev/null
+++ b/usage_queries/ensure_multiple_models.sparql
@@ -0,0 +1,54 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX graph4codeOntology:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+PREFIX rdf:
+
+
+select distinct ?g ?fit1 ?l1 ?l2 ?txt ?cls1 where {
+  graph ?g {
+
+    # Find a dataflow between a read of a dataframe (?read) and a fit call (?fit1) on some classifier.
+    ?read rdfs:label "pandas.read_csv" .
+    ?fit1 schema:about "fit" .
+    ?read graph4code:flowsTo+ ?fit1 .
+
+    # Find predecessors ?pred1 of ?fit1. (Note that we use a transitive property here
+    # to find predecessors, because every method call in Python is a read of an object to find
+    # the 'property' which is the method call.)
+    ?pred1 graph4code:flowsTo+ ?fit1 .
+    ?pred1 rdfs:label ?name1 .
+
+    # Find the location of the fit call in source (?l1 is bound to lastLine, ?l2 to firstLine) and its text (?txt).
+    ?fit1 sio:SIO_000061 ?p .
+    ?p graph4code:lastLine ?l1 .
+    ?p graph4code:firstLine ?l2 .
+    ?fit1 ?txt .
+
+    # Find the predecessors of the ?fit1 call that are classifiers. Note that one could do the same
+    # with sklearn regressors.
+    graph graph4code:docstrings {
+      ?cls1 rdfs:label ?name1 ;
+            rdfs:subClassOf* .
+    }
+
+    # Drop every result for which a different classifier class ?cls2 also reaches a fit call fed by the
+    # same ?read (?cls2 mirrors the pattern used to find ?cls1; the inner filter ensures ?cls1 != ?cls2).
+    filter not exists {
+      ?fit2 schema:about "fit" .
+      ?read graph4code:flowsTo+ ?fit2 .
+      ?pred2 graph4code:flowsTo+ ?fit2 .
+      ?pred2 rdfs:label ?name2 .
+      graph graph4code:docstrings {
+        ?cls2 rdfs:label ?name2 ;
+              rdfs:subClassOf* .
+      }
+
+      filter (?cls1 != ?cls2)
+    }
+  }
+}
\ No newline at end of file
diff --git a/usage_queries/find_hyperparameter_distributions.sparql b/usage_queries/find_hyperparameter_distributions.sparql
new file mode 100644
index 0000000..a94fe2c
--- /dev/null
+++ b/usage_queries/find_hyperparameter_distributions.sparql
@@ -0,0 +1,42 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX graph4codeOntology:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+PREFIX rdf:
+
+
+select * where {
+  graph ?g {
+    # Find all estimators in sklearn that are classifiers; for each one get its name (?name) and
+    # the names and indexes of the parameters of its __init__ method.
+    graph graph4code:docstrings {
+      ?cls rdfs:subClassOf* .
+      ?cls rdfs:label ?name .
+      ?method dc:isPartOf ?cls .
+      ?method graph4codeOntology:name_end "__init__" .
+      ?method graph4codeOntology:param ?param .
+      ?param rdfs:label ?param_name ;
+             graph4codeOntology:param_index ?param_index .
+    }
+
+    # Find a call that is the constructor of one of those classes (?cls), locate it in source
+    # (?l1 is bound to lastLine, ?l2 to firstLine) and capture the source text in ?txt.
+    ?clf rdfs:label ?name .
+    ?clf sio:SIO_000061 ?p .
+    ?p graph4code:lastLine ?l1 .
+    ?p graph4code:firstLine ?l2 .
+    ?clf ?txt .
+
+    # Find all arguments passed into the constructor whose argument name is the name of
+    # a known parameter. Positional args are ignored because it's unclear whether they match
+    # parameter names (TBD - need to fix this).
+    ?clf sio:SIO_000230 ?anon .
+    ?anon sio:SIO_000300 ?value .
+    ?anon graph4code:flowsTo ?clf .
+    ?anon sio:SIO_000116 ?param_name .
+  }
+}
\ No newline at end of file
diff --git a/usage_queries/must_not_set_hyperparameters.sparql b/usage_queries/must_not_set_hyperparameters.sparql
new file mode 100644
index 0000000..3f25a30
--- /dev/null
+++ b/usage_queries/must_not_set_hyperparameters.sparql
@@ -0,0 +1,47 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+PREFIX rdf:
+
+
+select * where {
+  graph ?g {
+    # Find all classifiers and transformers from the docstrings graph.
+    graph graph4code:docstrings {
+      {
+        ?cls rdfs:subClassOf* .
+      } UNION
+      {
+        ?cls rdfs:subClassOf* .
+      }
+      ?cls rdfs:label ?name .
+    }
+
+    # Find a call that is the constructor of one of those classes (?cls), locate it in source
+    # (?l1 is bound to lastLine, ?l2 to firstLine) and capture the source text in ?txt.
+    ?clf rdfs:label ?name .
+    ?clf sio:SIO_000061 ?p .
+    ?p graph4code:lastLine ?l1 .
+    ?p graph4code:firstLine ?l2 .
+    ?clf ?txt .
+
+    # That constructor must have some argument in a position greater than 0 (0 is the receiver object),
+    # which reflects some value being passed into the constructor call as a hyper-parameter.
+    ?clf sio:SIO_000230 ?anon .
+    ?anon sio:SIO_000613 ?v .
+    ?anon sio:SIO_000300 ?z .
+    ?anon graph4code:flowsTo ?clf .
+    filter(?v > 0)
+
+    # Exclude the case where the user is passing in parameters, but through an algorithm designed to do
+    # hyper-parameter optimization (here: something labelled "hyperopt.fmin" that flows to the constructor).
+    filter not exists {
+      ?h rdfs:label "hyperopt.fmin" ;
+         graph4code:flowsTo+ ?clf .
+    }
+  }
+}
\ No newline at end of file
diff --git a/usage_queries/train_test_different_data.sparql b/usage_queries/train_test_different_data.sparql
new file mode 100644
index 0000000..410b7e1
--- /dev/null
+++ b/usage_queries/train_test_different_data.sparql
@@ -0,0 +1,82 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX graph4codeOntology:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+PREFIX rdf:
+
+select * where {
+  graph ?g {
+
+    ?fit schema:about "fit" .
+
+    # Find predecessors ?pred1 of ?fit. (Note that we use a transitive property here
+    # to find predecessors, because every method call in Python is a read of an object to find
+    # the 'property' which is the method call.)
+    ?pred1 graph4code:flowsTo+ ?fit .
+    ?pred1 rdfs:label ?name1 .
+
+    # Find the location of the fit call in source (?l1 is bound to lastLine, ?l2 to firstLine) and its text (?txt).
+    ?fit sio:SIO_000061 ?p .
+    ?p graph4code:lastLine ?l1 .
+    ?p graph4code:firstLine ?l2 .
+    ?fit ?txt .
+
+    # Find the predecessors of the ?fit call that are classifiers. Note that one could do the same
+    # with sklearn regressors.
+    graph graph4code:docstrings {
+      ?cls1 rdfs:label ?name1 ;
+            rdfs:subClassOf* .
+    }
+
+    # Find a predict call on the same model (the same ?pred1 also flows to ?predict).
+    ?predict schema:about "predict" .
+    ?pred1 graph4code:flowsTo+ ?predict .
+
+    # Find the location of the predict call in source (?pl1 is bound to lastLine, ?pl2 to firstLine) and its text (?ptxt).
+    ?predict sio:SIO_000061 ?pp .
+    ?pp graph4code:lastLine ?pl1 .
+    ?pp graph4code:firstLine ?pl2 .
+    ?predict ?ptxt .
+
+    # Common data: ?data flows (flowsTo*) to both ?dfp and ?predict. Its lastLine is bound to ?l11 and its firstLine to ?dl2 (NOTE(review): ?l11 is probably meant to be ?dl1 - confirm).
+    ?data graph4code:flowsTo* ?dfp ;
+          graph4code:flowsTo* ?predict ;
+          ?dtxt ;
+          sio:SIO_000061 ?dp .
+    ?dp graph4code:lastLine ?l11 .
+    ?dp graph4code:firstLine ?dl2 .
+
+    ?dfp sio:SIO_000230 ?dfpa .
+    ?dfpa ?fit ;
+          sio:SIO_000613 ?dfpo .
+    filter(?dfpo > 1)
+
+    filter not exists { # NOTE(review): appears to drop results where some read (value ?readv) on the path from ?data to ?predict has no read of the same value on the path from ?data to ?fit - confirm
+      ?data graph4code:flowsTo* ?x .
+      ?x graph4code:flowsTo* ?predict .
+      ?x graph4code:read ?y .
+      ?y sio:SIO_000068 ?dc .
+      ?data graph4code:flowsTo* ?dc .
+      ?y sio:SIO_000300 ?readv .
+      filter not exists {
+        ?data graph4code:flowsTo* ?fx .
+        ?fx graph4code:flowsTo* ?fit .
+        ?fx graph4code:read ?fy .
+        ?fy sio:SIO_000068 ?fdc .
+        ?data graph4code:flowsTo* ?fdc .
+        ?fy sio:SIO_000300 ?readv .
+      }
+    }
+
+    filter not exists { # drop results where a node ?x tied to the literal "train_test_split" lies between ?data and ?predict
+      ?data graph4code:flowsTo* ?x .
+      ?x graph4code:flowsTo+ ?predict .
+      ?x "train_test_split" .
+    }
+
+  }
+}
\ No newline at end of file
diff --git a/usage_queries/type_inference.sparql b/usage_queries/type_inference.sparql
new file mode 100644
index 0000000..3135755
--- /dev/null
+++ b/usage_queries/type_inference.sparql
@@ -0,0 +1,71 @@
+PREFIX sio:
+PREFIX graph4code:
+PREFIX graph4codeOntology:
+PREFIX xsd:
+PREFIX prov:
+PREFIX schema:
+PREFIX dc:
+PREFIX rdfs:
+
+select ?n1 ?cls ?super ?l1 ?l2 ?txt ?name where {
+  graph {
+    {
+      select ?n1 ?cls where {
+        {
+          # Count everything (as ?all) that ?n1 flowsTo as a receiver (that is, as argument 0),
+          # e.g., read_csv (n1) flowsTo, say, read_csv.head (n2), read_csv.drop (n2), etc.
+          select ?n1 (count(distinct ?n2) as ?all) where {
+            ?n1 graph4code:flowsTo ?n2 .
+            ?n1 sio:SIO_000230 ?anon . # we have some anonymous node that represents an input to n1
+            ?anon sio:SIO_000613 "0"^^xsd:int . # the anon has an ordinal position of 0
+            ?anon prov:isSpecializationOf ?n2 . # anonymous node is a specialization of n2
+          } group by ?n1
+        }
+        {
+          # Again count everything (as ?ok) that ?n1 flowsTo as a receiver
+          # (that is, as argument 0), but group by each class that has a method
+          # with the right name. Thus ?ok counts all calls on ?n1 that are
+          # supported by ?cls. Continuing our example, head, drop etc. would be
+          # associated with any class (?cls) that has a method of that name
+          # (the name of the method here is ?p2).
+          #
+          # In addition to ?n1, also select the class as ?cls.
+          #
+          select ?n1 ?cls (count(distinct ?n2) as ?ok) where {
+            ?n1 graph4code:flowsTo ?n2 .
+            ?n1 sio:SIO_000230 ?anon . # we have some anonymous node that represents an input to n1
+            ?anon sio:SIO_000613 "0"^^xsd:int . # the anon has an ordinal position of 0
+            ?anon prov:isSpecializationOf ?n2 . # anonymous node is a specialization of n2
+            ?n2 schema:about ?p2 .
+            graph graph4code:docstrings {
+              ?s graph4codeOntology:name_end ?p2 ;
+                 dc:isPartOf ?cls .
+            }
+          } group by ?n1 ?cls
+        }
+        # The filter step of ?ok = ?all ensures that *every* method
+        # call made on ?n1 has a method in ?cls. This is because
+        # we counted all such calls as ?all and the ones supported by ?cls
+        # as ?ok. Since ?cls thus supports all the methods called on
+        # ?n1, it is a valid type in that context (this is known as duck
+        # typing).
+        filter(?ok = ?all)
+      }
+    }
+    ?n1 sio:SIO_000061 ?p .
+    ?p graph4code:lastLine ?l1 .
+    ?p graph4code:firstLine ?l2 .
+    ?n1 ?txt .
+  }
+
+  graph graph4code:docstrings {
+    ?cls dc:isPartOf ?module .
+    ?module rdfs:label ?name .
+  }
+  # ?imp rdfs:label ?name .
+
+
+  graph graph4code:docstrings {
+    ?cls rdfs:subClassOf* ?super .
+  }
+}
\ No newline at end of file