Skip to content

Commit

Permalink
example usage queries
Browse files Browse the repository at this point in the history
  • Loading branch information
ksrinivs64 committed May 11, 2020
1 parent 2104b87 commit b5f17fd
Show file tree
Hide file tree
Showing 6 changed files with 341 additions and 0 deletions.
45 changes: 45 additions & 0 deletions usage_queries/debug_stackoverflow.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX python: <http://purl.org/twc/graph4code/python/>

# This query assumes that the user has a context in the program from which they are launching their search.
# ?f specifies a list of functions that represent the calling context
# <https://stackoverflow.com/questions/> is the RDF graph that contains all stack overflow posts and questions
# Other forums are in the following graphs:
# Stats StackExchange: https://stats.stackexchange.com/,
# Datascience StackExchange: https://datascience.stackexchange.com/,
# Math StackExchange: https://math.stackexchange.com/
# AI StackExchange: https://ai.stackexchange.com/
#
SELECT ?q ?t ?q_content ?a_content ?c
WHERE {
  GRAPH <https://stackoverflow.com/questions/> {
    # Sub-query: questions that are about at least one function of the calling
    # context. The context is supplied as the VALUES list bound to ?f, and ?c
    # is the number of (question, function) matches found for each question.
    {
      SELECT ?q (COUNT(?q) AS ?c)
      WHERE {
        VALUES ?f {
          python:sklearn.model_selection.train_test_split
          python:sklearn.svm.SVC.fit
        }
        ?q a schema:Question ;
           schema:about ?f .
      }
      GROUP BY ?q
    }

    # For each such question fetch its title (?t), its body (?q_content) and
    # the body (?a_content) of every suggested answer.
    ?q schema:suggestedAnswer ?answer ;
       sioc:content ?q_content ;
       schema:name ?t .
    ?answer a schema:Answer ;
            sioc:content ?a_content .

    # Keep only answers whose text contains the search phrase
    # (CONTAINS is a case-sensitive substring match).
    FILTER(CONTAINS(?a_content, "memory issue"))
  }
}
# Questions that match more of the listed functions come first.
ORDER BY DESC(?c)
54 changes: 54 additions & 0 deletions usage_queries/ensure_multiple_models.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


SELECT DISTINCT ?g ?fit1 ?l1 ?l2 ?txt ?cls1
WHERE {
  GRAPH ?g {
    # A pandas.read_csv node whose result reaches, through one or more
    # flowsTo edges, a call named "fit" (?fit1).
    ?read rdfs:label "pandas.read_csv" ;
          graph4code:flowsTo+ ?fit1 .
    ?fit1 schema:about "fit" .

    # Every labelled node upstream of ?fit1. The transitive flowsTo+ is needed
    # because a Python method call is modelled as a read of the receiver object
    # followed by the call on the property that was read.
    ?upstream1 graph4code:flowsTo+ ?fit1 ;
               rdfs:label ?name1 .

    # Source location and text of the fit call.
    # Note: ?l1 is bound to the LAST line and ?l2 to the FIRST line.
    ?fit1 sio:SIO_000061 ?pos ;
          <https://schema.org/text> ?txt .
    ?pos graph4code:lastLine ?l1 ;
         graph4code:firstLine ?l2 .

    # Keep only upstream nodes whose label names a class deriving from
    # sklearn's ClassifierMixin (the same pattern would work for regressors).
    GRAPH graph4code:docstrings {
      ?cls1 rdfs:label ?name1 ;
            rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> .
    }

    # Drop the result if the same read_csv also feeds a fit call on a
    # different classifier class. ?cls2 is located exactly as ?cls1 was, so
    # what remains are programs that fit only one kind of classifier.
    FILTER NOT EXISTS {
      ?fit2 schema:about "fit" .
      ?read graph4code:flowsTo+ ?fit2 .
      ?upstream2 graph4code:flowsTo+ ?fit2 ;
                 rdfs:label ?name2 .
      GRAPH graph4code:docstrings {
        ?cls2 rdfs:label ?name2 ;
              rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> .
      }
      FILTER(?cls1 != ?cls2)
    }
  }
}
42 changes: 42 additions & 0 deletions usage_queries/find_hyperparameter_distributions.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


SELECT *
WHERE {
  GRAPH ?g {
    # From the docstrings graph: every class deriving from sklearn's
    # ClassifierMixin, its label (?name), and the name (?param_name) and
    # index (?param_index) of each parameter of its __init__ method.
    GRAPH graph4code:docstrings {
      ?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> ;
           rdfs:label ?name .
      ?method dc:isPartOf ?cls ;
              graph4codeOntology:name_end "__init__" ;
              graph4codeOntology:param ?param .
      ?param rdfs:label ?param_name ;
             graph4codeOntology:param_index ?param_index .
    }

    # In the program graph ?g: a node carrying the class label, i.e. a call to
    # that class's constructor, together with its source location and text.
    # Note: ?l1 is bound to the LAST line and ?l2 to the FIRST line.
    ?clf rdfs:label ?name ;
         sio:SIO_000061 ?p .
    ?p graph4code:lastLine ?l1 ;
       graph4code:firstLine ?l2 .
    ?clf <https://schema.org/text> ?txt .

    # Arguments of that constructor call that were passed by keyword, where the
    # keyword equals a documented parameter name; ?value is the value passed.
    # Positional arguments are ignored because it's unclear whether they line
    # up with the documented parameter names (TBD - needs fixing).
    ?clf sio:SIO_000230 ?anon .
    ?anon sio:SIO_000300 ?value ;
          graph4code:flowsTo ?clf ;
          sio:SIO_000116 ?param_name .
  }
}
47 changes: 47 additions & 0 deletions usage_queries/must_not_set_hyperparameters.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


SELECT *
WHERE {
  GRAPH ?g {
    # From the docstrings graph: every class deriving from sklearn's
    # ClassifierMixin or RegressorMixin, together with its label.
    GRAPH graph4code:docstrings {
      {
        ?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> .
      }
      UNION
      {
        ?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.RegressorMixin> .
      }
      ?cls rdfs:label ?name .
    }

    # In the program graph ?g: a node carrying the class label, i.e. a call to
    # that class's constructor, together with its source location and text.
    # Note: ?l1 is bound to the LAST line and ?l2 to the FIRST line.
    ?clf rdfs:label ?name ;
         sio:SIO_000061 ?p .
    ?p graph4code:lastLine ?l1 ;
       graph4code:firstLine ?l2 .
    ?clf <https://schema.org/text> ?txt .

    # The constructor call must receive an argument at a position greater
    # than 0 (position 0 is the receiver object), i.e. some value ?z is being
    # passed in as a hyper-parameter.
    ?clf sio:SIO_000230 ?anon .
    ?anon sio:SIO_000613 ?v ;
          sio:SIO_000300 ?z ;
          graph4code:flowsTo ?clf .
    FILTER(?v > 0)

    # Do not report calls whose inputs come from hyperopt.fmin: there the
    # parameters are being set by a hyper-parameter optimisation algorithm.
    FILTER NOT EXISTS {
      ?optimizer rdfs:label "hyperopt.fmin" ;
                 graph4code:flowsTo+ ?clf .
    }
  }
}
82 changes: 82 additions & 0 deletions usage_queries/train_test_different_data.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT *
WHERE {
  GRAPH ?g {

    ?fit schema:about "fit" .

    # Every labelled node upstream of ?fit. The transitive flowsTo+ is needed
    # because a Python method call is modelled as a read of the receiver object
    # followed by the call on the property that was read.
    ?pred1 graph4code:flowsTo+ ?fit ;
           rdfs:label ?name1 .

    # Source location and text of the fit call.
    # Note: ?l1 is bound to the LAST line and ?l2 to the FIRST line.
    ?fit sio:SIO_000061 ?p .
    ?p graph4code:lastLine ?l1 ;
       graph4code:firstLine ?l2 .
    ?fit <https://schema.org/text> ?txt .

    # Keep only upstream nodes whose label names a class deriving from
    # sklearn's ClassifierMixin (the same pattern would work for regressors).
    GRAPH graph4code:docstrings {
      ?cls1 rdfs:label ?name1 ;
            rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> .
    }

    # A "predict" call reached from the same model node ?pred1.
    ?predict schema:about "predict" .
    ?pred1 graph4code:flowsTo+ ?predict .

    # Source location and text of the predict call
    # (?pl1 = last line, ?pl2 = first line).
    ?predict sio:SIO_000061 ?pp .
    ?pp graph4code:lastLine ?pl1 ;
        graph4code:firstLine ?pl2 .
    ?predict <https://schema.org/text> ?ptxt .

    # Common data: a node ?data that reaches both ?dfp (an input of fit, see
    # below) and the predict call, with its own source text and location.
    # NOTE(review): ?l11 is probably a typo for ?dl1 (compare ?dl2); it is kept
    # because SELECT * exposes it as a result column.
    ?data graph4code:flowsTo* ?dfp ;
          graph4code:flowsTo* ?predict ;
          <https://schema.org/text> ?dtxt ;
          sio:SIO_000061 ?dp .
    ?dp graph4code:lastLine ?l11 ;
        graph4code:firstLine ?dl2 .

    # ?dfp is linked to the fit call through ?dfpa, whose ordinal position
    # ?dfpo must be greater than 1.
    ?dfp sio:SIO_000230 ?dfpa .
    ?dfpa prov:isSpecializationOf ?fit ;
          sio:SIO_000613 ?dfpo .
    FILTER(?dfpo > 1)

    # Discard ?data when some read on its way to predict (value ?readv, taken
    # from something ?data also flows into) has no counterpart with the same
    # ?readv on its way to fit.
    # NOTE(review): presumably this compares which parts of the data are
    # selected for predict vs. fit - confirm against the graph4code schema.
    FILTER NOT EXISTS {
      ?data graph4code:flowsTo* ?predStep .
      ?predStep graph4code:flowsTo* ?predict ;
                graph4code:read ?predRead .
      ?predRead sio:SIO_000068 ?predSource .
      ?data graph4code:flowsTo* ?predSource .
      ?predRead sio:SIO_000300 ?readv .
      FILTER NOT EXISTS {
        ?data graph4code:flowsTo* ?fitStep .
        ?fitStep graph4code:flowsTo* ?fit ;
                 graph4code:read ?fitRead .
        ?fitRead sio:SIO_000068 ?fitSource .
        ?data graph4code:flowsTo* ?fitSource .
        ?fitRead sio:SIO_000300 ?readv .
      }
    }

    # Discard ?data when it reaches predict by way of a train_test_split call.
    FILTER NOT EXISTS {
      ?data graph4code:flowsTo* ?split .
      ?split graph4code:flowsTo+ ?predict ;
             schema:about "train_test_split" .
    }

  }
}
71 changes: 71 additions & 0 deletions usage_queries/type_inference.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX graph4code: <http://purl.org/twc/graph4code/>
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?n1 ?cls ?super ?l1 ?l2 ?txt ?name
WHERE {
  GRAPH <http://github/samples/sample813882.py> {
    {
      SELECT ?n1 ?cls
      WHERE {
        # ?all: number of distinct nodes that ?n1 flows to as the receiver,
        # i.e. as argument 0. For example read_csv (?n1) flows to
        # read_csv.head, read_csv.drop, and so on.
        {
          SELECT ?n1 (COUNT(DISTINCT ?call) AS ?all)
          WHERE {
            ?n1 graph4code:flowsTo ?call ;
                sio:SIO_000230 ?edge .                 # anonymous node attached to ?n1
            ?edge sio:SIO_000613 "0"^^xsd:int ;        # with ordinal position 0
                  prov:isSpecializationOf ?call .      # that is a specialization of ?call
          }
          GROUP BY ?n1
        }

        # ?ok: the same count, but restricted per class ?cls to those calls
        # whose method name (?methodName) is the name of a method that is part
        # of ?cls. So ?ok is the number of calls made on ?n1 that ?cls supports;
        # in the example, head and drop are associated with every class that
        # has methods of those names.
        {
          SELECT ?n1 ?cls (COUNT(DISTINCT ?call) AS ?ok)
          WHERE {
            ?n1 graph4code:flowsTo ?call ;
                sio:SIO_000230 ?edge .                 # anonymous node attached to ?n1
            ?edge sio:SIO_000613 "0"^^xsd:int ;        # with ordinal position 0
                  prov:isSpecializationOf ?call .      # that is a specialization of ?call
            ?call schema:about ?methodName .
            GRAPH graph4code:docstrings {
              ?method graph4codeOntology:name_end ?methodName ;
                      dc:isPartOf ?cls .
            }
          }
          GROUP BY ?n1 ?cls
        }

        # ?ok = ?all means *every* method called on ?n1 exists in ?cls, so
        # ?cls is a valid type for ?n1 in this context (duck typing).
        FILTER(?ok = ?all)
      }
    }

    # Source location and text of ?n1.
    # Note: ?l1 is bound to the LAST line and ?l2 to the FIRST line.
    ?n1 sio:SIO_000061 ?pos ;
        <https://schema.org/text> ?txt .
    ?pos graph4code:lastLine ?l1 ;
         graph4code:firstLine ?l2 .
  }

  # For each candidate class: the label (?name) of what it is part of, and
  # every superclass (?super), including the class itself via the
  # zero-length path.
  GRAPH graph4code:docstrings {
    ?cls dc:isPartOf ?module ;
         rdfs:subClassOf* ?super .
    ?module rdfs:label ?name .
  }
}

0 comments on commit b5f17fd

Please sign in to comment.