forked from wala/graph4code
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2104b87
commit b5f17fd
Showing
6 changed files
with
341 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX sioc: <http://rdfs.org/sioc/ns#> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
PREFIX python: <http://purl.org/twc/graph4code/python/> | ||
|
||
# This query assumes that the user has a context in the program from which they are launching their search. | ||
# ?f specifies the list of functions that represent the calling context. | ||
# <https://stackoverflow.com/questions/> is the RDF graph that contains all Stack Overflow questions and posts. | ||
# Other forums are in the following graphs: | ||
# Stats StackExchange: https://stats.stackexchange.com/ | ||
# Datascience StackExchange: https://datascience.stackexchange.com/ | ||
# Math StackExchange: https://math.stackexchange.com/ | ||
# AI StackExchange: https://ai.stackexchange.com/ | ||
# | ||
# Result columns: ?q (the question), ?t (its schema:name), ?q_content (sioc:content of the question), | ||
# ?a_content (sioc:content of one suggested answer) and ?c (hit count computed by the sub-select below). | ||
# One row is produced per matching (question, answer) pair; rows are sorted by ?c, highest first. | ||
select ?q ?t ?q_content ?a_content ?c where { | ||
graph <https://stackoverflow.com/questions/> { | ||
{ | ||
# Sub-select: gather the questions that are about any function in the ?f list and count, per question, | ||
# how many (?f, ?q) matches there are (?c). The list of functions is supplied inline through VALUES; | ||
# edit the VALUES rows to change the calling context. | ||
select ?q (count(?q) as ?c) { | ||
values (?f) { | ||
(python:sklearn.model_selection.train_test_split) | ||
(python:sklearn.svm.SVC.fit) | ||
} | ||
?q rdf:type schema:Question; | ||
schema:about ?f ; | ||
# (the trailing ';' above is permitted by the SPARQL grammar; the property list simply ends here) | ||
|
||
} group by ?q | ||
} | ||
# gather the title and content of the question, and the content of each of its suggested answers, | ||
# keeping only answers whose content contains a given phrase | ||
?q schema:suggestedAnswer ?a ; | ||
sioc:content ?q_content ; | ||
schema:name ?t. | ||
?a rdf:type schema:Answer ; | ||
sioc:content ?a_content . | ||
# the phrase is hard-coded; CONTAINS is a plain (case-sensitive) substring test | ||
filter(contains(?a_content, "memory issue")) | ||
} | ||
} order by desc(?c) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
|
||
|
||
# Finds "fit" calls that are reached by data flowing from a pandas.read_csv call and that have a | ||
# predecessor whose label is the label of an sklearn classifier class (?cls1), keeping only the cases | ||
# where no fit call reached from the same read_csv has a predecessor naming a different classifier class. | ||
# Result columns: ?g (the named graph the pattern matched in), ?fit1 (the fit call), ?l1 / ?l2 (last / first | ||
# source line of that call), ?txt (its source text), ?cls1 (the classifier class). | ||
select distinct ?g ?fit1 ?l1 ?l2 ?txt ?cls1 where { | ||
graph ?g { | ||
|
||
# find a dataflow between a read of a dataframe (?read) and a fit call (?fit1) on some classifier | ||
?read rdfs:label "pandas.read_csv" . | ||
?fit1 schema:about "fit" . | ||
?read graph4code:flowsTo+ ?fit1 . | ||
|
||
# find predecessors ?pred1 of ?fit1. A transitive path (flowsTo+) is used here because every | ||
# method call in Python is first a read of an object to find the 'property', which is the | ||
# method being called. | ||
?pred1 graph4code:flowsTo+ ?fit1 . | ||
?pred1 rdfs:label ?name1 . | ||
|
||
# find the location of the fit call in source: ?l1 is bound to its last line, ?l2 to its first line, | ||
# and ?txt to its source text | ||
?fit1 sio:SIO_000061 ?p . | ||
?p graph4code:lastLine ?l1 . | ||
?p graph4code:firstLine ?l2 . | ||
?fit1 <https://schema.org/text> ?txt . | ||
|
||
# keep only the predecessors of the ?fit1 call that are classifiers: ?name1 must be the label of a class | ||
# in the docstrings graph that is (transitively) a subclass of ClassifierMixin. Note one could do the same | ||
# with sklearn regressors. | ||
graph graph4code:docstrings { | ||
?cls1 rdfs:label ?name1 ; | ||
rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> . | ||
} | ||
|
||
# Filter out all cases where people have used different models: the pattern below mirrors the one | ||
# used to find ?cls1 (same ?read, another fit call ?fit2, a classifier ?cls2) and the match is | ||
# rejected when any such ?cls2 differs from ?cls1. | ||
filter not exists { | ||
?fit2 schema:about "fit" . | ||
?read graph4code:flowsTo+ ?fit2 . | ||
?pred2 graph4code:flowsTo+ ?fit2 . | ||
?pred2 rdfs:label ?name2 . | ||
graph graph4code:docstrings { | ||
?cls2 rdfs:label ?name2 ; | ||
rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> . | ||
} | ||
|
||
filter (?cls1 != ?cls2) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
|
||
|
||
# For calls whose label matches an sklearn classifier class, lists the arguments passed by name | ||
# whose name equals a parameter of that class's __init__, along with the call's source location and text. | ||
# select * returns every variable bound below (including ?g, the named graph the call was found in). | ||
select * where { | ||
graph ?g { | ||
# find all estimators in sklearn that are classifiers; for each one get its name (?name) and the | ||
# names (?param_name) and indexes (?param_index) of the parameters of its __init__ method | ||
graph graph4code:docstrings { | ||
?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> . | ||
?cls rdfs:label ?name . | ||
?method dc:isPartOf ?cls . | ||
?method graph4codeOntology:name_end "__init__" . | ||
?method graph4codeOntology:param ?param . | ||
?param rdfs:label ?param_name ; | ||
graph4codeOntology:param_index ?param_index . | ||
} | ||
|
||
# find a call (?clf) that is the constructor of one of the classes in ?cls (matched by label), | ||
# locate it in source (?l1 is its last line, ?l2 its first line) and capture its source text in ?txt | ||
?clf rdfs:label ?name . | ||
?clf sio:SIO_000061 ?p . | ||
?p graph4code:lastLine ?l1 . | ||
?p graph4code:firstLine ?l2 . | ||
?clf <https://schema.org/text> ?txt . | ||
|
||
# find all arguments passed into the constructor whose argument name is the same as a known | ||
# parameter name (?param_name). Positional args are ignored because it's unclear that they match | ||
# parameter names (TBD - need to fix this); ?param_index is returned but not used for matching. | ||
# ?value is whatever sio:SIO_000300 holds for the argument node (presumably the passed value - confirm). | ||
?clf sio:SIO_000230 ?anon . | ||
?anon sio:SIO_000300 ?value . | ||
?anon graph4code:flowsTo ?clf . | ||
?anon sio:SIO_000116 ?param_name . | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
|
||
|
||
# Finds constructor calls of sklearn classifiers or regressors that are given an explicit value in | ||
# an argument position greater than 0, excluding calls that hyperopt.fmin flows into. | ||
# select * returns every variable bound below (including ?g, the named graph the call was found in). | ||
select * where { | ||
graph ?g { | ||
# find all classifiers and regressors from the docstrings graph, with their names | ||
graph graph4code:docstrings { | ||
{ | ||
?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> . | ||
} UNION | ||
{ | ||
?cls rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.RegressorMixin> . | ||
} | ||
?cls rdfs:label ?name . | ||
} | ||
|
||
# find a call (?clf) that is the constructor of one of the classes in ?cls (matched by label), | ||
# locate it in source (?l1 is its last line, ?l2 its first line) and capture its source text in ?txt | ||
?clf rdfs:label ?name . | ||
?clf sio:SIO_000061 ?p . | ||
?p graph4code:lastLine ?l1 . | ||
?p graph4code:firstLine ?l2 . | ||
?clf <https://schema.org/text> ?txt . | ||
|
||
# that constructor must have some argument in a position greater than 0 (0 is the receiver object), | ||
# which reflects some value being passed into the constructor call as a hyper-parameter. | ||
# ?v is the argument position and ?z is the object of sio:SIO_000300 (presumably the passed value - confirm). | ||
?clf sio:SIO_000230 ?anon . | ||
?anon sio:SIO_000613 ?v . | ||
?anon sio:SIO_000300 ?z . | ||
?anon graph4code:flowsTo ?clf . | ||
filter(?v > 0) | ||
|
||
# exclude the case where the user is passing in parameters but through an algorithm designed to do | ||
# hyper-parameter optimization: drop any constructor call that a hyperopt.fmin node flows into | ||
# (directly or transitively) | ||
filter not exists { | ||
?h rdfs:label "hyperopt.fmin" ; | ||
graph4code:flowsTo+ ?clf . | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
|
||
# Finds a classifier on which both "fit" and "predict" are called, together with a data node that | ||
# flows into both calls, where no train_test_split sits between the data and the predict call. | ||
# NOTE(review): the overall intent appears to be flagging models that predict on the same data they | ||
# were fitted on - confirm. | ||
# select * returns every variable bound in the outer pattern (including ?g, the named graph matched). | ||
select * where { | ||
graph ?g { | ||
|
||
?fit schema:about "fit" . | ||
|
||
# find predecessors ?pred1 of ?fit. A transitive path (flowsTo+) is used here because every | ||
# method call in Python is first a read of an object to find the 'property', which is the | ||
# method being called. | ||
?pred1 graph4code:flowsTo+ ?fit . | ||
?pred1 rdfs:label ?name1 . | ||
|
||
# find the location of the fit call in source: ?l1 is its last line, ?l2 its first line, ?txt its text | ||
?fit sio:SIO_000061 ?p . | ||
?p graph4code:lastLine ?l1 . | ||
?p graph4code:firstLine ?l2 . | ||
?fit <https://schema.org/text> ?txt . | ||
|
||
# keep only the predecessors of the ?fit call that are classifiers. Note one could do the same | ||
# with sklearn regressors. | ||
graph graph4code:docstrings { | ||
?cls1 rdfs:label ?name1 ; | ||
rdfs:subClassOf* <http://purl.org/twc/graph4code/python/sklearn.base.ClassifierMixin> . | ||
} | ||
|
||
# find a predict call on the same model (the same ?pred1 also flows to it) | ||
?predict schema:about "predict" . | ||
?pred1 graph4code:flowsTo+ ?predict . | ||
|
||
# find the location of the predict call in source: ?pl1 is its last line, ?pl2 its first line, | ||
# ?ptxt its text | ||
?predict sio:SIO_000061 ?pp . | ||
?pp graph4code:lastLine ?pl1 . | ||
?pp graph4code:firstLine ?pl2 . | ||
?predict <https://schema.org/text> ?ptxt . | ||
|
||
# common data: a node ?data that flows (in zero or more steps) both to ?dfp and to ?predict, | ||
# with its source text (?dtxt) and source location (?dp) | ||
# NOTE(review): ?l11 looks like a typo for ?dl1 (compare ?dl2 on the next line); it is left unchanged | ||
# because select * exposes the variable name as a result column. | ||
?data graph4code:flowsTo* ?dfp ; | ||
graph4code:flowsTo* ?predict ; | ||
<https://schema.org/text> ?dtxt ; | ||
sio:SIO_000061 ?dp . | ||
?dp graph4code:lastLine ?l11 . | ||
?dp graph4code:firstLine ?dl2 . | ||
|
||
# tie ?dfp to the fit call: ?dfp has an argument node ?dfpa that is a specialization of ?fit | ||
# and whose ordinal position ?dfpo is greater than 1 | ||
# NOTE(review): "> 1" skips positions 0 and 1 - confirm that excluding position 1 is intended. | ||
?dfp sio:SIO_000230 ?dfpa . | ||
?dfpa <http://www.w3.org/ns/prov#isSpecializationOf> ?fit ; | ||
sio:SIO_000613 ?dfpo . | ||
filter(?dfpo > 1) | ||
|
||
# Reject the match if, on the way from ?data to ?predict, some node ?x performs a read ?y | ||
# (with value ?readv, linked by sio:SIO_000068 to a node ?dc that ?data flows to) for which there is | ||
# no corresponding read with the same ?readv on the way from ?data to ?fit. In other words, | ||
# every such read feeding predict must also occur on a path feeding fit. | ||
filter not exists { | ||
?data graph4code:flowsTo* ?x . | ||
?x graph4code:flowsTo* ?predict . | ||
?x graph4code:read ?y . | ||
?y sio:SIO_000068 ?dc . | ||
?data graph4code:flowsTo* ?dc . | ||
?y sio:SIO_000300 ?readv . | ||
filter not exists { | ||
?data graph4code:flowsTo* ?fx . | ||
?fx graph4code:flowsTo* ?fit . | ||
?fx graph4code:read ?fy . | ||
?fy sio:SIO_000068 ?fdc . | ||
?data graph4code:flowsTo* ?fdc . | ||
?fy sio:SIO_000300 ?readv . | ||
} | ||
} | ||
|
||
# Reject the match if a train_test_split node lies between ?data and ?predict | ||
filter not exists { | ||
?data graph4code:flowsTo* ?x . | ||
?x graph4code:flowsTo+ ?predict . | ||
?x <http://schema.org/about> "train_test_split" . | ||
} | ||
|
||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
PREFIX sio: <http://semanticscience.org/resource/> | ||
PREFIX graph4code: <http://purl.org/twc/graph4code/> | ||
PREFIX graph4codeOntology: <http://purl.org/twc/graph4code/ontology/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX prov: <http://www.w3.org/ns/prov#> | ||
PREFIX schema: <http://schema.org/> | ||
PREFIX dc: <http://purl.org/dc/terms/> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
|
||
# Duck-typing query: for each node ?n1 in one (hard-coded) sample program graph, finds every class ?cls | ||
# that has a method for every call made with ?n1 as receiver, then reports the node's source location | ||
# and text, the label ?name of what ?cls is part of, and every (transitive) superclass ?super of ?cls. | ||
# Result columns: ?n1, ?cls, ?super, ?l1 / ?l2 (last / first source line of ?n1), ?txt, ?name. | ||
# Change the graph IRI below to run the query against a different program. | ||
select ?n1 ?cls ?super ?l1 ?l2 ?txt ?name where { | ||
graph <http://github/samples/sample813882.py> { | ||
{ | ||
select ?n1 ?cls where { | ||
{ | ||
# count everything (as ?all) that n1 flowsTo as a receiver (that is as argument 0) | ||
# e.g., read_csv (n1) flowsTo say read_csv.head (n2), read_csv.drop (n2), etc. | ||
select ?n1 (count(distinct ?n2) as ?all) where { | ||
?n1 graph4code:flowsTo ?n2 . | ||
?n1 sio:SIO_000230 ?anon . # we have some anonymous node that represents an input to n1 | ||
?anon sio:SIO_000613 "0"^^xsd:int . # the anon has an ordinal position of 0 | ||
?anon prov:isSpecializationOf ?n2 . # anonymous node is a specialization of n2 | ||
} group by ?n1 | ||
} | ||
{ | ||
# again count everything (as ?ok) that n1 flowsTo as a receiver | ||
# (that is as argument 0) but group by each class that has a method | ||
# with the right name. Thus ?ok counts all calls on ?n1 that are | ||
# supported by ?cls. Continuing our example, head, drop etc would be | ||
# associated with any class (?cls) that had the method | ||
# (the name of the method here is ?p2). | ||
# | ||
# In addition to ?n1, also select the class as ?cls. | ||
# | ||
select ?n1 ?cls (count(distinct ?n2) as ?ok) where { | ||
?n1 graph4code:flowsTo ?n2 . | ||
?n1 sio:SIO_000230 ?anon . # we have some anonymous node that represents an input to n1 | ||
?anon sio:SIO_000613 "0"^^xsd:int . # the anon has an ordinal position of 0 | ||
?anon prov:isSpecializationOf ?n2 . # anonymous node is a specialization of n2 | ||
?n2 schema:about ?p2 . | ||
graph graph4code:docstrings { | ||
?s graph4codeOntology:name_end ?p2 ; | ||
dc:isPartOf ?cls . | ||
} | ||
} group by ?n1 ?cls | ||
} | ||
# The two sub-selects are joined on ?n1. The filter step of ?ok = ?all ensures | ||
# that *every* method call made on ?n1 has a method in ?cls. This is because | ||
# we counted all such calls as ?all and ones supported by ?cls | ||
# as ?ok. Since ?cls thus supports all the methods called on | ||
# ?n1, it is a valid type in that context (this is known as duck | ||
# typing) | ||
filter(?ok = ?all) | ||
} | ||
} | ||
# locate ?n1 in source: ?l1 is its last line, ?l2 its first line, ?txt its source text | ||
?n1 sio:SIO_000061 ?p . | ||
?p graph4code:lastLine ?l1 . | ||
?p graph4code:firstLine ?l2 . | ||
?n1 <https://schema.org/text> ?txt . | ||
} | ||
|
||
# ?name is the label of whatever ?cls is part of (bound to ?module) | ||
graph graph4code:docstrings { | ||
?cls dc:isPartOf ?module . | ||
?module rdfs:label ?name . | ||
} | ||
# ?imp rdfs:label ?name . | ||
|
||
|
||
# one row per (transitive, including ?cls itself) superclass of the candidate class | ||
graph graph4code:docstrings { | ||
?cls rdfs:subClassOf* ?super . | ||
} | ||
} |