Skip to content

Commit

Permalink
Merge pull request #11 from muchdogesec/remove-whitelist-and-aliases
Browse files (browse the repository at this point in the history)
Remove whitelist and aliases
  • Loading branch information
himynamesdave authored Nov 13, 2024
2 parents 791497e + 232ba4a commit d2b6bd9
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 87 deletions.
2 changes: 0 additions & 2 deletions dogesec_commons/stixifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ class Profile(models.Model):
created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=250, unique=True)
extractions = ArrayField(base_field=models.CharField(max_length=256))
whitelists = ArrayField(base_field=models.CharField(max_length=256), default=list)
aliases = ArrayField(base_field=models.CharField(max_length=256), default=list)
relationship_mode = models.CharField(choices=RelationshipMode.choices, max_length=20, default=RelationshipMode.STANDARD)
extract_text_from_image = models.BooleanField(default=False)
defang = models.BooleanField(help_text='If the text should be defanged before processing')
Expand Down
10 changes: 0 additions & 10 deletions dogesec_commons/stixifier/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,6 @@ class ProfileSerializer(serializers.ModelSerializer):
child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'extractor', ["ai", "pattern", "lookup"])]),
help_text="extraction id(s)",
)
aliases = serializers.ListField(
child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'alias', ["alias"])]),
help_text="alias id(s)",
required=False,
)
whitelists = serializers.ListField(
child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'whitelist', ["whitelist"])]),
help_text="whitelist id(s)",
required=False,
)

class Meta:
model = Profile
Expand Down
13 changes: 5 additions & 8 deletions dogesec_commons/stixifier/stixifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ def txt2stix(self):
extractors_map[extractor.type][extractor.slug] = extractor
else:
extractors_map[extractor.type] = {extractor.slug: extractor}
aliases = all_extractors(self.profile.aliases)
whitelists = all_extractors(self.profile.whitelists)


bundler = txt2stixBundler(
self.report_prop.name,
Expand All @@ -105,19 +104,17 @@ def txt2stix(self):
)
self.extra_data['_stixify_report_id'] = str(bundler.report.id)
input_text = txt2stix.remove_data_images(self.output_md)
aliased_input = txt2stix.aliases.transform_all(aliases.values(), input_text)
bundler.whitelisted_values = txt2stix.lookups.merge_whitelists(whitelists.values())


ai_extractors = [txt2stix.parse_model(model_str) for model_str in self.profile.ai_settings_extractions]
txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, aliased_input, ai_extractors)
txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, input_text, ai_extractors)

all_extracts = txt2stix.extract_all(bundler, extractors_map, aliased_input, ai_extractors=ai_extractors)
all_extracts = txt2stix.extract_all(bundler, extractors_map, input_text, ai_extractors=ai_extractors)

if self.profile.relationship_mode == models.RelationshipMode.AI and sum(map(lambda x: len(x), all_extracts.values())):
ai_ref_extractor = txt2stix.parse_model(self.profile.ai_settings_relationships)
txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, aliased_input, [ai_ref_extractor])
txt2stix.extract_relationships_with_ai(bundler, aliased_input, all_extracts, ai_ref_extractor)
txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, input_text, [ai_ref_extractor])
txt2stix.extract_relationships_with_ai(bundler, input_text, all_extracts, ai_ref_extractor)
return bundler


Expand Down
70 changes: 3 additions & 67 deletions dogesec_commons/stixifier/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
@extend_schema_view(
list=extend_schema(
summary="Search profiles",
description="Profiles determine how txt2stix processes the text in each File. A profile consists of an extractors, aliases, and/or whitelists. You can search for existing profiles here.",
description="Profiles determine how txt2stix processes the text in each File. A profile consists of extractors. You can search for existing profiles here.",
responses={400: DEFAULT_400_ERROR, 200: ProfileSerializer},
),
retrieve=extend_schema(
Expand All @@ -30,12 +30,10 @@
summary="Create a new profile",
description=textwrap.dedent(
"""
Add a new Profile that can be applied to new Files. A profile consists of extractors, aliases, and/or whitelists. You can find available extractors, aliases, and whitelists via their respective endpoints.\n\n
Add a new Profile that can be applied to new Files. A profile consists of extractors. You can find available extractors via their respective endpoints.\n\n
The following key/values are accepted in the body of the request:\n\n
* `name` (required - must be unique)
* `extractions` (required - at least one extraction ID): can be obtained from the GET Extractors endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
* `whitelists` (optional): can be obtained from the GET Whitelists endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
* `aliases` (optional): can be obtained from the GET Whitelists endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
* `relationship_mode` (required): either `ai` or `standard`. Required AI provider to be configured if using `ai` mode. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
* `ai_settings_extractions` (required if AI extraction used): A list of AI providers and models to be used for extraction in format `["provider:model","provider:model"]` e.g. `["openai:gpt-4o"]`.
* `ai_settings_relationships` (required if AI relationship used): An AI provider and models to be used for relationship generation in format `"provider:model"` e.g. `"openai:gpt-4o"`.
Expand Down Expand Up @@ -143,66 +141,4 @@ class ExtractorsView(txt2stixView):
pagination_class = Pagination("extractors")

def get_all(self):
return self.all_extractors(["lookup", "pattern", "ai"])

@extend_schema_view(
list=extend_schema(
summary="Search for Whitelists",
description=textwrap.dedent(
"""
In many cases files will have IoC extractions that are not malicious. e.g. `google.com` (and thus they don't want them to be extracted). Whitelists provide a list of values to be compared to extractions. If a whitelist value matches an extraction, that extraction is removed. To see the values used in this Whitelist, visit the URL shown as the value for the `file` key.\n\n
For more information see [txt2stix](https://github.com/muchdogesec/txt2stix/).
"""
),
responses={400: DEFAULT_400_ERROR, 200: Txt2stixExtractorSerializer},
),
retrieve=extend_schema(
summary="Get a whitelist",
description="Get a specific Whitelist. To see the values used in this Whitelist, visit the URL shown as the value for the `file` key",
responses={400: DEFAULT_400_ERROR, 404: DEFAULT_404_ERROR, 200: Txt2stixExtractorSerializer},
),
)
class WhitelistsView(txt2stixView):
lookup_url_kwarg = "whitelist_id"
openapi_path_params = [
OpenApiParameter(
lookup_url_kwarg, location=OpenApiParameter.PATH, type=OpenApiTypes.UUID, description="The `id` of the Whitelist."
)
]
openapi_tags = ["Whitelists"]
pagination_class = Pagination("whitelists")

def get_all(self):
return self.all_extractors(["whitelist"])

@extend_schema_view(
list=extend_schema(
summary="Search for aliases",
description=textwrap.dedent(
"""
Aliases replace strings in the blog post with values defined in the Alias. Aliases are applied before extractions. For example, an alias of `USA` with a value `United States` will change all records of `USA` in the blog post with `United States`. To see the values used in this Alias, visit the URL shown as the value for the `file` key\n\n
For more information see [txt2stix](https://github.com/muchdogesec/txt2stix/).
"""
),
responses={400: DEFAULT_400_ERROR, 200: Txt2stixExtractorSerializer},
),
retrieve=extend_schema(
summary="Get an Alias",
description="Get a specific Alias. To see the values used in this Alias, visit the URL shown as the value for the `file` key",
responses={400: DEFAULT_400_ERROR, 404: DEFAULT_404_ERROR, 200: Txt2stixExtractorSerializer},
),
)
class AliasesView(txt2stixView):
openapi_tags = ["Aliases"]
pagination_class = Pagination("aliases")

lookup_url_kwarg = "alias_id"

openapi_path_params = [
OpenApiParameter(
lookup_url_kwarg, location=OpenApiParameter.PATH, type=OpenApiTypes.UUID, description="The `id` of the Alias."
)
]

def get_all(self):
return self.all_extractors(["alias"])
return self.all_extractors(["lookup", "pattern", "ai"])

0 comments on commit d2b6bd9

Please sign in to comment.