Merge pull request #11 from muchdogesec/remove-whitelist-and-aliases

Remove whitelist and aliases
muchdogesec · Nov 13, 2024 · d2b6bd9
2 parents 791497e + 232ba4a
commit d2b6bd9
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 87 deletions.
diff --git a/dogesec_commons/stixifier/models.py b/dogesec_commons/stixifier/models.py
@@ -28,8 +28,6 @@ class Profile(models.Model):
     created = models.DateTimeField(auto_now_add=True)
     name = models.CharField(max_length=250, unique=True)
     extractions = ArrayField(base_field=models.CharField(max_length=256))
-    whitelists  = ArrayField(base_field=models.CharField(max_length=256), default=list)
-    aliases     = ArrayField(base_field=models.CharField(max_length=256), default=list)
     relationship_mode = models.CharField(choices=RelationshipMode.choices, max_length=20, default=RelationshipMode.STANDARD)
     extract_text_from_image = models.BooleanField(default=False)
     defang = models.BooleanField(help_text='If the text should be defanged before processing')

diff --git a/dogesec_commons/stixifier/serializers.py b/dogesec_commons/stixifier/serializers.py
@@ -71,16 +71,6 @@ class ProfileSerializer(serializers.ModelSerializer):
         child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'extractor', ["ai", "pattern", "lookup"])]),
         help_text="extraction id(s)",
     )
-    aliases = serializers.ListField(
-        child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'alias', ["alias"])]),
-        help_text="alias id(s)",
-        required=False,
-    )
-    whitelists = serializers.ListField(
-        child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'whitelist', ["whitelist"])]),
-        help_text="whitelist id(s)",
-        required=False,
-    )
 
     class Meta:
         model = Profile

diff --git a/dogesec_commons/stixifier/stixifier.py b/dogesec_commons/stixifier/stixifier.py
@@ -88,8 +88,7 @@ def txt2stix(self):
                 extractors_map[extractor.type][extractor.slug] = extractor
             else:
                 extractors_map[extractor.type] = {extractor.slug: extractor}
-        aliases = all_extractors(self.profile.aliases)
-        whitelists = all_extractors(self.profile.whitelists)
+
 
         bundler = txt2stixBundler(
             self.report_prop.name,
@@ -105,19 +104,17 @@ def txt2stix(self):
         )
         self.extra_data['_stixify_report_id'] = str(bundler.report.id)
         input_text = txt2stix.remove_data_images(self.output_md)
-        aliased_input = txt2stix.aliases.transform_all(aliases.values(), input_text)
-        bundler.whitelisted_values = txt2stix.lookups.merge_whitelists(whitelists.values())
 
 
         ai_extractors = [txt2stix.parse_model(model_str) for model_str in self.profile.ai_settings_extractions]
-        txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, aliased_input, ai_extractors)
+        txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, input_text, ai_extractors)
 
-        all_extracts = txt2stix.extract_all(bundler, extractors_map, aliased_input, ai_extractors=ai_extractors)
+        all_extracts = txt2stix.extract_all(bundler, extractors_map, input_text, ai_extractors=ai_extractors)
 
         if self.profile.relationship_mode == models.RelationshipMode.AI and sum(map(lambda x: len(x), all_extracts.values())):
             ai_ref_extractor = txt2stix.parse_model(self.profile.ai_settings_relationships)
-            txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, aliased_input, [ai_ref_extractor])
-            txt2stix.extract_relationships_with_ai(bundler, aliased_input, all_extracts, ai_ref_extractor)
+            txt2stix.validate_token_count(settings.INPUT_TOKEN_LIMIT, input_text, [ai_ref_extractor])
+            txt2stix.extract_relationships_with_ai(bundler, input_text, all_extracts, ai_ref_extractor)
         return bundler
 
 

diff --git a/dogesec_commons/stixifier/views.py b/dogesec_commons/stixifier/views.py
@@ -18,7 +18,7 @@
 @extend_schema_view(
     list=extend_schema(
         summary="Search profiles",
-        description="Profiles determine how txt2stix processes the text in each File. A profile consists of an extractors, aliases, and/or whitelists. You can search for existing profiles here.",
+        description="Profiles determine how txt2stix processes the text in each File. A profile consists of extractors. You can search for existing profiles here.",
         responses={400: DEFAULT_400_ERROR, 200: ProfileSerializer},
     ),
     retrieve=extend_schema(
@@ -30,12 +30,10 @@
         summary="Create a new profile",
                 description=textwrap.dedent(
             """
-            Add a new Profile that can be applied to new Files. A profile consists of extractors, aliases, and/or whitelists. You can find available extractors, aliases, and whitelists via their respective endpoints.\n\n
+            Add a new Profile that can be applied to new Files. A profile consists of extractors. You can find available extractors via their respective endpoints.\n\n
             The following key/values are accepted in the body of the request:\n\n
             * `name` (required - must be unique)
             * `extractions` (required - at least one extraction ID): can be obtained from the GET Extractors endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
-            * `whitelists` (optional): can be obtained from the GET Whitelists endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
-            * `aliases` (optional): can be obtained from the GET Whitelists endpoint. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
             * `relationship_mode` (required): either `ai` or `standard`. Required AI provider to be configured if using `ai` mode. This is a [txt2stix](https://github.com/muchdogesec/txt2stix/) setting.
             * `ai_settings_extractions` (required if AI extraction used): A list of AI providers and models to be used for extraction in format `["provider:model","provider:model"]` e.g. `["openai:gpt-4o"]`.
             * `ai_settings_relationships` (required if AI relationship used): An AI provider and models to be used for relationship generation in format `"provider:model"` e.g. `"openai:gpt-4o"`.
@@ -143,66 +141,4 @@ class ExtractorsView(txt2stixView):
     pagination_class = Pagination("extractors")
 
     def get_all(self):
-        return self.all_extractors(["lookup", "pattern", "ai"])
-
-@extend_schema_view(
-    list=extend_schema(
-        summary="Search for Whitelists",
-        description=textwrap.dedent(
-            """
-            In many cases files will have IoC extractions that are not malicious. e.g. `google.com` (and thus they don't want them to be extracted). Whitelists provide a list of values to be compared to extractions. If a whitelist value matches an extraction, that extraction is removed. To see the values used in this Whitelist, visit the URL shown as the value for the `file` key.\n\n
-            For more information see [txt2stix](https://github.com/muchdogesec/txt2stix/).
-            """
-        ),
-        responses={400: DEFAULT_400_ERROR, 200: Txt2stixExtractorSerializer},
-    ),
-    retrieve=extend_schema(
-        summary="Get a whitelist",
-        description="Get a specific Whitelist. To see the values used in this Whitelist, visit the URL shown as the value for the `file` key",
-        responses={400: DEFAULT_400_ERROR, 404: DEFAULT_404_ERROR, 200: Txt2stixExtractorSerializer},
-    ),
-)
-class WhitelistsView(txt2stixView):
-    lookup_url_kwarg = "whitelist_id"
-    openapi_path_params = [
-        OpenApiParameter(
-            lookup_url_kwarg, location=OpenApiParameter.PATH, type=OpenApiTypes.UUID, description="The `id` of the Whitelist."
-        )
-    ]
-    openapi_tags = ["Whitelists"]
-    pagination_class = Pagination("whitelists")
-
-    def get_all(self):
-        return self.all_extractors(["whitelist"])
-
-@extend_schema_view(
-    list=extend_schema(
-        summary="Search for aliases",
-        description=textwrap.dedent(
-            """
-            Aliases replace strings in the blog post with values defined in the Alias. Aliases are applied before extractions. For example, an alias of `USA` with a value `United States` will change all records of `USA` in the blog post with `United States`. To see the values used in this Alias, visit the URL shown as the value for the `file` key\n\n
-            For more information see [txt2stix](https://github.com/muchdogesec/txt2stix/).
-            """
-        ),
-        responses={400: DEFAULT_400_ERROR, 200: Txt2stixExtractorSerializer},
-    ),
-    retrieve=extend_schema(
-        summary="Get an Alias",
-        description="Get a specific Alias. To see the values used in this Alias, visit the URL shown as the value for the `file` key",
-        responses={400: DEFAULT_400_ERROR, 404: DEFAULT_404_ERROR, 200: Txt2stixExtractorSerializer},
-    ),
-)
-class AliasesView(txt2stixView):
-    openapi_tags = ["Aliases"]
-    pagination_class = Pagination("aliases")
-
-    lookup_url_kwarg = "alias_id"
-
-    openapi_path_params = [
-        OpenApiParameter(
-            lookup_url_kwarg, location=OpenApiParameter.PATH, type=OpenApiTypes.UUID, description="The `id` of the Alias."
-        )
-    ]
-
-    def get_all(self):
-        return self.all_extractors(["alias"])
+        return self.all_extractors(["lookup", "pattern", "ai"])