Pull request update/240701 #325

Merged · 6 commits · Jul 1, 2024
14 changes: 14 additions & 0 deletions bulldozer/bulldozer_worker/tasks.py
@@ -40,6 +40,10 @@ class GoalReached(DestroyConditionException):
pass


class TaskInErrorState(Exception):
pass


class TaskState:

STARTING_PREPARING = 1
@@ -270,6 +274,16 @@ def _exec(self):

class SetFinished(Base):

def _handle_error(self, err):
if self.body["try"] < MAX_RETRIES:
self.body["try"] += 1
elif self.body["state"] == TaskState.ERROR:
raise TaskInErrorState('Task is already in ERROR state')
else:
self.body["state"] = TaskState.ERROR
self.body["reason"] = f"{str(err)}" or None
self.on_continue(self.body, self.delayed)

def update_reason(self):

reason = self.body.get("reason")
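
For context, here is a minimal standalone sketch of the bounded-retry pattern `_handle_error` implements: the task body carries a `try` counter and a `state`, retries are scheduled until `MAX_RETRIES` is exhausted, and a task already in `ERROR` raises instead of being re-parked. The constant value and the plain-function form are illustrative, not the worker's actual API:

```python
MAX_RETRIES = 5  # assumed value; the real constant lives in the worker module


class TaskState:
    ERROR = "error"  # illustrative; the worker uses numeric states


class TaskInErrorState(Exception):
    pass


def handle_error(body: dict, err: Exception) -> dict:
    """Retry up to MAX_RETRIES, then move the task to ERROR exactly once."""
    if body["try"] < MAX_RETRIES:
        body["try"] += 1  # another attempt will be scheduled by the caller
    elif body["state"] == TaskState.ERROR:
        raise TaskInErrorState("Task is already in ERROR state")
    else:
        body["state"] = TaskState.ERROR
        body["reason"] = str(err) or None  # None when the message is empty
    return body


# Usage: a task failing for the first time gets its counter bumped
body = handle_error({"try": 0, "state": "running"}, ValueError("boom"))
assert body["try"] == 1
```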
24 changes: 24 additions & 0 deletions ngui/ui/public/docs/datasets.md
@@ -0,0 +1,24 @@
### Summary

Use the Datasets Page to keep track of your machine learning datasets.
Add your datasets, ensuring you include relevant metadata such as the dataset name, description, source path, labels, and validity period. To log a specific dataset from your training code, include the Dataset ID like this: `arcee.dataset("ID")` (see the sketch below). Any datasets you register within your training code will automatically appear on this page.
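
For example, a minimal sketch of registering a dataset from training code; the token, task key, and path are placeholders, and the `arcee.init` call is assumed from the OptScale arcee client:

```python
import optscale_arcee as arcee

# placeholders: your profiling token and task key
with arcee.init("profiling_token", "task_key"):
    # the Dataset ID is typically the path to the data source
    arcee.dataset("s3://my-bucket/training-data.csv")
```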

### View

- Details: Find the dataset's source path (ID) and use it in your code.

### Actions

- Update the Content: Click the Refresh button to view the latest information.

- Add a New Dataset: Easily create a new dataset by clicking the green "Add" button. Specify the ID (path to source) and other properties.

- Manage Datasets: Use the "Actions" column icons to edit/delete the dataset.

### Tips

- Data Cleaning: Regularly clean and preprocess your data to remove noise, handle missing values, and normalize data formats. Consistent preprocessing ensures that models are trained on high-quality data, leading to better performance and reliability.

- Centralized Data Storage: Cloud storage allows easy access and sharing of datasets across teams. This promotes collaboration and ensures that everyone is working with the most up-to-date data.

- Access Controls: Implement robust access controls and permissions to ensure that only authorized users can access sensitive data. This helps in maintaining data security and compliance with privacy regulations.
2 changes: 0 additions & 2 deletions rest_api/rest_api_server/controllers/offer_breakdown.py
@@ -1,8 +1,6 @@
import logging
import json
from clickhouse_driver import Client as ClickHouseClient
from collections import defaultdict
from datetime import datetime, timedelta
from rest_api.rest_api_server.controllers.ri_breakdown import (
RiBreakdownController)
from rest_api.rest_api_server.controllers.base_async import (
43 changes: 15 additions & 28 deletions rest_api/rest_api_server/controllers/organization_gemini.py
@@ -3,18 +3,16 @@
from typing import List

from clickhouse_driver import Client as ClickHouseClient
from sqlalchemy.sql import and_, exists

from tools.optscale_exceptions.common_exc import WrongArgumentsException

from rest_api.rest_api_server.controllers.base import (
BaseController, ClickHouseMixin)
from rest_api.rest_api_server.controllers.base_async import BaseAsyncControllerWrapper
from rest_api.rest_api_server.exceptions import Err
from rest_api.rest_api_server.models.models import (
OrganizationGemini, CloudAccount)
from rest_api.rest_api_server.controllers.base_async import (
BaseAsyncControllerWrapper)
from rest_api.rest_api_server.models.models import OrganizationGemini
from rest_api.rest_api_server.utils import (
check_string_attribute, check_int_attribute, check_dict_attribute)
check_int_attribute, check_dict_attribute, check_list_attribute,
check_float_attribute
)


LOG = logging.getLogger(__name__)
@@ -33,11 +31,15 @@ def _validate_stats(**kwargs):
stats = kwargs.get('stats')
check_dict_attribute('stats', stats, allow_empty=True)
if stats:
for param in ['total_objects', 'considered_objects', 'total_size',
'might_deleted']:
for param in ['total_objects', 'filtered_objects',
'duplicated_objects']:
value = stats.get(param)
if value is not None:
check_int_attribute(param, value)
for param in ['total_size', 'duplicates_size', 'monthly_savings']:
value = stats.get(param)
if value is not None:
check_float_attribute(param, value)

def edit(self, item_id, **kwargs):
self._validate_stats(**kwargs)
@@ -135,31 +137,16 @@ def list(self, organization_id: str = None, **kwargs) -> List[OrganizationGemini
else:
return super().list()

def _validate_filters(self, organization_id, filters):
def _validate_filters(self, filters):
check_dict_attribute('filters', filters, allow_empty=True)
if filters:
if 'buckets' in filters:
check_string_attribute('buckets', filters['buckets'])
check_list_attribute('buckets', filters['buckets'])
if 'min_size' in filters:
check_int_attribute('min_size', filters['min_size'])
if 'cloud_account_id' in filters:
cloud_account_id = filters['cloud_account_id']
check_string_attribute('cloud_account_id', cloud_account_id)
cloud_account_exist = self.session.query(
exists().where(
and_(
CloudAccount.deleted.is_(False),
CloudAccount.organization_id == organization_id,
CloudAccount.id == cloud_account_id
)
)
).scalar()
if not cloud_account_exist:
raise WrongArgumentsException(
Err.OE0217, ['cloud_account_id'])

def create(self, organization_id: str, filters: dict) -> OrganizationGemini:
self._validate_filters(organization_id, filters)
self._validate_filters(filters)
return super().create(organization_id=organization_id,
filters=json.dumps(filters))

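
Taken together, the validation changes above reshape the expected payloads: `stats` now splits integer object counters from float sizes and savings, and `filters['buckets']` must be a list instead of a string. A sketch of payloads that would pass the new checks (all values are invented):

```python
# `stats` for PATCH: counters are ints, sizes/savings are floats
stats = {
    "total_objects": 120_000,     # check_int_attribute
    "filtered_objects": 45_000,
    "duplicated_objects": 3_200,
    "total_size": 512.7,          # check_float_attribute
    "duplicates_size": 14.2,
    "monthly_savings": 9.99,
}

# `filters` for POST: `buckets` is now a list (OE0385 if not)
filters = {
    "buckets": ["logs-bucket", "ml-artifacts"],  # check_list_attribute
    "min_size": 1024,                            # check_int_attribute
}
```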
27 changes: 17 additions & 10 deletions rest_api/rest_api_server/controllers/ri_breakdown.py
@@ -114,16 +114,23 @@ def get_total_stats(self, cloud_account_ids):

def get_flavors(self, cloud_account_ids):
flavor_factor_map = defaultdict(float)
expenses = self.raw_expenses_collection.distinct(
'product/instanceType', {
'cloud_account_id': {'$in': cloud_account_ids},
'start_date': {
'$gte': datetime.fromtimestamp(self.start_date),
'$lte': datetime.fromtimestamp(self.end_date)},
'lineItem/LineItemType': 'DiscountedUsage'
}
)
for flavor_name in expenses:
flavors = self.execute_clickhouse(
"""SELECT DISTINCT instance_type
FROM ri_sp_usage
WHERE cloud_account_id IN cloud_account_ids AND
date >= %(start_date)s AND date <= %(end_date)s AND
offer_type='ri'
""",
params={
'start_date': datetime.fromtimestamp(self.start_date),
'end_date': datetime.fromtimestamp(self.end_date)
},
external_tables=[{'name': 'cloud_account_ids',
'structure': [('id', 'String')],
'data': [{'id': r_id} for r_id in
cloud_account_ids]}])
flavors = [x[0] for x in flavors]
for flavor_name in flavors:
if 'db.' in flavor_name:
# RDS instances don't have normalization factor
# use 1 as default
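
The Mongo `distinct` call above is replaced by a ClickHouse query that streams the account IDs in as an external (temporary) table. A minimal sketch of the same pattern with `clickhouse_driver` directly; the host, dates, and account IDs are placeholders:

```python
from datetime import datetime

from clickhouse_driver import Client

client = Client("localhost")  # placeholder host

rows = client.execute(
    """SELECT DISTINCT instance_type
       FROM ri_sp_usage
       WHERE cloud_account_id IN cloud_account_ids
         AND date >= %(start_date)s AND date <= %(end_date)s
         AND offer_type = 'ri'""",
    params={
        "start_date": datetime(2024, 6, 1),
        "end_date": datetime(2024, 7, 1),
    },
    # the external table is sent along with the query and can be
    # referenced by name like any other table
    external_tables=[{
        "name": "cloud_account_ids",
        "structure": [("id", "String")],
        "data": [{"id": "acc_1"}, {"id": "acc_2"}],
    }],
)
flavors = [r[0] for r in rows]  # execute() returns a list of tuples
```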
39 changes: 17 additions & 22 deletions rest_api/rest_api_server/controllers/sp_breakdown.py
@@ -38,28 +38,24 @@ def get_usage_breakdown(self, start_date, end_date, cloud_account_ids):

def get_flavors(self, cloud_account_ids):
flavor_rate_map = defaultdict(float)
expenses = self.raw_expenses_collection.aggregate([
{'$match': {
'cloud_account_id': {'$in': cloud_account_ids},
'start_date': {
'$gte': datetime.fromtimestamp(self.start_date),
'$lt': datetime.fromtimestamp(self.end_date)
},
'lineItem/LineItemType': 'SavingsPlanCoveredUsage'
}},
{'$group': {
'_id': {
'instance_type': '$product/instanceType',
'description': '$lineItem/LineItemDescription'
},
'rate': {'$last': '$savingsPlan/SavingsPlanRate'}
}}
])
flavors = self.execute_clickhouse(
"""SELECT DISTINCT instance_type, sp_rate
FROM ri_sp_usage
WHERE cloud_account_id IN cloud_account_ids AND
date >= %(start_date)s AND date <= %(end_date)s AND
offer_type='sp' and sp_rate!=0
""",
params={
'start_date': datetime.fromtimestamp(self.start_date),
'end_date': datetime.fromtimestamp(self.end_date)
},
external_tables=[{'name': 'cloud_account_ids',
'structure': [('id', 'String')],
'data': [{'id': r_id} for r_id in
cloud_account_ids]}])
# todo: there could be two different rates for a flavor
for expense in expenses:
_id = expense['_id']
flavor_name = _id.get('instance_type') or _id.get('description')
sp_rate = float(expense.get('rate', 0))
for flavor in flavors:
flavor_name, sp_rate = flavor
if sp_rate:
flavor_rate_map[flavor_name] = sp_rate
return flavor_rate_map
@@ -123,7 +119,6 @@ def fill_overprovisioning(self, flavor_rate_map, cloud_account_usage,
sp_acc_date_exp[cloud_account_id][date] += overprov_exp
for cloud_acc_id, date_exp in cloud_account_usage.items():
for date, data in date_exp.items():
date_ts = int(date.timestamp())
sp_overprov_exp = sp_acc_date_exp[cloud_acc_id].get(date, 0)
data['overprovision'] = sp_overprov_exp
if 'overprovision_hrs' not in data:
2 changes: 2 additions & 0 deletions rest_api/rest_api_server/handlers/v2/organization_geminis.py
@@ -232,6 +232,7 @@ async def patch(self, gemini_id, **kwargs):
description: |
Wrong arguments:
- OE0223: Argument should be integer
- OE0466: Argument should be float
401:
description: |
Unauthorized:
@@ -536,6 +537,7 @@ async def post(self, organization_id, **kwargs):
- OE0214: Argument should be a string
- OE0217: Invalid query parameter
- OE0223: min_size should be integer
- OE0385: buckets should be a list
401:
description: |
Unauthorized
2 changes: 2 additions & 0 deletions rest_api/rest_api_server/tests/unittests/test_api_base.py
@@ -609,12 +609,14 @@ def get_csv(path):
('cloud_account_id', 'String', 'default'),
('resource_id', 'String', 'default'),
('date', 'DateTime', datetime.utcnow()),
('instance_type', 'String', ''),
('offer_id', 'String', 'default'),
('offer_type', "Enum8('ri' = 1, 'sp' = 2)", 1),
('offer_cost', 'Float64', 0),
('on_demand_cost', 'Float64', 0),
('usage', 'Float64', 0),
('ri_norm_factor', 'Float32', 0),
('sp_rate', 'Float32', 0),
('expected_cost', 'Float64', 0),
('sign', 'Int8', 1)
], self.ri_sp_usage
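
The test schema gains `instance_type` and `sp_rate`, matching the new breakdown queries. For orientation, a hedged DDL sketch assembled from this column list: only the columns come from the diff, while the engine and ordering key are assumptions (the `sign` column suggests a CollapsingMergeTree):

```python
from clickhouse_driver import Client

# Engine and ORDER BY are assumptions; the diff shows only the columns.
Client("localhost").execute("""
    CREATE TABLE IF NOT EXISTS ri_sp_usage (
        cloud_account_id String,
        resource_id String,
        date DateTime,
        instance_type String,
        offer_id String,
        offer_type Enum8('ri' = 1, 'sp' = 2),
        offer_cost Float64,
        on_demand_cost Float64,
        usage Float64,
        ri_norm_factor Float32,
        sp_rate Float32,
        expected_cost Float64,
        sign Int8
    ) ENGINE = CollapsingMergeTree(sign)
    ORDER BY (cloud_account_id, date)
""")
```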