From 4d3e7460b8b000dcbf2b4a1d6a8cd641690775cb Mon Sep 17 00:00:00 2001
From: Chris Deptula Defaults to "-1s".
*/
- PinSchemaTimeout;
+ PinSchemaTimeout,
+
+ /**
+ * The "AggregateScanSchema" property is the name of the database schema
+ * to scan when looking for aggregate tables. If defined, Mondrian will
+ * only look for aggregate tables within this schema. This means that
+ * all aggregate tables, including explicitly defined tables, must be in
+ * this schema. If not defined, Mondrian will scan every schema that
+ * the database connection has access to when looking for aggregate
+ * tables.
+ */
+ AggregateScanSchema,
+
+ /**
+ * The "AggregateScanCatalog" property is the name of the database
+ * catalog to scan when looking for aggregate tables. If defined,
+ * Mondrian will only look for aggregate tables within this catalog.
+ * This means that all aggregate tables, including explicitly defined
+ * tables, must be in this catalog. If not defined, Mondrian will
+ * scan every catalog the database connection has access to when
+ * looking for aggregate tables.
+ */
+ AggregateScanCatalog;
/**
* Any property beginning with this value will be added to the
diff --git a/mondrian/src/main/java/mondrian/rolap/RolapSchema.java b/mondrian/src/main/java/mondrian/rolap/RolapSchema.java
index 5dc534cd92..c4364591aa 100644
--- a/mondrian/src/main/java/mondrian/rolap/RolapSchema.java
+++ b/mondrian/src/main/java/mondrian/rolap/RolapSchema.java
@@ -5,12 +5,13 @@
// You must accept the terms of that agreement to use this software.
//
// Copyright (C) 2001-2005 Julian Hyde
-// Copyright (C) 2005-2017 Hitachi Vantara and others
+// Copyright (C) 2005-2019 Hitachi Vantara and others
// All Rights Reserved.
*/
package mondrian.rolap;
import mondrian.olap.*;
+import mondrian.olap.Util.PropertyList;
import mondrian.olap.fun.*;
import mondrian.olap.type.*;
import mondrian.resource.MondrianResource;
@@ -211,7 +212,7 @@ private RolapSchema(
DataSource dataSource)
{
this(key, connectInfo, dataSource, md5Bytes, md5Bytes != null);
- load(catalogUrl, catalogStr);
+ load(catalogUrl, catalogStr, connectInfo);
assert this.md5Bytes != null;
}
@@ -293,14 +294,29 @@ protected Logger getLogger() {
return LOGGER;
}
+ /**
+ * @deprecated Use {@link #load(String, String, PropertyList)} instead
+ * @param catalogUrl URL of catalog
+ * @param catalogStr Text of catalog, or null
+ */
+ @Deprecated
+ protected void load(String catalogUrl, String catalogStr) {
+ load(catalogUrl, catalogStr, new PropertyList());
+ }
+
/**
* Method called by all constructors to load the catalog into DOM and build
* application mdx and sql objects.
*
* @param catalogUrl URL of catalog
* @param catalogStr Text of catalog, or null
+ * @param connectInfo Mondrian connection properties
*/
- protected void load(String catalogUrl, String catalogStr) {
+ protected void load(
+ String catalogUrl,
+ String catalogStr,
+ PropertyList connectInfo)
+ {
try {
final Parser xmlParser = XOMUtil.createDefaultParser();
@@ -370,7 +386,7 @@ protected void load(String catalogUrl, String catalogStr) {
throw Util.newError(e, "while parsing catalog " + catalogUrl);
}
- aggTableManager.initialize();
+ aggTableManager.initialize(connectInfo);
setSchemaLoadDate();
}
diff --git a/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggGen.java b/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggGen.java
index 4b1103d449..93e2ed7644 100644
--- a/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggGen.java
+++ b/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggGen.java
@@ -5,10 +5,9 @@
// You must accept the terms of that agreement to use this software.
//
// Copyright (C) 2005-2005 Julian Hyde
-// Copyright (C) 2005-2017 Hitachi Vantara and others
+// Copyright (C) 2005-2019 Hitachi Vantara and others
// All Rights Reserved.
*/
-
package mondrian.rolap.aggmatcher;
import mondrian.olap.MondrianDef;
@@ -157,7 +156,7 @@ protected void addForeignKeyToNotLostColumnUsages(
private void init() {
JdbcSchema db = JdbcSchema.makeDB(star.getDataSource());
try {
- db.load();
+ db.load(new Util.PropertyList());
} catch (SQLException ex) {
getLogger().error(ex);
return;
@@ -467,7 +466,7 @@ private boolean addCollapsedColumn(
return false;
}
- //CG guarantee the columns has been loaded before looking up them
+ // CG guarantee the columns have been loaded before looking them up
try {
jt.load();
} catch (SQLException sqle) {
diff --git a/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggTableManager.java b/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggTableManager.java
index 5cc31625c9..e8dd606d0d 100644
--- a/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggTableManager.java
+++ b/mondrian/src/main/java/mondrian/rolap/aggmatcher/AggTableManager.java
@@ -5,7 +5,7 @@
// You must accept the terms of that agreement to use this software.
//
// Copyright (C) 2005-2005 Julian Hyde
-// Copyright (C) 2005-2017 Hitachi Vantara and others
+// Copyright (C) 2005-2019 Hitachi Vantara and others
// All Rights Reserved.
*/
package mondrian.rolap.aggmatcher;
@@ -14,6 +14,7 @@
import mondrian.olap.MondrianException;
import mondrian.olap.MondrianProperties;
import mondrian.olap.Util;
+import mondrian.olap.Util.PropertyList;
import mondrian.recorder.ListRecorder;
import mondrian.recorder.MessageRecorder;
import mondrian.recorder.RecorderException;
@@ -38,7 +39,7 @@
* Unlike many OLAP servers, Mondrian does not store data on disk: it just works on
-the data in the RDBMS, and once it has read a piece of data once, it stores that
-data in its cache. This greatly simplifies the process of installing Mondrian,
-but it puts limits on Mondrian's performance when Mondrian is applied to a huge
+ Unlike many OLAP servers, Mondrian does not store data on disk: it just works on
+the data in the RDBMS, and once it has read a piece of data once, it stores that
+data in its cache. This greatly simplifies the process of installing Mondrian,
+but it puts limits on Mondrian's performance when Mondrian is applied to a huge
dataset. Consider what happens when the CEO runs her Sales Report first thing on a Monday
-morning. This report contains a single number: the total sales of all products,
-in all regions, this year. In order to get this number, Mondrian generates a
+ Consider what happens when the CEO runs her Sales Report first thing on a Monday
+morning. This report contains a single number: the total sales of all products,
+in all regions, this year. In order to get this number, Mondrian generates a
query something like this: and sends it to the DBMS. The DBMS takes several minutes to execute it: which is
-understandable because the DBMS has to read all of this year's records in the
-fact table (a few million sales, say) and aggregate them into a single total.
-Clearly, what is needed in this case, and in others like it, is a pre-computed
+ and sends it to the DBMS. The DBMS takes several minutes to execute it: which is
+understandable because the DBMS has to read all of this year's records in the
+fact table (a few million sales, say) and aggregate them into a single total.
+Clearly, what is needed in this case, and in others like it, is a pre-computed
summary of the data: an aggregate table. An aggregate table coexists with the base fact table,
+ An aggregate table coexists with the base fact table,
and contains pre-aggregated measures built from the
-fact table. It is registered in Mondrian's schema, so that Mondrian can choose
+fact table. It is registered in Mondrian's schema, so that Mondrian can choose
whether to use the aggregate table rather than the fact table, if it is
applicable for a particular query.Contents
1. Introduction
-
@@ -85,15 +89,15 @@
-1. Introduction
AND time.year = 2005
1. Introduction
There is extensive research, both empirical and theoretical, available
on the web concerning different ways to structure aggregate tables and we will not attempt to duplicate any of it here.
-
The star schema has a single fact table Sales
, two measure
-columns (units
and dollars
) and four dimension tables
-(Product
, Mfr
, Customer
, Time
,
+
The star schema has a single fact table Sales
, two measure
+columns (units
and dollars
) and four dimension tables
+(Product
, Mfr
, Customer
, Time
,
and Customer
).
On top of this star schema, we create the following multidimensional model:
@@ -130,19 +134,19 @@[Customer]
has levels [All Customers]
,
[State]
, [City]
, [Custid]
[Payment Method]
has levels [All Payment
+ Dimension [Payment Method]
has levels [All Payment
Methods]
, [Payment Method]
-Most of the dimensions have a corresponding dimension table, but there are
-two exceptions. The [Product]
dimension is a snowflake
-dimension, which means that it is spread across more than one table (in
-this case Product
and Mfr
). The [Payment Method]
dimension
+
Most of the dimensions have a corresponding dimension table, but there are
+two exceptions. The [Product]
dimension is a snowflake
+dimension, which means that it is spread across more than one table (in
+this case Product
and Mfr
). The [Payment Method]
dimension
is a degenerate dimension; its sole attribute is the
-payment
column in the fact table, and so it does not need a dimension
+payment
column in the fact table, and so it does not need a dimension
table.
-
@@ -156,17 +160,17 @@ Time
dimension has been "collapsed" into the aggregate
+ Time
dimension has been "collapsed" into the aggregate
table, omitting the month
and day
columns.Product
dimension has been
+ Product
dimension has been
"collapsed" into the aggregate table.units
, dollars
),
- there are one or more measure columns in the aggregate table (sum units
, min
+ - For each measure column in the fact table (
units
, dollars
),
+ there are one or more measure columns in the aggregate table (sum units
, min
units
, max units
, sum dollars
).
- - There is also a measure column,
row count
, representing the
+ - There is also a measure column,
row count
, representing the
"count" measure.
<Cube name="Sales">
<Table name="sales">
- <AggName
+ <AggName
name="agg_1" ... />
- <AggName
+ <AggName
name="agg_2">
<AggFactCount column="row count"/>
- <AggForeignKey factColumn="prodid"
+ <AggForeignKey factColumn="prodid"
aggColumn="prodid"/>
- <AggMeasure name="[Measures].[Unit
+ <AggMeasure name="[Measures].[Unit
Sales]" column="sum units"/>
- <AggMeasure
+ <AggMeasure
name="[Measures].[Min Units]" column="min units"/>
- <AggMeasure name="[Measures].[Max
+ <AggMeasure name="[Measures].[Max
Units]" column="max units"/>
- <AggMeasure name="[Measures].[Dollar
+ <AggMeasure name="[Measures].[Dollar
Sales]" column="sum dollars"/>
- <AggLevel name="[Time].[Year]"
+ <AggLevel name="[Time].[Year]"
column="year"/>
- <AggLevel name="[Time].[Quarter]"
+ <AggLevel name="[Time].[Quarter]"
column="quarter"/>
- <AggLevel name="[Time].[Month]"
+ <AggLevel name="[Time].[Month]"
column="month"/>
- <AggLevel name="[Payment
- Method].[Payment Method]"
+ <AggLevel name="[Payment
+ Method].[Payment Method]"
column="payment"/>
- <AggLevel name="[Customer].[State]"
+ <AggLevel name="[Customer].[State]"
column="state"/>
</AggName>
</Table>
<Dimension name="Product">
- <Hierarchy hasAll="true"
+ <Hierarchy hasAll="true"
primaryKey="prodid" primaryKeyTable="Product">
- <Join leftKey="mfrid"
+ <Join leftKey="mfrid"
rightKey="mfrid">
- <Table
+ <Table
name="Product"/>
- <Table
+ <Table
name="Mfr"/>
</Join>
- <Level
+ <Level
name="Manufacturer" table="Mfr" column="mfrid"/>
- <Level
+ <Level
name="Brand" table="Product" column="brand"/>
- <Level
+ <Level
name="Name" table="Product" column="prodid"/>
</Hierarchy>
@@ -292,19 +296,19 @@ 2.2 Another aggregate table
Several dimensions have been collapsed: [Time]
at the
-[Quarter]
level; [Customer]
at the [State]
-level; and [Payment Method]
at the [Payment Method]
+[Quarter]
level; [Customer]
at the [State]
+level; and [Payment Method]
at the [Payment Method]
-level. But the [Product]
dimension has been retained in its
+level. But the [Product]
dimension has been retained in its
original snowflake form.
-The <AggForeignKey>
element is
-used to declare that the column prodid
links to the dimension
-table, but all other columns remain in the Product
and Mfr
+
The <AggForeignKey>
element is
+used to declare that the column prodid
links to the dimension
+table, but all other columns remain in the Product
and Mfr
dimension tables.
<Cube name="Sales">
<Table name="sales">
- <AggName
+ <AggName
name="agg_3">
<AggFactCount column="cnt"/>
- <AggMeasure name="[Measures].[Unit
+ <AggMeasure name="[Measures].[Unit
Sales]" column="sls"/>
- <AggLevel name="[Time].[Year]"
+ <AggLevel name="[Time].[Year]"
column="yer"/>
- <AggLevel name="[Time].[Quarter]"
+ <AggLevel name="[Time].[Quarter]"
column="qtr"/>
- <AggLevel name="[Time].[Month]"
+ <AggLevel name="[Time].[Month]"
column="mth"/>
- <AggLevel name="[Channel.Network].[Brand]"
+ <AggLevel name="[Channel.Network].[Brand]"
column="brn" collapsed="false"/>
</AggName>
- <AggName
+ <AggName
name="agg_3">
<AggFactCount column="cnt"/>
- <AggMeasure name="[Measures].[Unit
+ <AggMeasure name="[Measures].[Unit
Sales]" column="sls"/>
- <AggLevel name="[Time].[Year]"
+ <AggLevel name="[Time].[Year]"
column="yer"/>
- <AggLevel name="[Time].[Quarter]"
+ <AggLevel name="[Time].[Quarter]"
column="qtr"/>
- <AggLevel name="[Time].[Month]"
+ <AggLevel name="[Time].[Month]"
column="mth"/>
- <AggLevel name="[Channel.Distributor].[Brand]"
+ <AggLevel name="[Channel.Distributor].[Brand]"
column="brn" collapsed="false"/>
</AggName>
@@ -369,72 +373,72 @@ 2.3 Non Collapsed Aggregate Levels&
<Dimension name="Channel">
- <Hierarchy hasAll="true"
+ <Hierarchy hasAll="true"
name="Network" primaryKey="prod" primaryKeyTable="prod">
- <Join leftKey="brn"
+ <Join leftKey="brn"
rightKey="brn" rightAlias="brn_mfr">
- <Table
+ <Table
name="prod"/>
- <Join leftKey="brn"
+ <Join leftKey="brn"
rightKey="brn" rightAlias="brn_mfr">
- <Table
+ <Table
name="brn_mfr"/>
- <Join leftKey="mfr"
+ <Join leftKey="mfr"
rightKey="mfr">
- <Table
+ <Table
name="brn_mfr"/>
- <Table
+ <Table
name="mfr_net"/>
</Join>
</Join>
</Join>
- <Level
+ <Level
name="Network" table="mrf_net" column="net"/>
- <Level
+ <Level
name="Manufacturer" table="mfr_brn" column="brn"/>
- <Level
+ <Level
name="Brand" table="brn_mfr" column="brn"/>
- <Level
+ <Level
name="Product" table="prd" column="brd"/>
</Hierarchy>
- <Hierarchy hasAll="true"
+ <Hierarchy hasAll="true"
name="Distributor" primaryKey="prod" primaryKeyTable="prod">
- <Join leftKey="brn"
+ <Join leftKey="brn"
rightKey="brn" rightAlias="brn_mfr">
- <Table
+ <Table
name="prod"/>
- <Join leftKey="brn"
+ <Join leftKey="brn"
rightKey="brn" rightAlias="brn_mfr">
- <Table
+ <Table
name="brn_mfr"/>
- <Join leftKey="mfr"
+ <Join leftKey="mfr"
rightKey="mfr">
- <Table
+ <Table
name="brn_mfr"/>
- <Table
+ <Table
name="mfr_dist"/>
</Join>
</Join>
</Join>
- <Level
+ <Level
name="Distributor" table="mrf_dist" column="dist"/>
- <Level
+ <Level
name="Manufacturer" table="mfr_brn" column="brn"/>
- <Level
+ <Level
name="Brand" table="brn_mfr" column="brn"/>
- <Level
+ <Level
name="Product" table="prd" column="brd"/>
</Hierarchy>
@@ -466,7 +470,7 @@ 2.3 Non Collapsed Aggregate Levels&
As with regular AggLevel
elements, it is not necessary to include the bottom
levels of the hierarchies. In the example above, we have ommitted the last level, [Product]
-
@@ -501,17 +505,17 @@ 3. Defining aggregate tables
Mondrian supports two aggregation techniques which are called "lost"
dimension and "collapsed" dimension. For the creation of any
-given aggregate table these can
+given aggregate table these can
be applied independently to any number of different dimensions.
A "lost" dimension is one which is completely missing from the aggregate
-table. The measures that appear in the table have been aggregated
+table. The measures that appear in the table have been aggregated
across all values of the lost dimension. As an example, in a fact table
with dimensions of time, location, and product and measure sales, for an
aggregate table that did not have the location dimension that
dimension would be "lost". Here, the sales measure would be the aggregation
over all locations. An aggregate table where all of the dimensions
-are lost is possible - it would have a single row with the measure
+are lost is possible - it would have a single row with the measure
aggregated over everything - sales for all time, all locations and all
products.
@@ -544,16 +548,16 @@ 3. Defining aggregate tables
product_id/location_id pair (a given product was sold at a given
location at 5 different times).
-
The second supported aggregation technique provides a finer level of
+
The second supported aggregation technique provides a finer level of
control, the "collapsed" dimension technique.
-Recall that the dimension key in the fact table refers (more or less)
-to the
+Recall that the dimension key in the fact table refers (more or less)
+to the
lowest level in the dimension hierarchy.
For a collapsed dimension, the dimension key in the aggregate
table is replaced with a set of dimension levels; the dimension key
column is replaced with a set of columns; a fully denormalized
summary table for that dimension.
-As an example, if the time dimension with base fact table foreign key
+As an example, if the time dimension with base fact table foreign key
time_id had the levels: day, month, quarter and
year, and in an aggregate it was collapsed to the month level, then
the aggregate table would not have a time_id column but rather
@@ -586,7 +590,7 @@
3. Defining aggregate tables
In the literature, there are other ways of creating aggregate tables
but they are not supported by Mondrian at this time.
-
@@ -599,13 +603,13 @@ 4. Building aggregate tables
aggregate table definition, one can estimate that for a dimension
with N levels, there are N+1 possible aggregate tables (N collapsed and
1 lost). Also, dimensions (with different dimension tables) can
-be aggregated independently.
-For the FoodMart Sales cube there are 1400 different possible aggregate
+be aggregated independently.
+For the FoodMart Sales cube there are 1400 different possible aggregate
tables.
Clearly, one does not want to create all possible aggregate tables.
Which ones to create depends upon two considerations. The first
-consideration is application dependent:
+consideration is application dependent:
the nature of the MDX queries that will be executed.
If many of the queries deal with per month and per state questions,
then an aggregate at those levels might be created.
@@ -614,15 +618,15 @@
4. Building aggregate tables
the lowest level to the next lowest generally gives greater
bang for the buck than aggregating from the N to the N+1 (N>1) level.
This is because 1) a first level aggregation can be used for all
-queries at that level and above and 2) dimension fan-out tends to
+queries at that level and above and 2) dimension fan-out tends to
increase for the lower levels.
Of course, your mileage may vary.
-In a sense, picking which aggregate tables to build is analogous to
+
In a sense, picking which aggregate tables to build is analogous to
picking which indexes to build on a table; it is application
dependent and experience helps.
-The hardest part about the actually creation and population of
+
The hardest part about the actual creation and population of
aggregate tables is figuring out how to create the first couple;
what the SQL looks like.
After that they are pretty much all the same.
@@ -710,16 +714,16 @@ 4. Building aggregate tables
do not recognize star joins will require indexes on both the
fact table and the aggregate tables.
-For our purposes here, the exact name of the aggregate table is not
+
For our purposes here, the exact name of the aggregate table is not
important; the "agg_l_05_" preceding the base fact table's name
sales_fact_1997. First, the aggregate table name must be different
-from the base fact table name. Next, the aggregate table name ought to be
+from the base fact table name. Next, the aggregate table name ought to be
related to the base fact table name both for human eyeballing of what
aggregate is associated with which fact table, but also, as described
below, Mondrian employs mechanism to automagically recognize which
tables are aggregates of others.
-The following example is a collapsed dimension aggregate table
+
The following example is a collapsed dimension aggregate table
where the time dimension has been rolled up to the month level.
@@ -789,7 +793,7 @@ 4. Building aggregate tables
When creating a collapsed dimension aggregate one might consider creating
indexes for the columns imported from the dimension that was collapsed.
-Below is another aggregate table. This one has two lost dimensions (store_id
and
+
Below is another aggregate table. This one has two lost dimensions (store_id
and
promotion_id
) as well as collapsed dimension on time
to the quarter level. This shows how aggregate techniques can be
mixed.
@@ -838,9 +842,9 @@ 4. Building aggregate tables
In the above three examples, for the most part the column names
-in the aggregate are the same column names that appear in the fact
+in the aggregate are the same column names that appear in the fact
table and dimension tables. These tables would all be
-recognized by the Mondrian
+recognized by the Mondrian
default
aggregate recognizer.
It is possible to create an aggregate table and name the columns arbitrarily.
@@ -906,7 +910,7 @@
4. Building aggregate tables
This aggregate table has column names that are not identical to those -found in the base fact table and dimension table. It is still a +found in the base fact table and dimension table. It is still a valid aggregate but Mondrian has to be told how to map its columns into those of the base fact table.
@@ -919,7 +923,7 @@Mondrian has to know about the aggregate tables in order to use them. -You can either define an aggregate explicitly, or set up rules to recognize +You can either define an aggregate explicitly, or set up rules to recognize several aggregate tables at the same time.
How Mondrian recognizes aggregate table names and columns pretty much dictates how one must name those table names and columns when creating them in the first place!
-Rules are templates, designed to work for all fact table names +
Rules are templates, designed to work for all fact table names and their column -names. -These rules are templates of regular expressions +names. +These rules are templates of regular expressions that are instantiated with the names of a fact table and its columns. In order to describe the rule templates, a name that instantiate a rule are represented in a rule by have the name bracketed -by "${" and "}". As an example, -"abc_${name}_xyz" +by "${" and "}". As an example, +"abc_${name}_xyz" is a rule parameterized -by "name". When name is "john" the template becomes +by "name". When name is "john" the template becomes "abc_john_xyz".
The regular expression engine used here and a definition of @@ -983,7 +987,7 @@
agg_.+_${fact_table_name}
-
+
which is parameterized with the fact table's name. (In addition, this rule is applied in "ignore case" mode.) This means that an aggregate table's name must start with @@ -993,8 +997,8 @@
As an example of applying the aggregate table name rule,
-let the fact table be called
-sales_fact_1997
, the
+let the fact table be called
+sales_fact_1997
, the
Sales
cube's fact table from the FoodMart schema. Applying the
@@ -1004,7 +1008,7 @@
agg_.+_sales_fact_1997
-
+
This will match the following table names:
After the default recognizer determines that a table's name matches @@ -1071,7 +1075,7 @@
At this point, matches are looked for the level and measure columns. -Both of these matching rules are multi-part - has sub rules; +Both of these matching rules are multi-part - has sub rules; each rule has more than one possible regular expression that might match where a match on any one is a match.
@@ -1079,7 +1083,7 @@There are three sub rules for matching level columns. Each is a template which is parameterized with 1) the fact table's cube's dimension hierarchy's name, "hierarchy_name", 2) the fact table's cube's dimension -hierarchy's level name, "level_name", 3) the dimension table's level +hierarchy's level name, "level_name", 3) the dimension table's level column name, "level_column_name", and 4) a usage prefix, "usage_prefix", which in most cases is null":
@@ -1091,7 +1095,7 @@${level_column_name}
The "usage_prefix" is the value of the +
The "usage_prefix" is the value of the
DimensionUsage
's
or
private Dimension
's
@@ -1100,22 +1104,22 @@
usagePrefix
attribute is used to disambiguate those column names.
Of course, one must also remember to prefix the the column in the aggregate
table with the same prefix.
-As an example of +
As an example of
usagePrefix
, consider a fact table named
-ORDERS
which has two
+ORDERS
which has two
DimensionUsage
s, one for the
CUSTOMER
dimension
-and the other for the
+and the other for the
WHOLESALER
dimension where each dimension has a level
column named
@@ -1131,25 +1135,25 @@
WS_CUST_NM
, then the recognizer could associate the
-column with the
+column with the
WHOLESALER
dimension.
In the case of a private
-Dimension
, a
+Dimension
, a
usagePrefix
need only be used if there is a public,
shared
-Dimension
that has the same name and has a
+Dimension
that has the same name and has a
"level_column_name" that is also the same.
-Without the
+Without the
usagePrefix
there would be no way of disambiguating
collapsed dimension aggregate tables.
If any of these parameters have space characters, ' ', these are mapped to underscore characters, '_', and, similarly, dot characters, '.', are also mapped to underscores. -So, if the hierarchy_name is "Time", level_name is "Month" and -level_column_name is month_of_year, the possible aggregate table column +So, if the hierarchy_name is "Time", level_name is "Month" and +level_column_name is month_of_year, the possible aggregate table column names are:
where the measure name is converted to lower case and both the measure column name and aggregate name are matched as they appear. -If the fact table's cube's measure name was, "Avg Unit Sales", -the fact table's measure +If the fact table's cube's measure name was, "Avg Unit Sales", +the fact table's measure column name -is "unit_sales", and, lastly, the fact table's cube's measure's -aggregate name is "avg", then possible aggregate table column names +is "unit_sales", and, lastly, the fact table's cube's measure's +aggregate name is "avg", then possible aggregate table column names that would match are:
-For Mondrian developers there are +
For Mondrian developers there are additional notes describing the default rule recognition schema.
- @@ -1212,7 +1216,7 @@sales_fact_1997
. There are child elements of the
+sales_fact_1997
. There are child elements of the
Table
element that deal with aggregate table recognition.
@@ -1264,7 +1268,7 @@ agg_lc_10_sales_fact_1997
and
agg_pc_10_sales_fact_1997
.
-Following the excludes is the
+Following the excludes is the
AggName
element which identifies the name of an aggregate table
table,
@@ -1278,18 +1282,18 @@ admin_two
are known and should be ignored. If these columns were not so
-identified, Mondrian at the end of determining the fitness of
-the
+identified, Mondrian at the end of determining the fitness of
+the
agg_c_special_sales_fact_1997
-table to be an aggregate of the
+table to be an aggregate of the
sales_fact_1997
fact table would complain that there were extra unidentified columns
and that the mapping was incomplete.
The
AggForeignKey
-elements define mappings from the
+elements define mappings from the
sales_fact_1997
-fact table foreign key column
+fact table foreign key column
names into the
agg_c_special_sales_fact_1997
aggregate table column names.
@@ -1304,11 +1308,11 @@
AggMeasure
mappings, though it will certainly be the most common case.
-The most notable exception are
+The most notable exception are
distinct-count
measures; such a measure can be aggregated, but one can not
in general aggregate further on the measure - the "distinctness" of the
@@ -1320,13 +1324,13 @@ The +
The
In a given
AggName
element is followed by an
AggPattern
element.
-This matches candidate aggregate table names using a
+This matches candidate aggregate table names using a
regular expression. Included as child elements of the
@@ -1341,7 +1345,7 @@
5.2 Explicit aggregates
Table
-element, all of the
+element, all of the
AggExclude
are applied first, followed by the
@@ -1364,25 +1368,25 @@ 5.2 Explicit aggregates
of the fact table (like column names) against
which some of the aggregate table rules are applied. But, a fact table
can actually be the basis of more than one cube. In the FoodMart
-schema the
+schema the
sales_fact_1997
-fact table applies to both the
+fact table applies to both the
Sales
and the
Sales Ragged
cubes.
-What this means is that any explicit rules defined in the
+What this means is that any explicit rules defined in the
Sales
-cube also applies to the
+cube also applies to the
Sales Ragged
cube and visa versa.
One feature of the explicit recognizer is very useful. With a single
-line in the cubes definition in the schema file,
+line in the cubes definition in the schema file,
one can force Mondrian not to recognize any aggregate tables
for the cube's fact table. As an example, for the FoodMart Sales cube
-the following excludes all aggregate tables because the regular expression
+the following excludes all aggregate tables because the regular expression
pattern
".*"
@@ -1396,10 +1400,10 @@
During aggregate table recognition, +
During aggregate table recognition, rather than fail silently, Mondrian is rather noisy about things it can not figure out.
- @@ -1407,30 +1411,30 @@-A parent-child hierarchy is a -special kind of hierarchy where members can have arbitrary depth. The classic +A parent-child hierarchy is a +special kind of hierarchy where members can have arbitrary depth. The classic example of a parent-child hierarchy is an employee org-chart.
-When dealing with parent-child hierarchies, the challenge is to roll up measures -of child members into parent members. For example, when considering an employee -Bill who is head of a department, we want to report not Bill's salary, but -Bill's salary plus the sum of his direct and indirect reports (Eric, Mark and -Carla). It is difficult to generate efficient SQL to do this rollup, so Mondrian +When dealing with parent-child hierarchies, the challenge is to roll up measures +of child members into parent members. For example, when considering an employee +Bill who is head of a department, we want to report not Bill's salary, but +Bill's salary plus the sum of his direct and indirect reports (Eric, Mark and +Carla). It is difficult to generate efficient SQL to do this rollup, so Mondrian provides a special structure called a closure table, which contains the expanded contents of the hierarchy.
-A closure table serves a similar purpose to an aggregate table: it contains a -redundant copy of the data in the database, organized in such a way that -Mondrian can access the data efficiently. An aggregate table speeds up -aggregation, whereas a closure table makes it more efficient to compute +A closure table serves a similar purpose to an aggregate table: it contains a +redundant copy of the data in the database, organized in such a way that +Mondrian can access the data efficiently. An aggregate table speeds up +aggregation, whereas a closure table makes it more efficient to compute hierarchical rollups.
-Supposing that a schema contains a large fact table, and one of the hierarchies -is a parent-child hierarchy. Is is possible to make aggregate tables and closure -tables work together, to get better performance? Let's consider a concrete +Supposing that a schema contains a large fact table, and one of the hierarchies +is a parent-child hierarchy. Is is possible to make aggregate tables and closure +tables work together, to get better performance? Let's consider a concrete example.
@@ -1596,12 +1600,12 @@
0 - +- @@ -1609,15 +1613,15 @@
Regular dimension table:
time (year, month, quarter, time_id)6.1 Aggregate tables at the leaf level of a parent-child hierarchy
-The simplest option is to create an aggregate table which joins at the leaf -level of the parent-child hierarchy. The following aggregate table is for leaf -members of the
[Employee]
hierarchy, and the[Year]
+The simplest option is to create an aggregate table which joins at the leaf +level of the parent-child hierarchy. The following aggregate table is for leaf +members of the[Employee]
hierarchy, and the[Year]
level of the[Time]
hierarchy.Aggregate table:
- agg_salary_Employee_Time_Year (employee_id, time_year, + agg_salary_Employee_Time_Year (employee_id, time_year, sum_dollars)
INSERT INTO agg_salary_Employee_Time_Year
@@ -1634,15 +1638,15 @@
-Mondrian can use the aggregate table to retrieve salaries of leaf employees -(without rolling up salaries of child employees). But because the aggregate -table has the same foreign key as the
- @@ -1650,14 +1654,14 @@salary
fact table, Mondrian +Mondrian can use the aggregate table to retrieve salaries of leaf employees +(without rolling up salaries of child employees). But because the aggregate +table has the same foreign key as thesalary
fact table, Mondrian is able to automatically joinsalary.employee_id
to eitheragg_salary_Employee_Time_Year.employee_id
or-agg_salary_Employee_Time_Year.supervisor_id
to rollup employees +agg_salary_Employee_Time_Year.supervisor_id to rollup employees efficiently.6.2 Combined closure and aggregate tables
-A more advanced option is to combine the closure table and aggregate table into +A more advanced option is to combine the closure table and aggregate table into one:
Aggregate table:
- agg_salary_Employee$Closure_Time_Year (supervisor_id, + agg_salary_Employee$Closure_Time_Year (supervisor_id, time_year, sum_dollars)
INSERT INTO agg_salary_Employee$Closure_Time_Year
@@ -1677,12 +1681,12 @@
-The
- @@ -1690,8 +1694,8 @@agg_salary_Employee$Closure_Time_Year
aggregate table contains -the salary of every employee, rolled up to include their direct and indirect -reports, aggregated to the[Year]
level of the[Time]
+Theagg_salary_Employee$Closure_Time_Year
aggregate table contains +the salary of every employee, rolled up to include their direct and indirect +reports, aggregated to the[Year]
level of the[Time]
dimension.6.2.1 The trick: How combined closure and aggregate tables work
-Incidentally, this works based upon a 'trick' in Mondrian's internals. Whenever -Mondrian sees a closure table, it creates an auxiliary dimension behind the +Incidentally, this works based upon a 'trick' in Mondrian's internals. Whenever +Mondrian sees a closure table, it creates an auxiliary dimension behind the scenes. In the case of the
@@ -1702,23 +1706,23 @@[Employee]
hierarchy and itsemployee_closure
table, the auxiliary dimension is called [Employee$Closure]
.
-When an MDX query evaluates a cell which uses a rolled up salary measure,
-Mondrian translates the coordinates of that cell in the [Employee]
-dimension into a corresponding coordinate in the [Employee$Closure]
+When an MDX query evaluates a cell which uses a rolled up salary measure,
+Mondrian translates the coordinates of that cell in the [Employee]
+dimension into a corresponding coordinate in the [Employee$Closure]
-dimension. This translation happens before
-Mondrian starts to search for a suitable aggregate table, so if your aggregate
+dimension. This translation happens before
+Mondrian starts to search for a suitable aggregate table, so if your aggregate
table contains the name of the auxiliary hierarchy (as
-agg_salary_Employee$Closure_Time_Year
contains the name of the [Employee$Closure]
+agg_salary_Employee$Closure_Time_Year contains the name of the [Employee$Closure]
hierarchy) Mondrian will find and use the aggregate table in the ordinary way.
-If more than one aggregate table matches a particular query, Mondrian needs to +If more than one aggregate table matches a particular query, Mondrian needs to choose between them.
-If there is an aggregate table of the same granularity as the query, Mondrian
-will use it. If there is no aggregate table at
-the desired granularity, Mondrian will pick an aggregate table of lower
-granularity and roll up from it. In general, Mondrian chooses the aggregate
-table with the fewest rows, which is typically the aggregate table with the
+If there is an aggregate table of the same granularity as the query, Mondrian
+will use it. If there is no aggregate table at
+the desired granularity, Mondrian will pick an aggregate table of lower
+granularity and roll up from it. In general, Mondrian chooses the aggregate
+table with the fewest rows, which is typically the aggregate table with the
fewest extra dimensions. See property
mondrian.rolap.aggregates.ChooseByVolume
.
There is an important exception for distinct-count measures: they cannot in general be -rolled up over arbitrary dimensions. To see why, consider the case of a supermarket chain which has two -stores in the same city. Suppose that Store A has 1000 visits from 800 distinct -customers in the month of July, while Store B has 1500 visits from 900 distinct -customers. Clearly the two stores had a total of 2500 customer visits between -them, but how many distinct customers? We can say that there were at least 900, -and maybe as many as 1700, but assuming that some customers visit both stores, -the real total will be somewhere in between. "Distinct customers" is an -example of a distinct-count measure, and cannot be deduced by rolling up +
There is an important exception for distinct-count measures: they cannot in general be +rolled up over arbitrary dimensions. To see why, consider the case of a supermarket chain which has two +stores in the same city. Suppose that Store A has 1000 visits from 800 distinct +customers in the month of July, while Store B has 1500 visits from 900 distinct +customers. Clearly the two stores had a total of 2500 customer visits between +them, but how many distinct customers? We can say that there were at least 900, +and maybe as many as 1700, but assuming that some customers visit both stores, +the real total will be somewhere in between. "Distinct customers" is an +example of a distinct-count measure, and cannot be deduced by rolling up subtotals. You have to go back to the raw data in the fact table.
-There is a special case where it is acceptable to roll up distinct count -measures. Suppose that we know that in July, this city's stores (Store A and B -combined) have visits from 600 distinct female customers and 700 distinct male -customers. Can we say that the number of distinct customers in July is 1300? Yes -we can, because we know that the sets of male and female customers cannot -possibly overlap. In technical terms, gender is functionally dependent on +
There is a special case where it is acceptable to roll up distinct count +measures. Suppose that we know that in July, this city's stores (Store A and B +combined) have visits from 600 distinct female customers and 700 distinct male +customers. Can we say that the number of distinct customers in July is 1300? Yes +we can, because we know that the sets of male and female customers cannot +possibly overlap. In technical terms, gender is functionally dependent on customer id.
The rule for rolling up distinct measures can be stated as follows:
- A distinct count measure over key k can be computed by rolling up - more granular subtotals only if the attributes which are being rolled up are - functionally dependent on k. + A distinct count measure over key k can be computed by rolling up + more granular subtotals only if the attributes which are being rolled up are + functionally dependent on k.
-Even with this special case, it is difficult to create enough aggregate tables -to satisfy every possible query. When evaluating a distinct-count measure, Mondrian can only use -an aggregate -table if it has the same logical/level granularity as the cell being -requested, or can be rolled up to that granularity only by dropping functionally -dependent attributes. If -there is no aggregate table of the desired granularity, Mondrian goes instead +Even with this special case, it is difficult to create enough aggregate tables +to satisfy every possible query. When evaluating a distinct-count measure, Mondrian can only use +an aggregate +table if it has the same logical/level granularity as the cell being +requested, or can be rolled up to that granularity only by dropping functionally +dependent attributes. If +there is no aggregate table of the desired granularity, Mondrian goes instead against the fact table.
-This has implications for aggregate design. If your application makes extensive -use of distinct-count measures, you will need to create an aggregate table for -each granularity where it is used. That could be a lot of aggregate tables! (We +This has implications for aggregate design. If your application makes extensive +use of distinct-count measures, you will need to create an aggregate table for +each granularity where it is used. That could be a lot of aggregate tables! (We hope to have a better solution for this problem in future releases.)
That said, Mondrian will roll up measures in an aggregate table that contains one or more distinct-count measures if none of the distinct-count measures are requested. In that respect an aggregate table containing distinct-count -measures is just like any other aggregate table as long as the -distinct-count measures are not needed. And once in memory, distinct-count +measures is just like any other aggregate table as long as the +distinct-count measures are not needed. And once in memory, distinct-count measures are cached like other measures, and can be used for future queries.
When building an aggregate table that will contain a distinct-count measure, the measure must be rolled up to a logical dimension level, which is -to say that the aggregate table must be a collapsed dimension aggregate. +to say that the aggregate table must be a collapsed dimension aggregate. If it is rolled up only to the dimension's foreign key, there is no guarantee that the foreign key is at the same granularity as the lowest logical level, which is what @@ -1815,7 +1819,7 @@
store_id
.
-
+
INSERT INTO "agg_l_04_sales_fact_1997" (
@@ -1840,21 +1844,21 @@
-This aggregate table is useless for computing the "customer_count"
-measure. Why? The distinct-count measure is rolled up to the
+This aggregate table is useless for computing the "customer_count"
+measure. Why? The distinct-count measure is rolled up to the
time_id
granularity, the lowest level granularity of the
-physical database table time_by_day
. Even a query against the lowest level in the
+physical database table time_by_day
. Even a query against the lowest level in the
Time
dimension would require a rollup from time_id
to
month_of_year
, and this is impossible to perform.
-Now consider this collapsed Time
dimension aggregate table
-that has the same lost dimensions customer_id
,
+Now consider this collapsed Time
dimension aggregate table
+that has the same lost dimensions customer_id
,
product_id
, promotion_id
and store_id
.
-The time_id
foreign key is no longer present, rather it
-has been replaced with the logical levels the_year
,
+The time_id
foreign key is no longer present, rather it
+has been replaced with the logical levels the_year
,
quarter
and month_of_year
.
This aggregate table of the distinct-count measure can be used to fulfill
-a query as long as the query specifies the
+a query as long as the query specifies the
Time
dimension down to the
month_of_year
level.
-A better design for the aggregate table would include a few attributes which are
-functionally dependent on customer_id
, the key for the
+A better design for the aggregate table would include a few attributes which are
+functionally dependent on customer_id
, the key for the
distinct-count measure:
The added attributes are "country"
, "gender"
and
-"marital_status"
. This table has only approximately 12x the number of rows
+"marital_status". This table has only approximately 12x the number of rows
of the previous aggregate table (3 values of country
x 2 values of
-gender
x 2 values of marital_status
) but can answer
+gender
x 2 values of marital_status
) but can answer
many more potential queries.
-Aggregate tables are difficult to design and maintain. We make no bones about it. -But this is the first release in which aggregate tables have been available, and -we decided to get the internals right rather than building a toolset to make +Aggregate tables are difficult to design and maintain. We make no bones about it. +But this is the first release in which aggregate tables have been available, and +we decided to get the internals right rather than building a toolset to make them easy to use.
-Unless your dataset is very large, Mondrian's performance will be just fine -without aggregate tables. If Mondrian isn't performing well, you should first +Unless your dataset is very large, Mondrian's performance will be just fine +without aggregate tables. If Mondrian isn't performing well, you should first check that your DBMS is well-tuned (see our guide to -optimizing performance). If you decide to -build aggregate tables anyway, we don't offer any tools to help administrators -design them, so unless you are blessed with superhuman patience and intuition, +optimizing performance). If you decide to +build aggregate tables anyway, we don't offer any tools to help administrators +design them, so unless you are blessed with superhuman patience and intuition, using them won't be smooth sailing.
@@ -2002,7 +2006,7 @@This utility populates (or generates INSERT statements to populate) the agg tables.
-For extra credit: populate the tables in topological order, so that -higher level aggregations can be built from lower level aggregations. +
For extra credit: populate the tables in topological order, so that +higher level aggregations can be built from lower level aggregations. (See [AAD+96]).
- @@ -2044,7 +2048,7 @@The algorithm could also take into account usage -information. A set of sample queries could be an input to the utility, or the -utility could run as a background task, consuming the query log and dynamically +information. A set of sample queries could be an input to the utility, or the +utility could run as a background task, consuming the query log and dynamically making recommendations.
- @@ -2067,7 +2071,7 @@This utility would allow agg tables to be taken offline/online while Mondrian is still running.
- @@ -2077,6 +2081,7 @@The name of the role to adopt for access-control purposes. If not specified, the connection uses a role which has access to every object in the schema.
-This property can contain multiple role names separated by commas. If - so, queries in the connection execute with the sum of the privileges of +
This property can contain multiple role names separated by commas. If
+ so, queries in the connection execute with the sum of the privileges of
all of the roles; the effect is the same as running under a
union role, defined using the
<Union>
element in the schema file.
If a role name contains a comma, escape the comma using an extra +
If a role name contains a comma, escape the comma using an extra comma. For example, a connection created with
--
Role='Pacific region manager,Europe,, Middle East and +
Role='Pacific region manager,Europe,, Middle East and Africa manager'
will execute with the combined privileges of the roles "Pacific +
will execute with the combined privileges of the roles "Pacific region manager", and "Europe, Middle East and Africa manager".
Controls whether a new connection uses a schema from the schema
- cache. If true
, the default, a connection shares a schema
- definition (and hence also a cache of aggregate data retrieved by
- previous queries) with other connections which have a textually
+
Controls whether a new connection uses a schema from the schema
+ cache. If true
, the default, a connection shares a schema
+ definition (and hence also a cache of aggregate data retrieved by
+ previous queries) with other connections which have a textually
identical schema definition.
If false
, the connection has a private schema definition
+
If false
, the connection has a private schema definition
and cache.
A unique identifier for the connection. If this is set, Mondrian will look at this property and no other to determine whether two +
A unique identifier for the connection. If this is set, Mondrian will look at this property and no other to determine whether two data sources should be considered the same. You must ensure that connections will only share a JdbcConnectionUuid if they point to the same database.
The name of the database catalog to scan when loading aggregate tables. If this is not set, Mondrian will read all catalogs the database connection has access to when loading aggregate tables.
The name of the database schema to scan when loading aggregate tables. If this is not set, Mondrian will read all schemas the database connection has access to when loading aggregate tables.
Connect string properties are also documented in the
@@ -323,7 +333,7 @@ The cache is only used when creating new connections; existing connections
retain their schemas.
-There are four connect string properties that control the use of the
+There are four connect string properties that control the use of the
Schema cache:
Schema cache
UseSchemaPool
,
UseContentChecksum
,
@@ -335,28 +345,28 @@ Schema cache
regardless of the values of any of the other properties. If UseSchemaPool
is "false", then the cache is not used; each request for a new schema
object creates a new one (entailing the re-parsing of the schema definition
-and re-scanning of the database for meta data and aggregate tables -
+and re-scanning of the database for meta data and aggregate tables -
very slow, and, in addition, there is no reuse of the in-memory aggregate
cache).
Next, if UseContentChecksum
is "true", then a check sum (MD5) is created
from the schema definition content (not URL) and it is this check sum
-that is used as the key to lookup previously cached versions of the
+that is used as the key to lookup previously cached versions of the
schema definition. If two schema definitions produce different check
sums, then one can safely assume that they are different schemas (of course,
-it is possible that only a comment or some whitespace in the schema
-definition changed in which case the two schemas would actually be the
-same, but because their check sums are different, different schema
+it is possible that only a comment or some whitespace in the schema
+definition changed in which case the two schemas would actually be the
+same, but because their check sums are different, different schema
objects are used). If UseContentChecksum is "false", then no check sum
-is created and used as the lookup key, rather, a combination of
-the connection attributes
-"catalogUrl",
-"connectionKey",
-"jdbcUser",
+is created and used as the lookup key, rather, a combination of
+the connection attributes
+"catalogUrl",
+"connectionKey",
+"jdbcUser",
"dataSourceStr"
-or
-"catalogUrl",
-"dataSource"
+or
+"catalogUrl",
+"dataSource"
are used to create the key.
If the CatalogContent
is specified, then it is used as the schema
@@ -437,7 +447,7 @@
Mondrian uses log4j for all information and debug logging. When running within an application server, Mondrian's log4j configuration is determined by -the server's or web application's log4j configuration. Please see +the server's or web application's log4j configuration. Please see log4j's documentation for additional details.
@@ -446,12 +456,12 @@When running outside an application server, log4j determines the location of the log4j.xml file via the log4j.configuration Java system property. log4j -treats this string as a URL, so to have it detect the log4j file on the file -system, you must use the syntax "file:DIR/log4j.xml". Relative paths are -acceptable, so if you have your log4j.xml file in the root directory of -mondrian, "file:log4j.xml" will load the correct file. You may specify the -log4j.configuration property in mondrian.properties, because Mondrian's ant -build file explicitly sets the property as a JVM system property when running +treats this string as a URL, so to have it detect the log4j file on the file +system, you must use the syntax "file:DIR/log4j.xml". Relative paths are +acceptable, so if you have your log4j.xml file in the root directory of +mondrian, "file:log4j.xml" will load the correct file. You may specify the +log4j.configuration property in mondrian.properties, because Mondrian's ant +build file explicitly sets the property as a JVM system property when running JUnit tests.
@@ -459,9 +469,9 @@The default log4j.xml file is configured so that a separate log file is created for both MDX and SQL statement logging. In the code, the MDX and SQL -strings are logged at the debug level, so to disable them you can set the log +strings are logged at the debug level, so to disable them you can set the log level to INFO or any other level above debug. Statement logging occurs within -the log4j categories "mondrian.mdx" and "mondrian.sql". These categories log +the log4j categories "mondrian.mdx" and "mondrian.sql". These categories log the statements and how long they took to execute. The SQL log also records the number of results returned in the result set.
From 927d2f6c4f5b1f68b3bb3a3a3b73eb5e5e83e44e Mon Sep 17 00:00:00 2001 From: Guilherme Raimundo