我正在尝试实现Kimball数据集市,它在维度表中使用-1和-2行,用于延迟到达的维度和空业务密钥。下面有一个示例代码,它为事实和维度数据创建一个暂存表,为数据集市创建一个二维表和一个事实表。下面是我使用SQL中数据的示例代码:
--drop table stg_sales
--go
CREATE TABLE dbo.stg_sales
(
stg_sales_id INT IDENTITY(1, 1) NOT NULL,
sales_number INT NOT NULL,
sales_amt INT NULL,
cust_number INT NULL,
cust_firstname NVARCHAR(50) NULL,
cust_lastname NVARCHAR(100) NULL,
cust_address NVARCHAR(500) NULL,
salesperson_number INT NULL,
CONSTRAINT pk_stg_sales PRIMARY KEY (stg_sales_id)
)
go
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (123,
434,
2342,
'Jim',
'Moriaty',
'something',
23)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (124,
234,
2342,
'Jim',
'Moriaty',
'something',
23)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (125,
434,
4545,
'Joe',
'Esk',
'someother',
24)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (126,
434,
5555,
'Daniel',
'Hart',
'Someaddr',
NULL) --salesperson_number business key missing here
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (127,
333,
4444,
'Pat',
'Smith',
'Someaddr',
30)
SELECT *
FROM stg_sales
--create a dimension and fact tables
--drop table dbo.dim_customer
--go
CREATE TABLE dbo.dim_customer
(
customer_wid INT IDENTITY(1, 1) NOT NULL,
cust_number INT NULL,
cust_firstname NVARCHAR(50) NULL,
cust_lastname NVARCHAR(100) NULL,
cust_address NVARCHAR(500) NULL,
date_insert DATETIME2 NOT NULL DEFAULT (Getdate()),
date_update DATETIME2 NULL,
is_current BIT NOT NULL
CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid)
CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
)
go
SET IDENTITY_INSERT dbo.dim_customer ON
INSERT dbo.dim_customer
(customer_wid,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
date_insert,
date_update,
is_current)
VALUES (-1,
-1,
'unknown',
'unknown',
'unknown',
Getdate(),
Getdate(),
1)
INSERT dbo.dim_customer
(customer_wid,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
date_insert,
date_update,
is_current)
VALUES (-2,
-2,
'Error',
'Error',
'Error',
Getdate(),
Getdate(),
1)
SET IDENTITY_INSERT dbo.dim_customer OFF
--insert data into dimension table
INSERT dbo.dim_customer
(cust_number,
cust_firstname,
cust_lastname,
cust_address,
is_current)
SELECT DISTINCT cust_number,
cust_firstname,
cust_lastname,
cust_address,
1 AS is_current
FROM dbo.stg_sales
WHERE cust_number <> 4444 --left one record off to simulate the situation where you don't have corrensponding row in dim table (late arriving dimension)
SELECT *
FROM dbo.dim_customer
DROP TABLE dbo.dim_salesperson
--create salesperson table
CREATE TABLE dbo.dim_salesperson
(
salesperson_wid INT IDENTITY(1, 1) NOT NULL,
salesperson_number INT NULL,
salesperson_firstname NVARCHAR(50) NULL,
salesperson_lastname NVARCHAR(100) NULL,
salesperson_address NVARCHAR(500) NULL,
date_insert DATETIME2 NOT NULL DEFAULT (Getdate()),
date_update DATETIME2 NULL,
is_current BIT NOT NULL
CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid)
CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
)
go
SET IDENTITY_INSERT dbo.dim_salesperson ON
INSERT dbo.dim_salesperson
(salesperson_wid,
salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
date_insert,
date_update,
is_current)
VALUES (-1,
-1,
'Not available',
'Not available',
'Not available',
Getdate(),
Getdate(),
1)
INSERT dbo.dim_salesperson
(salesperson_wid,
salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
date_insert,
date_update,
is_current)
VALUES (-2,
-2,
'Error',
'Error',
'Error',
Getdate(),
Getdate(),
1)
SET IDENTITY_INSERT dbo.dim_salesperson OFF
--insert data into salesperson
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (23,
'John',
'Fox',
'something',
1)
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (24,
'Hadley',
'Fox',
'something',
1)
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (30,
'Ashley',
'Fox',
'something',
1)
SELECT *
FROM dbo.dim_salesperson
SELECT *
FROM dbo.stg_sales
--create and populate the fact table
--drop table dbo.f_sales
--go
CREATE TABLE dbo.f_sales
(
sales_number INT NOT NULL,
customer_wid INT NOT NULL,
salesperson_wid INT NOT NULL,
sales_amt INT NULL
CONSTRAINT pk_f_sales PRIMARY KEY (sales_number)
CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
dbo.dim_customer(customer_wid),
CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
dbo.dim_salesperson(salesperson_wid)
)
--populate the fact table
INSERT dbo.f_sales
(sales_number,
customer_wid,
salesperson_wid,
sales_amt)
SELECT stg.sales_number,
Isnull(dimcust.customer_wid, -1) AS customer_wid,
--this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
Isnull(dimsp.salesperson_wid, -2) AS salesperson_wid,
--NOT CORRECT, how to assign -2 foreign key when the business key is NULL in the source?
stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
ON stg.cust_number = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
ON stg.salesperson_number = dimsp.salesperson_number
SELECT *
FROM dbo.f_sales如何为源系统中缺少业务密钥的行分配-2。您可以从Kimball了解更多关于此实现背后的理论:
这基本上就是我想要达到的目标:
在维度和度量中处理NULL
编辑:
我认为我可以在左联接中使用COALESCE或ISNULL,它似乎产生了正确的结果:
INSERT dbo.f_sales
(sales_number,
customer_wid,
salesperson_wid,
sales_amt)
SELECT stg.sales_number,
Isnull(dimcust.customer_wid, -1) AS customer_wid,
--this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
dimsp.salesperson_wid,
stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number发布于 2012-10-12 16:16:02
纯粹作为一种查找技术
-- add nullable keys to the staging table
alter table dbo.stg_sales ADD
sales_person_wid integer null
, customer_wid integer null
;
-- insert to staging table here (as in your example)
-- lookup sales person key
update dbo.stg_sales
set sales_person_wid = p.sales_person_wid
from dbo.stg_sales as s , dbo.dim_salesperson as p
where s.salesperson_number = p.salesperson_number ;
-- decide what to do with missing business keys
update dbo.stg_sales
set sales_person_wid = -2
where sales_person_wid is null ;
-- do similar for customer
-- now all keys in staging table are not null
-- load to fact table然而,通常的技术是在提取或清理过程中为事务分配特殊(未知、n/a、错误)业务密钥。换句话说,在记录落入暂存表之前,可以分配特殊的Error业务密钥。
最后指出,延迟到达维度意味着业务密钥(salesperson_number)对操作系统来说是已知的,但是事务(销售事实)在维度数据之前将其提交给了仓库。因此,salesperson_number将是not null,但不存在于dimenson表中。您不必将此事务保存在某个地方,并在记录到达维度后尝试更新FK (salesperson_wid);一天左右之后。
发布于 2012-10-22 14:45:15
我认为一个晚到的事实是一个与事实相关的维度正在缓慢变化,而事实却是迟来的。假设你有一个销售系统,有一个叫做'Opal果树‘的产品,该产品将名称改为'Starburst’。该公司希望跟踪名称的变化是否影响销售,因此他们设置了类型2缓慢变化的维度(SCD)。在SCD中,他们跟踪产品名称,因此每次产品名称更改时,他们都会为该产品发出一个新记录,该记录具有相同的业务(自然)密钥,但具有不同的代理密钥,并记录该名称的有效开始日期和日期。因此,说‘蛋白石水果’的有效期为1970年1月1日和1995年3月28日。产品名为'Opal果树‘的版本从'29/3/1995’开始有效,而有效to是空的,因为它仍然有效。
现在,比如说在1995年4月1日,在名称变更后的一天,他们得到了一些交易日期为1/4/1995和交易日期为27/3/1995的销售交易。延迟到达意味着给定维度的事务在产品维度更改后到达。因此,1995年3月27日到达的交易需要有产品维度的代理键,其产品名为“Opal Fruitt”,而在1/4/1995到达的事务需要名称为“Starburst”的产品维度的替代键。您需要做的是计算出'Opal果树/ Starburst‘产品的哪个版本在交易日期是最新的,并在交易进入销售事实时将该产品的代理密钥分配给交易。
这就是我对迟来的事实的理解。在处理维度元素之前到达的事实肯定是早到的事实,而不是晚到的事实。也许只是术语上的问题。
https://stackoverflow.com/questions/12853932
复制相似问题