Maintenance – Record Deletion

A simple framework for implementing a time-framed record deletion policy: deleting records that are older than a certain threshold.

Consists of a logging table and a stored procedure that deletes the records. The procedure deletes rows in batches, and an overall maximum can be specified for a single execution, to limit the impact on resources when there is a huge set of records to delete.

Logging Table

CREATE TABLE [dbo].[RecordDeletions]
(
	[RecordDeletionId] INT NOT NULL IDENTITY, 
    [TableName] VARCHAR(50) NULL, 
	[DeletionThresholdDate] smalldatetime NULL,
    [OperationStatus] VARCHAR(50) NOT NULL CONSTRAINT DF_RecordDeletions_OperationStatus DEFAULT 'Running', 
    [StartTime] DATETIME NOT NULL CONSTRAINT DF_RecordDeletions_StartTime DEFAULT GetDate(), 
    [EndTime] DATETIME NULL, 
    [NbrRecordsDeleted] INT NULL , 
    [DurationSeconds] as DATEDIFF(SECOND, StartTime, EndTime), 
    CONSTRAINT [PK_RecordDeletions] PRIMARY KEY ([RecordDeletionId]),
	CONSTRAINT [CK_RecordDeletions_OperationStatus_Valid_Values] 
        CHECK (OperationStatus IN ('Running', 'Success', 'Failure'))
)

GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Keeps a log of records that have been deleted from tables for housekeeping',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = NULL,
    @level2name = NULL
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Time that this operation started',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'StartTime'
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Time that this operation ended',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'EndTime'
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Number of records deleted',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'NbrRecordsDeleted'
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Name of table',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'TableName'
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Status of operation:  Running, Success, Failure',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'OperationStatus'
GO
EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'(Computed column) Seconds duration of completed events.',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'DurationSeconds'
GO

EXEC sp_addextendedproperty @name = N'MS_Description',
    @value = N'Rows older than the DeletionThresholdDate are targeted for deletion in this event.',
    @level0type = N'SCHEMA',
    @level0name = N'dbo',
    @level1type = N'TABLE',
    @level1name = N'RecordDeletions',
    @level2type = N'COLUMN',
    @level2name = N'DeletionThresholdDate'

Record Deletion Procedure

This procedure is fairly basic and requires duplication and customisation for each table (or cluster of related tables). Search and replace TARGET_TABLE_NAME – but the appropriate primary key and date columns will also have to be edited in the “Deletion Candidates Selection Block”.

CREATE PROCEDURE dbo.DeleteOldRows_TARGET_TABLE_NAME

/*
	Delete rows older than the retention period 
	
	@DeleteBatchSize : number of records deleted in each loop.
	@OverallMaxRowsPerExecution : absolute max number of rows that will be deleted 
								in a single execution of the procedure.

*/
AS
SET NOCOUNT ON;
SET XACT_ABORT ON
BEGIN TRY
	DECLARE @DeleteBatchSize int = 5000
	DECLARE @OverallMaxRowsPerExecution int = 400000

	DECLARE @DebugInfo varchar(2000)
	DECLARE @MaxRowNumber int = 0
	DECLARE @RangeStart int
	DECLARE @RangeEnd int
	DECLARE @RecordDeletionId int
	DECLARE @NbrRecordsDeleted int = 0
	DECLARE @OperationStatus varchar(50) = 'Failure'


	CREATE TABLE #RecordsToDelete
		(RowNumber int not null PRIMARY KEY IDENTITY,
		 RecordId  int not null
		 )

	-- Insert logging row, calculate time threshold (here: 60 months before today).
	SET @DebugInfo = 'Insert logging row'
	INSERT dbo.RecordDeletions 
		(TableName,
		 DeletionThresholdDate)
		VALUES
		('TARGET_TABLE_NAME',
		  DATEADD(MONTH, -60, CAST(GETDATE() AS date)));

	SET @RecordDeletionId = SCOPE_IDENTITY()

	-- Deletion Candidates Selection Block
	INSERT INTO #RecordsToDelete (RecordId)
		SELECT TOP (@OverallMaxRowsPerExecution)
			ID
		FROM
			dbo.TARGET_TABLE_NAME L
			INNER JOIN dbo.RecordDeletions RD
			ON L.RowCreatedDate <= RD.DeletionThresholdDate
		WHERE
			RD.RecordDeletionId = @RecordDeletionId

	SELECT
		@RangeStart = MIN(RowNumber),
		@MaxRowNumber = MAX(RowNumber)
	FROM
		#RecordsToDelete


	WHILE (@RangeStart <= @MaxRowNumber) -- <= so the final row/batch is not skipped
	BEGIN
		SET @RangeEnd = (@RangeStart + @DeleteBatchSize -1)

		DELETE TargetTable
		FROM 
			dbo.TARGET_TABLE_NAME TargetTable
			INNER JOIN #RecordsToDelete R ON TargetTable.ID = R.RecordId
		WHERE	
			R.RowNumber BETWEEN @RangeStart and @RangeEnd
	
		SET @NbrRecordsDeleted += @@ROWCOUNT
		SET @RangeStart += @DeleteBatchSize

		WAITFOR DELAY '000:00:00.500' -- pause for 500ms to allow other processes to gain locks.
	END

	SET @OperationStatus = 'Success'


END TRY
BEGIN CATCH
	EXECUTE dbo.ERROR_HandleCaughtError @DebugInfo
	SET @OperationStatus = 'Failure';
END CATCH	

BEGIN TRY
	SET @DebugInfo = 'Logging and Cleanup section'
	UPDATE
		dbo.RecordDeletions
	SET
		OperationStatus = @OperationStatus,
		EndTime = GETDATE(),
		NbrRecordsDeleted = @NbrRecordsDeleted
	WHERE
		RecordDeletionId = @RecordDeletionId

	DROP TABLE #RecordsToDelete
END TRY
BEGIN CATCH
	EXECUTE dbo.ERROR_HandleCaughtError @DebugInfo
	RETURN 5555
END CATCH

Uses the Error Handler for T-SQL (ERROR_HandleCaughtError).
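
A minimal usage sketch, assuming a copy of the procedure has been created for a hypothetical Orders table:

-- Run the deletion (e.g. from a nightly agent job), then review the outcome of recent runs.
EXEC dbo.DeleteOldRows_Orders;

SELECT TOP (10) *
FROM dbo.RecordDeletions
ORDER BY RecordDeletionId DESC;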

Populating Date Dimension with Fiscal Periods

A standard method for populating a date table with fiscal periods will always come to grief over exceptions.  This post includes a procedure that should be customisable enough to cater for most variations on fiscal periods – provided all periods contain a whole number of weeks.

Fiscal Periods

… do vary.

  • A 4-4-5 week pattern is standard… except when adjustments are made for busy months.
  • Normally a quarter contains three months… except when a company is adjusting its financial year end.

The procedure detailed below takes a few parameters which specify the fiscal period schema:

  • FirstDayOfFiscalYear : the day it all starts, which also determines the first day of week.
  • WeeksPerMonthPattern : a CSV list stating the number of weeks in each month, e.g. 4,4,5,4,4,5,4,4,5,4,5,4. The procedure defaults to 4-4-5 throughout the year.
  • MonthsPerQuarterPattern : a CSV list stating the number of months per quarter. Defaults to 3,3,3,3, so it only needs overriding in exceptional circumstances.

There is some validation to ensure that the numbers of months, quarters and weeks align. There’s a “checksum” NbrOfWeeks parameter which defaults to 52; set it explicitly for a 53-week year, or for fiscal years with other numbers of weeks.
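
As a rough sketch of that alignment check, a pattern can be summed before calling the procedure (STRING_SPLIT assumes SQL Server 2016+; the procedure performs its own validation in any case):

DECLARE @WeeksPerMonthPattern varchar(100) = '4,4,5,4,4,5,4,5,4,4,4,5';
SELECT
	SUM(CAST(value AS int)) AS TotalWeeks,  -- should equal the NbrOfWeeks checksum (52 here)
	COUNT(*)                AS TotalMonths  -- should match the months implied by MonthsPerQuarterPattern
FROM STRING_SPLIT(@WeeksPerMonthPattern, ',');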

Columns to Update

A date table might have any number of specialised fiscal columns to update – week and month numbers, start and end date keys, overall period counts.  Therefore the procedure provides a simple but flexible way of specifying an update clause using placeholders for values to be inserted.

  • TableToUpdate : name of table
  • DateKeyColumn : the date-key column of the TableToUpdate. The column must be an integer conversion of the date in format 112 (yyyymmdd).
  • ColumnUpdatePattern : the contents of a SET clause as it would appear in an UPDATE statement, with placeholders. E.g. “FiscalMonthNumber={m}, FiscalQuarterNumber={q},… FiscalMonthStart={mst},…”. Available placeholders:
    • {year} : the value of @FiscalYearName
    • {d}, {w}, {m}, {q}, {s} : day, week, month, quarter and semester numbers.
    • {wst}, {mst}, {qst}, {sst} : week, month, quarter, semester start date key
    • {wend}, {mend}, {qend}, {send} : week, month, quarter, semester end date key

Procedure ai_GenerateFiscalCalendar

This procedure generates table update statements for a single financial year.

Fiscal Calendar Update Generator.sql

Example Call

This call uses a specialised week-per-month pattern.

DECLARE @ColumnUpdatePattern varchar(1000) = 'Fiscal_year={year}, Fiscal_quarter={q}, ' +
    'Fiscal_period={m}, Fiscal_Week={w}, ' +
	'Fiscal_week_overall_nbr=(1161 + {w}), Fiscal_period_overall_nbr=(1037 + {m}), ' +
	'[quarter_Fiscal_of_year_end_date_key] = {qend}';
EXEC ai_GenerateFiscalCalendar    
	@FirstDayOfFiscalYear='2017-04-30',
	@FiscalYearName = '2017',
	@TableToUpdate = 'dbo.DimDate',
	@DateKeyColumn = 'date_key',
	@ColumnUpdatePattern = @ColumnUpdatePattern,
	@WeeksPerMonthPattern = '4,4,5,4,4,5,4,5,4,4,4,5';

A summary is output to the messages tab so that the periods can be checked.

FISCAL YEAR: 2017
START DATE: 2017-04-30 Sunday
TOTAL DAYS: 364
TOTAL WEEKS: 52
MONTHS:
 1  2017-04-30 Sun  28 days  4 weeks  Qtr: 1  Sem: 1
 2  2017-05-28 Sun  28 days  4 weeks  Qtr: 1  Sem: 1
 3  2017-06-25 Sun  35 days  5 weeks  Qtr: 1  Sem: 1
 4  2017-07-30 Sun  28 days  4 weeks  Qtr: 2  Sem: 1
 5  2017-08-27 Sun  28 days  4 weeks  Qtr: 2  Sem: 1
 6  2017-09-24 Sun  35 days  5 weeks  Qtr: 2  Sem: 1
 7  2017-10-29 Sun  28 days  4 weeks  Qtr: 3  Sem: 2
 8  2017-11-26 Sun  35 days  5 weeks  Qtr: 3  Sem: 2
 9  2017-12-31 Sun  28 days  4 weeks  Qtr: 3  Sem: 2
10  2018-01-28 Sun  28 days  4 weeks  Qtr: 4  Sem: 2
11  2018-02-25 Sun  28 days  4 weeks  Qtr: 4  Sem: 2
12  2018-03-25 Sun  35 days  5 weeks  Qtr: 4  Sem: 2

The update statements (output as rows) are created by simple concatenation of the TableToUpdate, ColumnUpdatePattern and DateKeyColumn values specified. One update is created for each day in the fiscal year.

UPDATE dbo.DimDate SET Fiscal_year=2017, Fiscal_quarter=1, Fiscal_period=1, Fiscal_Week=1, Fiscal_week_overall_nbr=(1161 + 1), Fiscal_period_overall_nbr=(1037 + 1), [quarter_Fiscal_of_year_end_date_key] = 20170729 WHERE date_key = 20170430;

-- 364 update statements generated.

 

A Procedure for generating Data Warehouse Update code

Background

Transactional tables exist in Database A. These tables have partner archive tables in the same or another database. For each insert or change of a value in the transactional database, a timestamped new row is entered into the counterpart history table. This is standard archiving, similar to the temporal tables offered in SQL Server 2016.

The difficulty is that writing the code to populate the archive table (or a Type 2 Slowly Changing Dimension data warehouse table) is complex and error prone – particularly since a MERGE statement cannot achieve this on its own, so a temporary table has to be populated from the MERGE OUTPUT clause, and so on.

Code Generator

The procedure UTIL_GenerateArchiveProcedureDDL will generate the code required to populate an archive table.  The procedure is in the file attached at the end of this article.

It can generate code for a single specified table or all tables in a schema or the database.

The source table(s) must have an explicit primary key.

The archive table(s) must

  • have the same name as the source tables (TODO: add an option to append _Archive to the name)
  • have temporal date columns named StartDate and EndDate (TODO: allow other naming schemes to be used)
  • have the same schema as the source tables, with the addition of two date columns: StartDate (not null) and EndDate (nullable).

The generator and the generated code require this error reporting procedure.

Examples

Usage and description.

Standard Table (single Primary Key)

-- Typical table with a single primary key.
CREATE TABLE dbo.Products(
	ProductId int NOT NULL IDENTITY PRIMARY KEY,
	ProductName varchar(30) NOT NULL,
	AvailableFromDate smalldatetime NOT NULL,
	IntroducerId int NULL);

Run this with results to grid (it may be necessary to set the SSMS option to retain CR/LF on copy or save, so that the line breaks in the generated code survive).

-- Generate Archive procedure for this table only by specifying table name
-- (if there is a similarly named table in another schema, provide @SourceSchema too)
EXEC UTIL_GenerateArchiveProcedureDDL
	@WarehouseDatabaseName = 'Archive',
	@SourceTable = 'Products';

This is the generated DDL.

  • The main action occurs in the MERGE statement. This joins all rows in the LIVE table with all current rows, i.e. rows where the EndDate is Null, in the Warehouse table (selected by CTE TargetCTE).
  • The Live and Warehouse tables are joined on all columns of the Primary Key (this example has only one).
  • If there is a match it means the same record (“same” here means having the same Primary Key value) exists in live and warehouse databases.
    • The procedure then compares the sets of non-PK columns in each table using the EXCEPT set operator. If a row is returned, there is a difference in one of the non-PK columns.
    • The THEN clause UPDATES the EndDate column of the warehouse table to obsolete the row in the warehouse table.
    • Note that at this stage no new row can be inserted into the warehouse to represent the new value; that is handled after the MERGE statement.
  • The remaining two MERGE clauses are simpler
    • Not matched by SOURCE:  the record no longer exists in live so the warehouse row is obsoleted.
    • Not matched by TARGET: the record doesn’t exist in warehouse, so a row is inserted.
  • All the SOURCE columns are written by the OUTPUT clause to a temporary table, together with the MERGE action key.
  • We still need to insert rows for records that have changed value since the warehouse was last updated. The MERGE output will contain all rows from the full join of the source and target tables. These can be filtered to only the records that resulted in an UPDATE operation, but even that is too wide, since it includes records which have been deleted from the source; we don’t want to insert those again. Therefore we filter to insert only rows that exist in the merge source – that is, those where the output Source columns are not null. To test this, the code chooses the primary key (or the first PK column of a compound key), since this cannot be null if the row exists.
  • The Merge and changed-row Insert statements are wrapped in a transaction.
  • Row counts follow.
CREATE PROCEDURE [dbo].[UpdateDataWarehouse]
/*
	** DO NOT EDIT. 
	** Auto-generated by procedure UTIL_GenerateArchiveProcedureDDL ON Jan 22 2017  3:35AM.

	PURPOSE
		This procedure updates an archive warehouse database with the current values in the source database.
		The warehouse database must contain the same tables with the same columns plus temporal (StartDate, 
		EndDate) columns.		

	CODE GENERATOR HISTORY
		18/01/2017 Andrew Ing 1.02.

*/
AS
SET NOCOUNT, XACT_ABORT ON;
BEGIN TRY

DECLARE @EffectiveDate datetime;
DECLARE @ObsoletedRowCount int;
DECLARE @InsertedRowCount int;
DECLARE @UpdatedRowCount int;
DECLARE @TableActionSummary varchar(1000);
DECLARE @DebugInfo varchar(2000);

SET @EffectiveDate = GETDATE();


-------------------------------------------------------------
-- TABLE: [dbo].[Products]

  SET @DebugInfo = 'Create Temp Table [#MergeOutput_Products]';
  CREATE TABLE [#MergeOutput_Products] (
    MergeAction nvarchar(10)
   ,[ProductId] int NULL
   ,[ProductName] varchar(30) NULL
   ,[AvailableFromDate] smalldatetime NULL
   ,[IntroducerId] int NULL
  );

  SET @DebugInfo = 'Merge [dbo].[Products]';
  BEGIN TRANSACTION;
  -- Select only Current (EndDate IS NULL) warehouse rows for the comparison.
  WITH TargetCTE AS
    (SELECT * 
     FROM Archive.[dbo].[Products]
     WHERE EndDate IS NULL
    )
  MERGE  TargetCTE AS [Target]
  USING  [dbo].[Products] AS [Source]
    ON
         [Target].[ProductId] = [Source].[ProductId]
  WHEN MATCHED
    AND EXISTS (
      SELECT
          [Target].[ProductName]
         ,[Target].[AvailableFromDate]
         ,[Target].[IntroducerId]
      EXCEPT
      SELECT
          [Source].[ProductName]
         ,[Source].[AvailableFromDate]
         ,[Source].[IntroducerId]
    )
    THEN  -- Record has changed.
      UPDATE
      SET [EndDate] = @EffectiveDate
  WHEN NOT MATCHED BY SOURCE
    THEN  -- Record deleted from source.
      UPDATE
      SET [EndDate] = @EffectiveDate
  WHEN NOT MATCHED BY TARGET
    THEN  -- New Record.
      INSERT (
        [StartDate]
       ,[ProductId]
       ,[ProductName]
       ,[AvailableFromDate]
       ,[IntroducerId]
       ) VALUES (
        @EffectiveDate
       ,[Source].[ProductId]
       ,[Source].[ProductName]
       ,[Source].[AvailableFromDate]
       ,[Source].[IntroducerId]
       )
  OUTPUT
     $Action AS MergeAction
     ,[Source].[ProductId]
     ,[Source].[ProductName]
     ,[Source].[AvailableFromDate]
     ,[Source].[IntroducerId]
  INTO [#MergeOutput_Products];

  -- Insert new rows for records that have changed (old rows were marked as expired by
  -- the first MATCHED clause. #MergeOutput_ has been populated only with Source columns
  -- but MERGE is a full outer join so exclude rows with a Null primary key column.
  SET @DebugInfo = 'Insert for changed rows [dbo].[Products]';
  INSERT INTO Archive.[dbo].[Products]
    (
      [StartDate]
     ,[ProductId]
     ,[ProductName]
     ,[AvailableFromDate]
     ,[IntroducerId]
    )
    SELECT
      @EffectiveDate
     ,[ProductId]
     ,[ProductName]
     ,[AvailableFromDate]
     ,[IntroducerId]
    FROM [#MergeOutput_Products]
    WHERE
      MergeAction = 'UPDATE'
      AND [ProductId] IS NOT NULL;
    SELECT @UpdatedRowCount = @@ROWCOUNT;

  COMMIT TRANSACTION

  SET @DebugInfo = 'Get Row Counts [dbo].[Products]';
  SELECT
    @ObsoletedRowCount = SUM(CASE WHEN MergeAction = 'UPDATE' THEN 1 ELSE 0 END),
    @InsertedRowCount = SUM(CASE WHEN MergeAction = 'INSERT' THEN 1 ELSE 0 END)
  FROM
    [#MergeOutput_Products];

  SET @TableActionSummary = CONVERT(varchar(19), GETDATE(), 121) + ' [dbo].[Products]
			 ' + CAST(ISNULL(@ObsoletedRowCount, 0) AS varchar(10)) + ' Rows Obsoleted, ' + 
			 CAST(ISNULL(@InsertedRowCount, 0) AS varchar(10)) + ' New Rows Inserted, ' +
			 CAST(@UpdatedRowCount AS varchar(10)) + ' Update Rows Inserted.';
  PRINT @TableActionSummary;
  DROP TABLE [#MergeOutput_Products];

-- END OF TABLE [dbo].[Products]


END TRY
BEGIN CATCH
  EXECUTE dbo.ERROR_HandleCaughtError @DebugInfo;
  RETURN 5555;
END CATCH
/*
  END OF GENERATED SCRIPT
*/
GO

Many to Many tables

Or any table with just primary key columns.

CREATE TABLE dbo.StudentCourses(
	StudentId int NOT NULL,
	CourseId int NOT NULL,
	CONSTRAINT PK_StudentCourses PRIMARY KEY (StudentId, CourseId)
	);

If the table only has a primary key then it is not possible to have changed values in the same record. The record either exists in both live and warehouse tables (in which case no action is needed) or it exists in only one, in which case it must be inserted or obsoleted in the warehouse accordingly.

If no non-primary-key columns are found, the WHEN MATCHED AND EXISTS… clause is omitted from the generated code. Note in the extract below that both primary key columns are used in the MERGE join, and that the Matched And Exists clause is not generated.

...
  WITH TargetCTE AS
    (SELECT * 
     FROM Archive.[dbo].[StudentCourses]
     WHERE EndDate IS NULL
    )
  MERGE  TargetCTE AS [Target]
  USING  [dbo].[StudentCourses] AS [Source]
    ON
         [Target].[StudentId] = [Source].[StudentId]
     AND [Target].[CourseId] = [Source].[CourseId]
  WHEN NOT MATCHED BY SOURCE
    THEN  -- Record deleted from source.
      UPDATE
      SET [EndDate] = @EffectiveDate
  WHEN NOT MATCHED BY TARGET
    THEN  -- New Record.
      INSERT (
...

Generating Updates for all Tables

Omitting the SourceTable and SourceSchema parameters will generate DDL for a procedure that includes all tables in the database. This can be restricted by providing SourceSchema (which includes all tables in the named schema) or SourceTable, as in the sketch below.
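
For example, a sketch using the parameter names from the single-table call above:

-- Generate archive procedure DDL for every table in the dbo schema.
EXEC UTIL_GenerateArchiveProcedureDDL
	@WarehouseDatabaseName = 'Archive',
	@SourceSchema = 'dbo';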

 

UTIL_GenerateArchiveProcedure.sql (3)

Duration Dimension

For measuring short durations, such as telephone calls or handling times.

Q1 Linear Grain?

The question I haven’t answered is whether the dimension should be at the finest grain for all durations. The grain I am using is seconds, but beyond 30 minutes or so that is probably finer than needed – and it means 86,400 rows per day. An alternative would be using a log(ish) scale for DimDurationKey: seconds up to 1 hour; minutes up to 24 hours; hours up to, say, 7 days; then days… weeks… months. This would allow a single Duration Dimension to cover a wide range of periods without becoming huge. DimDurationKey would have to be calculated during load, as sketched below.
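
One possible encoding – purely hypothetical: second-grain keys first, then contiguous minute, hour and day ranges – could be calculated at load time like this:

-- Map a raw duration in seconds to a log(ish)-scale DimDurationKey.
DECLARE @DurationSeconds int = 5400;  -- 1.5 hours, for illustration
SELECT CASE
         WHEN @DurationSeconds < 3600      THEN @DurationSeconds                         -- one row per second up to 1 hour
         WHEN @DurationSeconds < 86400     THEN 3600 + (@DurationSeconds / 60)           -- one row per minute up to 24 hours
         WHEN @DurationSeconds < 7 * 86400 THEN 3600 + 1440 + (@DurationSeconds / 3600)  -- one row per hour up to 7 days
         ELSE 3600 + 1440 + 168 + (@DurationSeconds / 86400)                             -- one row per day beyond that
       END AS DimDurationKey;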

Q2 Range Titles

What is a clear way of creating range titles: 0-1 minutes, 1-5 minutes, etc.? Strictly these are 0-59 secs, 1 min – 1:59, 2 min – 4:59, although users probably think of them as 1 min, 2 min, 5 min.

  • 0:00 – 0:59
  • 1:00 – 1:59
  • 2:00 – 4:59
  • 5:00 – 59:59

Or

  • 0 – 59 secs
  • 1 min – 1:59
  • 2 min – 4:59
  • 5 min – 59:59

Neither of those is clear or elegant. The second emphasises the start of each range, whereas the useful information is that the value falls in the “5 minute” band; and the most salient value in “2 min – 4:59” is “2 min”.

Based on this I looked at rounding the end of each range.

  • 0 – 60 secs
  • 1:01- 2 min
  • 2:01 – 5 min
  • 5:01 – 1 hour

That looks better, but it gets complicated once you move beyond 1 hour. How to express the next segment up to, say, 1.5 hours:

  • 1:00:01 to 1 hour 30 mins
  • 1:00:01 to 1:30:00
  • 1:00:01 to 1 and a half hours

All are ugly mixes of units.

I think for short durations I am resigned to accepting the one-second inaccuracy of “0-1 minute”, “2-5 minutes” for the sake of clarity and fitting in with the way that people actually look at these figures. So long as the second in question is not double counted, I don’t think it matters that its location is indeterminate. (In contrast, this wouldn’t be acceptable for age ranges (0-65, 65-70).)

In doing this I revert to rounding the start of each range. That then accords with the MinuteOfHour value, which looks more logical when viewing the table as a whole.

So that leaves

  • 0 – 59 secs
  • 1 – 2 mins   [60 – 119 secs]
  • 2 – 5 mins
  • 45 – 60 mins
  • 60 – 90 mins
  • 90 mins – 2 hours

Creation Script

IF ( (SELECT COUNT(*) AS C FROM dbo.DimDuration) = 0 ) 
BEGIN
	print 'Seed DimDuration...';

		INSERT DimDuration
		(
			[Seconds],
			[Minutes],
			Hours,
			MinuteOfHour,
			DurationAnalysis1,
			DurationAnalysis1_MaxSeconds
         
		)
		VALUES
		(
			0, 0, 0, 0, '0 secs', 0
		);


	WITH
	  Pass0 as (select 1 as C union all select 1), --2 rows
	  Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
	  Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
	  Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
	  Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
	  Tally as (select row_number() over(order by C) as Number from Pass4)

	INSERT DimDuration
		(
			[Seconds],
			[Minutes],
			Hours,
			MinuteOfHour,
			DurationAnalysis1,
			DurationAnalysis1_MaxSeconds
         
		)

		SELECT TOP(15000) 
			Number,
			Number / 60,
			Number / (60 * 60),
			(Number / 60) % 60,
			CASE 
				WHEN (Number)       < 60	    THEN '1 - 59 secs'
				WHEN (Number)       < 120	    THEN '1 - 2 mins'
				WHEN (Number / 60)  < 3		    THEN '2 - 3 mins'
				WHEN (Number / 60)  < 4		    THEN '3 - 4 mins'
				WHEN (Number / 60)  < 5		    THEN '4 - 5 mins'
				WHEN (Number / 60)  < 6		    THEN '5 - 6 mins'
				WHEN (Number / 60)  < 7		    THEN '6 - 7 mins'
				WHEN (Number / 60)  < 8		    THEN '7 - 8 mins'
				WHEN (Number / 60)  < 9		    THEN '8 - 9 mins'
				WHEN (Number / 60)  < 10	    THEN '9 - 10 mins'
				WHEN (Number / 60)  < 15	    THEN '10 - 15 mins'
				WHEN (Number / 60)  < 20	    THEN '15 - 20 mins'
				WHEN (Number / 60)  < 30	    THEN '20 - 30 mins'
				WHEN (Number / 60)  < 40	    THEN '30 - 40 mins'
				WHEN (Number / 60)  < 50	    THEN '40 - 50 mins'
				WHEN (Number / 60)  < 60	    THEN '50 - 60 mins'
				WHEN (Number / 60)  < 90	    THEN '60 - 90 mins'
				WHEN (Number / 60)  < 120	    THEN '90 mins - 2 hours'
				WHEN (Number / (60 * 60)) < 3	THEN '2 - 3 hours'
				WHEN (Number / (60 * 60)) < 4	THEN '3 - 4 hours'
				WHEN (Number / (60 * 60)) < 10	THEN '4 - 10 hours'
				WHEN (Number / (60 * 60)) < 15	THEN '10 - 15 hours'
				WHEN (Number / (60 * 60)) < 24	THEN '15 - 24 hours'
				ELSE  'over 24 hours'
			END
			,0
			FROM Tally;

	-- Populate a column that can be used as a sort-key for the ranges.

		WITH CTE AS
		(
			SELECT DurationAnalysis1,
				DurationAnalysis1_MaxSeconds  = MAX(Seconds)
			FROM dbo.DimDuration
			GROUP BY DurationAnalysis1
		)
	
		UPDATE DD 
		SET
			DurationAnalysis1_MaxSeconds = CTE.DurationAnalysis1_MaxSeconds
		FROM
			DimDuration DD 
			INNER JOIN CTE ON DD.DurationAnalysis1 = CTE.DurationAnalysis1;

	print 'Seed DimDuration complete.';
END
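
A sketch of how the dimension might then be used, assuming a hypothetical FactCalls fact table with a DurationSeconds column joined at the seconds grain:

SELECT D.DurationAnalysis1, COUNT(*) AS Calls
FROM dbo.FactCalls F
	INNER JOIN dbo.DimDuration D ON D.Seconds = F.DurationSeconds
GROUP BY D.DurationAnalysis1, D.DurationAnalysis1_MaxSeconds
ORDER BY D.DurationAnalysis1_MaxSeconds;  -- the sort-key column keeps the bands in duration order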

 

Data Obfuscation

Some functions for obfuscating data to protect personally identifiable information. Aims:

  1. fast enough to process a large table
  2. resulting data is “sufficiently” obfuscated
  3. resulting data retains enough characteristics that it makes some sense in context.
  4. transformations are repeatable so that the same values in different datasets continue to match after transformation.

So this is some form of one-way hash, but not one that results in an entirely meaningless string of alphanumerics.

Telephone Number Obfuscation

The resulting number will have the same shape – including spaces and other punctuation – and retains the first character, so UK numbers are still recognisable.

Method (first draft): add the first digit to the last digit, modulo 10; then add the second digit to the second-last digit, modulo 10; and so on. The first digit and non-numeric characters pass through unchanged.

I should probably create a pair of functions, for unicode and non-unicode source columns.

/*
	PURPOSE:
		Efficiently obfuscate a telephone number in a repeatable manner.
		There should be collisions in the output so it cannot be reliably
		reverse engineered.
		Preserve the first digit of the number so it resembles a UK
		telephone number.
		Non-numeric characters reproduced verbatim.

	HISTORY

	30/07/2015 AI created.
*/
CREATE FUNCTION dbo.fn_ObfuscateTelNum
(
	@input nvarchar(100)
)
	RETURNS nvarchar(100)
AS
BEGIN
	declare @i int = 1
	declare @CurrentNum int = 0
	declare @SecondNum int = 0
	declare @PrevNum int = 0
	declare @output nvarchar(100) = ''

	IF (@input IS NULL) RETURN NULL
	WHILE (@i <= LEN(@input))
	BEGIN
		IF ( SUBSTRING(@input, @i, 1)  LIKE '[0-9]')
		BEGIN
			SET @CurrentNum = CAST(SUBSTRING(@input, @i, 1) AS int)  + @PrevNum
			IF (@i > 1 AND (SUBSTRING(@input, LEN(@input) - @i, 1) LIKE '[0-9]'))
				SET @SecondNum = CAST(SUBSTRING(@input, LEN(@input) - @i, 1) AS int)
				
			SET @output += CAST(((@CurrentNum + @SecondNum)  % 10) AS nvarchar(1))
			SET @PrevNum = @CurrentNum
		END
		ELSE
		BEGIN
			SET @output += SUBSTRING(@input, @i, 1)
		END
		SET @i = @i + 1
	END
	RETURN @output
END
GO

-- Two similar numbers produce quite different results:
select dbo.fn_ObfuscateTelNum('02072342357')
-- 05434650585
select dbo.fn_ObfuscateTelNum('02072345327')
-- 05734653885
-- but punctuation of number affects output:
select dbo.fn_ObfuscateTelNum('0207 2345327')
-- 0573 3653885
select dbo.fn_ObfuscateTelNum(null)
-- NULL
select dbo.fn_ObfuscateTelNum('+44 207 3248234')
-- +66 230 0488037

 

Literal Values in Scripts (VALUES() )

Options for including literal values in scripts – syntax that I find particularly hard to remember.

The feature is named the Table Value Constructor. It was introduced in SQL Server 2008 – see this article.

Source of Merge statement

MERGE INTO dbo.TargetTable AS Target
USING 
    (VALUES
	(1, 'One'),
	(2, 'Two')
	) AS Source (MyIntCol, MyTextCol)

ON Target ...

Content of CTE

There doesn’t seem to be a more compact method than this:

WITH CTE AS (
 SELECT * FROM (VALUES
    ('ID', '1' ,'int')
   ,('DatabaseName', '0' ,'sysname')
   ,('SchemaName', '0' ,'sysname')
    ) DT (ColumnName, PrimaryKeyFlag, CondensedDataType) )
SELECT * FROM CTE
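
A table value constructor can also stand in for a small lookup table when joined as a derived table – a sketch, with the Orders table and its columns purely illustrative:

SELECT O.OrderId, S.StatusName
FROM dbo.Orders O
	INNER JOIN (VALUES
		(1, 'Open'),
		(2, 'Shipped'),
		(3, 'Cancelled')
	) AS S (StatusCode, StatusName)
	ON S.StatusCode = O.StatusCode;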

 

Generate Random Test Data

Steps to follow to create a randomised dataset for testing.

An online data generator:  www.generatedata.com.

Prepare the Database

We will be loading a lot of data into the tables, so:

  • Set Instant File Initialisation
  • Set the data file size sufficient to contain the generated data set.
  • Set the BULK_LOGGED recovery model.
  • Set an appropriate log file size. It need not be too large if inserting into new tables under the BULK_LOGGED recovery model; otherwise it may need to be the size of the largest single insert.

Create a Numbers Table

This is a standard numbers/tally table populated with integers 1 – n, with the addition of a random number column. The random number can be used to select other values for the data set. See below for the rationale on choosing the random number range.

CREATE TABLE Numbers
	(Nbr int NOT NULL,
	RandomNbr int NOT NULL)

-- Load Numbers Table
DECLARE @TotalRows bigint = 2000000
DECLARE @RandomNumberRange int = 4000
;WITH
  Pass0 as (select 1 as C union all select 1), --2 rows
  Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
  Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
  Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
  Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
  Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows 
  Tally as (select row_number() over(order by C) as Number from Pass5)
INSERT Numbers
        (Nbr
		,RandomNbr)
    SELECT TOP(@TotalRows) Number, 
		(ABS(CHECKSUM(NewId())) % @RandomNumberRange) + 1
        FROM Tally

-- Add clustered index
ALTER TABLE Numbers ADD CONSTRAINT PK_Numbers PRIMARY KEY CLUSTERED (Nbr)

The cascaded cross-join CTE number generator is taken from here, inter alia.

Create Random Scalar Values

Assemble a SELECT statement to suit the table you are populating from the following snippets:

INSERT INTO
  -- YourTable
SELECT
  N.Nbr,   -- incrementing primary key
  NewID(), -- random uniqueidentifier
  ABS(CHECKSUM(NewId())) % 100,  -- random integer between 0 and 99.
     -- Dates of birth up to 75 yoa
  DATEADD(dd, - ABS(CHECKSUM(NewId())) % (365 * 75), CAST(GETDATE() AS date)), 
     -- Value in only 10 in 4000 rows (see RandomNumberRange above)
  CASE WHEN N.RandomNbr <= 10 THEN 1 ELSE 0 END
FROM Numbers N
ORDER BY N.Nbr

Select Values from a Seed File

It helps to have plausible names, addresses and contact details.  Begin by populating another staging table with values – e.g. a set of names gathered from some source.

Note how many rows are in the source table: use this number for the upper bound of Numbers.RandomNbr (see above).  The RandomNbr column is used to join to the source table.

INSERT INTO
  -- YourTable
SELECT
  ...
  PN.FirstName,
  PN.LastName,
  ..
FROM Numbers N
  INNER JOIN PersonalNames PN ON PN.PersonalNameId = N.RandomNbr
ORDER BY N.Nbr

 

How Random?

Using CHECKSUM(NewId()) % n will not produce mathematically random values; it depends on the modulo. In fact, for testing it helps not to have a uniform distribution, unless that is the characteristic of the data being modelled. If modelling a set of customers or sales agents, it is natural that some will be involved in more sales than others. Creating an unbalanced distribution involves applying some function to the random number generator to control its distribution. I have used trigonometric functions to weight a random distribution towards one end of the scale.

E.g. the following weights the distribution towards higher numbers. The distribution is not very convincing and could be improved.

CAST(ABS(SIN(CHECKSUM(NewId()) % 5000) * 1000) AS INT) + 1
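
A quick way to eyeball the resulting skew, reusing the Numbers table above (a sketch):

-- Bucket the weighted values and count each bucket to inspect the distribution.
SELECT CAST(ABS(SIN(CHECKSUM(NewId()) % 5000) * 1000) AS INT) + 1 AS WeightedNbr
INTO #Sample
FROM Numbers;

SELECT
	(WeightedNbr - 1) / 100 AS Bucket,  -- buckets of width 100
	COUNT(*) AS Occurrences
FROM #Sample
GROUP BY (WeightedNbr - 1) / 100
ORDER BY Bucket;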


Random Number Recalculation Bug

The problem in outline

A simple expression can assign a random order or row number to a set of records.  The following example assigns a random decimal value between 0 and 1 to each row in a set:

CAST(RANK() OVER (ORDER BY NEWID()) * 1.0 / @TotalRows  AS decimal(5,3)) AS RandomPercentile

This works well until the randomised set is joined with another table. Depending on the query plan that SQL Server chooses, it is quite possible for the randomised set to be evaluated in several passes. Were RANK() ordered by another value in the same table, the value would not change; but a random order directive means the order will be different on each evaluation. The output will be a subset of the desired rows, with some rows appearing more than once and with a different RANK (or, in the above example, a different RandomPercentile) each time.

Solution(s)

1. Persist the randomised value to a temporary table before joining it – a sketch follows below.

2. Explore the tempting alternatives in Solution to Catch That Bug! Random Row per Group in SQL Server Pro (putting the random generation in a CROSS APPLY derived table).
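
A sketch of the first solution – table and column names are illustrative:

DECLARE @TotalRows int = (SELECT COUNT(*) FROM dbo.SourceTable);

-- Materialise the random value once...
SELECT Id,
	CAST(RANK() OVER (ORDER BY NEWID()) * 1.0 / @TotalRows AS decimal(5,3)) AS RandomPercentile
INTO #Randomised
FROM dbo.SourceTable;

-- ...so each row keeps its value however many passes the plan makes.
SELECT R.RandomPercentile, T.*
FROM #Randomised R
	INNER JOIN dbo.OtherTable T ON T.SourceId = R.Id;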

Preferring NULL as “no value” Indicator

Some practical reasons why NULL should be used as a no-value indicator in a column.

These examples arise from a situation where a column will eventually contain a unique value. Until the value is known, the column holds a “no value” indicator. I would use NULL. Others may use:

  • ZLS (a zero-length string)
  • 0
  • 1900-01-01
  • 00000000-0000-0000-0000-000000000000

Indexing

We must use a filtered index, excluding the “no value” indicator, which may occur multiple times. I feel that an index filtered on WHERE col IS NOT NULL will still be used for equi-joins, whereas an index filtered on WHERE col <> '00000000-0000-0000-0000-000000000000' may not.

  • sp_executesql '… WHERE col = @x', N'@x uniqueidentifier', '9B534305-16B2-4DE5-B063-A83B90B87C62' does not use the index filtered on WHERE col <> '00000000-0000-0000-0000-000000000000'.
  • sp_executesql '… WHERE col = @x AND col <> ''000…000''', N'@x uniqueidentifier', '9B534305-16B2-4DE5-B063-A83B90B87C62' does use the index.

My contention, to be confirmed, is that an index filtered on “IS NOT NULL” would be used because no equality match could be made against the excluded values (i.e. NULL).
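
For reference, the two filtered indexes being compared would look something like this (a sketch – table, column and index names are illustrative, and the two statements are alternatives, not a pair to create together):

-- NULL as the no-value indicator.
CREATE UNIQUE NONCLUSTERED INDEX UX_Orders_ExternalRef_NotNull
	ON dbo.Orders (ExternalRef)
	WHERE ExternalRef IS NOT NULL;

-- Zero-GUID as the no-value indicator.
CREATE UNIQUE NONCLUSTERED INDEX UX_Orders_ExternalRef_NotZero
	ON dbo.Orders (ExternalRef)
	WHERE ExternalRef <> '00000000-0000-0000-0000-000000000000';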

Prepared Queries

However, the real problem here is that the prepared query plan rules out the filtered index. Notice that an ad-hoc query can use the filtered index.

  • SELECT … FROM table WHERE col = '9B534305-16B2-4DE5-B063-A83B90B87C62' does use the index filtered on WHERE col <> '00000000-0000-0000-0000-000000000000'.

This is because the ad-hoc query is planned against the literal value in the WHERE clause. That is not a value excluded by the filter on the index, so the index can be used.

The prepared query plan generated by sp_executesql will be optimised for col = @x – a non-specific value. Because @x could be the “no-value” '00000000-0000-0000-0000-000000000000', the optimiser has to rule out the filtered index.

My assertion is that a filtered index excluding only NULL could still be used in the analogous situation (at least with ANSI_NULLS on). Is this true, though?

Statistics

If the column is otherwise unique then the statistics should reflect that distribution of values, so a search on a single value should always indicate an index seek. But the existence of a possibly large number of 00000000-0000-0000-0000-000000000000 values means that for at least one value a table scan might be better – even though that is not a search the application would make. This adds a query plan option (the scan) that ideally would never be considered.

Unwanted Joins

Since these “no value” values are intended to represent a lack of information there is no wish (in the designer’s mind) for them to join to any other value.  However, since they are valid values, they will join with undesired results.

Some Workarounds

One solution has been indicated above – add a WHERE clause that matches the filter on the index (in the literal text of the query if using sp_executesql).

However, that is not an (easy) option for generated code (e.g. Entity Framework). In the real case behind this example, creating a non-unique index on this column did enable all queries to use an index lookup – with a considerable improvement in performance, since the table was large. Unfortunately this means the table has two indexes on the same column: one to enforce uniqueness (where the value is not 00000000-0000-0000-0000-000000000000) and the other for use in lookups.

The correct solution is to refactor this column to replace 00000000-0000-0000-0000-000000000000 with NULL for “no-value” entries.
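
A minimal sketch of that refactor, with illustrative names; the filtered unique index from the Indexing section above would then enforce uniqueness over the real values:

UPDATE dbo.Orders
SET ExternalRef = NULL
WHERE ExternalRef = '00000000-0000-0000-0000-000000000000';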

Does NEWSEQUENTIALID() avoid conflicts?

The Question

The IDs generated by NEWSEQUENTIALID() can be guessable. What would happen if the next sequential ID was guessed and inserted into a column, and NEWSEQUENTIALID() was then used for the next row? Would there be a clash of values?

The Answer

NEWSEQUENTIALID() will skip to another (higher) range to avoid a GUID clash.

It seems to do this whether or not there is a unique index on the uniqueidentifier column.

The Test

CREATE DATABASE TestSeqIDs
GO
USE TestSeqIDs

CREATE TABLE TestTable
( SequentialId uniqueidentifier PRIMARY KEY DEFAULT NewSequentialID()
 ,Comment varchar(100)
)
GO
DECLARE @i int = 0
WHILE (@i < 4)
BEGIN	
	INSERT INTO TestTable (Comment) VALUES ('NEWSEQUENTIALID() in While loop (1)')
	SET @i += 1
END

INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement (1)')
INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement (1)')

SELECT * FROM TestTable

/*
SequentialId	Comment
6D2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
6E2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
6F2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
702B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
712B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (1)
722B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (1)
*/

-- Manually insert the next ID in the sequence - i.e. intended to clash with next sequential ID

INSERT INTO TestTable  (SequentialId, Comment) 
	VALUES ('732B9ED6-F14A-E411-ABEF-C81F662CE547', 'Manually entered UID') -- Edit to suit!


GO
-- Add some more sequential generated IDs

DECLARE @i int = 0
WHILE (@i < 4)
BEGIN	
	INSERT INTO TestTable (Comment) VALUES ('NEWSEQUENTIALID() in While loop (2)')
	SET @i += 1
END
INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement (2)')
INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement (2)')
GO
INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement new batch (post-GO)')
INSERT INTO TestTable  (Comment) VALUES ('NEWSEQUENTIALID() in single statement new batch (post-GO)')

-- what happens?

SELECT * FROM TestTable
/*
SequentialId	Comment
6D2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
6E2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
6F2B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
702B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (1)
712B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (1)
722B9ED6-F14A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (1)
732B9ED6-F14A-E411-ABEF-C81F662CE547	Manually entered UID
854BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (2)
864BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (2)
874BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (2)
884BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in While loop (2)
894BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (2)
8A4BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement (2)
8B4BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement new batch (post-GO)
8C4BBD18-F24A-E411-ABEF-C81F662CE547	NEWSEQUENTIALID() in single statement new batch (post-GO)
*/


-- There was no clash. NewSequentialId moved on to another higher range.



-- Drop database
USE master
ALTER DATABASE TestSeqIDs SET RESTRICTED_USER WITH ROLLBACK IMMEDIATE
DROP DATABASE TestSeqIDs

Reference

From Books Online, http://msdn.microsoft.com/en-gb/library/ms189786.aspx. The definitions don’t exactly describe this behaviour.

NEWSEQUENTIALID() … Creates a GUID that is greater than any GUID previously generated by this function on a specified computer since Windows was started. After restarting Windows, the GUID can start again from a lower range, but is still globally unique.

Each GUID generated by using NEWSEQUENTIALID() is unique on that computer. GUIDs generated by using NEWSEQUENTIALID() are unique across multiple computers only if the source computer has a network card.