Cleanse Date Fields

Resources

Here’s the script I’ve used as a stop-gap to cleanse the dates and hold them in a table. As I mentioned, this does the job for now but will not be good enough for production. We’ll revisit this when we automate the transform processes.


CREATE TABLE	superstore.staging.cleandates(
rowid INT PRIMARY KEY
,orderdate VARCHAR(20)
,shipdate VARCHAR(20)
,cleanorderdate DATE
,cleanshipdate DATE
);

/* Cleaning dates 
		Identifying text dates - CONTAINS(orderdate,'/') in most flavours of SQL; POSITION('/' IN o.orderdate) >0 in PostgreSQL
		Using string functions to 'rearrange' the date elements into format yyyy-mm-dd, and CAST as DATE (this is done in the CTEs here)
		With most SQL flavours, you could use RIGHT, LEFT, CONTAINS and MID
		WITH PostgreSQL I've used LEFT, RIGHT, POSITION and SUBSTRING
		With the remaining Excel dates - adding the Excel date number to '1899-12-31' - DATEADD(d,[excelnumber],'1899-12-31') in most SQL flavours
		'1899-12-31'::date +  (o.shipdate || ' days')::interval AS DATE in PostgreSQL

*/

INSERT INTO		superstore.staging.cleandates

WITH ordermonth AS		(
						SELECT 	rowid
								,CASE 
									WHEN SUBSTRING(orderdate FROM 2 FOR 1) = '/' 
									THEN CONCAT('0',LEFT(orderdate,1)) 
									ELSE LEFT(orderdate,2) 
								END AS ordermonth
						FROM	superstore.staging.orderitems	
						)
						
	,shipmonth 	AS		(
						SELECT 	rowid
								,CASE 
									WHEN SUBSTRING(shipdate FROM 2 FOR 1) = '/' 
									THEN CONCAT('0',LEFT(shipdate,1)) 
									ELSE LEFT(shipdate,2) 
								END AS shipmonth
						FROM	superstore.staging.orderitems
						)
						
	,orderday	AS		(
						SELECT 	rowid
								,CASE
									WHEN	POSITION('/' IN RIGHT(orderdate,7)) = 3
									THEN	LEFT(RIGHT(orderdate,7),2)
									WHEN	POSITION('/' IN RIGHT(orderdate,7)) = 2
									THEN	CONCAT('0',LEFT(RIGHT(orderdate,6),1))
								END AS orderday
						FROM	superstore.staging.orderitems
						)
						
	,shipday	AS		(
						SELECT 	rowid
								,CASE
									WHEN	POSITION('/' IN RIGHT(shipdate,7)) = 3
									THEN	LEFT(RIGHT(shipdate,7),2)
									WHEN	POSITION('/' IN RIGHT(shipdate,7)) = 2
									THEN	CONCAT('0',LEFT(RIGHT(shipdate,6),1))
								END AS shipday
						FROM	superstore.staging.orderitems
						)
	,exceldates	AS		(	
						SELECT 	rowid
								,CAST(orderdate AS INT)
						FROM	superstore.staging.orderitems
						WHERE	POSITION('/' IN orderdate) = 0
						)

						
SELECT 		o.rowid
			,o.orderdate
			,o.shipdate
			,CASE
				WHEN	POSITION('/' IN o.orderdate) >0
				THEN	CAST(CONCAT(RIGHT(o.orderdate,4),'-',om.ordermonth,'-',od.orderday) AS DATE)
				ELSE	CAST('1899-12-30'::date +  (o.orderdate || ' days')::interval AS DATE)
			END AS cleanorderdate
			,CASE
				WHEN	POSITION('/' IN o.shipdate) >0
				THEN	CAST(CONCAT(RIGHT(o.shipdate,4),'-',sm.shipmonth,'-',sd.shipday) AS DATE)
				ELSE	CAST('1899-12-30'::date +  (o.shipdate || ' days')::interval AS DATE)
			END AS cleanshipdate
FROM		superstore.staging.orderitems AS o

LEFT JOIN	ordermonth AS om 
ON			om.rowid = o.rowid
LEFT JOIN	shipmonth AS sm 
ON			sm.rowid = o.rowid
LEFT JOIN	orderday AS od 
ON			od.rowid = o.rowid
LEFT JOIN	shipday AS sd
ON			sd.rowid = o.rowid
LEFT JOIN	exceldates AS ed
ON			ed.rowid = o.rowid
;


SELECT * 
FROM	superstore.staging.cleandates
ORDER BY	rowid;