Preprocessing Step

After creating the relations R1, R2, R1Tokens, R2Tokens you can run the preprocessing step, to create the auxiliary relations.

Download the SQL script preprocessing.sql, or use the code below.

You can run the script as isql -U <user> -P <passwd> -d <database name> -i preprocessing.sql

-- Create two dummy relations with the size of R1, R2

-- (used to create the RiIDF tables)

CREATE TABLE [dbo].[R1Size] (

[s] int PRIMARY KEY

)



CREATE TABLE [dbo].[R2Size] (

[s] int PRIMARY KEY

)



INSERT INTO [dbo].[R1Size]

SELECT COUNT(*)

FROM R1



INSERT INTO [dbo].[R2Size]

SELECT COUNT(*)

FROM R2

GO





-- Create the IDF tables

CREATE TABLE [dbo].[R1IDF] (

[token] varchar (80) NOT NULL,

[idf] float NOT NULL,

PRIMARY KEY (token)

)



CREATE TABLE [dbo].[R2IDF] (

[token] varchar (80) NOT NULL,

[idf] float NOT NULL,

PRIMARY KEY (token)

)





INSERT INTO [dbo].[R1IDF]

SELECT T.token, LOG(S.s)-LOG(COUNT(DISTINCT T.tid))

FROM R1Tokens T, R1Size S

GROUP BY T.token, S.s



INSERT INTO [dbo].[R2IDF]

SELECT T.token, LOG(S.s)-LOG(COUNT(DISTINCT T.tid))

FROM R2Tokens T, R2Size S

GROUP BY T.token, S.s

GO







-- Create the TF tables

CREATE TABLE [dbo].[R1TF] (

[tid] int NOT NULL,

[token] varchar (80) NOT NULL,

[tf] int NOT NULL,

PRIMARY KEY (tid,token),

FOREIGN KEY (tid) REFERENCES R1,

FOREIGN KEY (token) REFERENCES R1IDF

)



CREATE TABLE [dbo].[R2TF] (

[tid] int NOT NULL,

[token] varchar (80) NOT NULL,

[tf] int NOT NULL,

PRIMARY KEY (tid,token),

FOREIGN KEY (tid) REFERENCES R2,

FOREIGN KEY (token) REFERENCES R2IDF

)





INSERT INTO [dbo].[R1TF]

SELECT T.tid, T.token, COUNT(*)

FROM R1Tokens T

GROUP BY T.tid, T.token



INSERT INTO [dbo].[R2TF]

SELECT T.tid, T.token, COUNT(*)

FROM R2Tokens T

GROUP BY T.tid, T.token

GO









-- Create the Length tables

CREATE TABLE [dbo].[R1Length] (

[tid] int NOT NULL,

[length] float NOT NULL,

PRIMARY KEY (tid),

FOREIGN KEY (tid) REFERENCES R1

)



CREATE TABLE [dbo].[R2Length] (

[tid] int NOT NULL,

[length] float NOT NULL,

PRIMARY KEY (tid),

FOREIGN KEY (tid) REFERENCES R2

)





INSERT INTO [dbo].[R1Length]

SELECT T.tid, SQRT(SUM(I.idf*I.idf*T.tf*T.tf))

FROM R1IDF I, R1TF T

WHERE I.token = T.token

GROUP BY T.tid



INSERT INTO [dbo].[R2Length]

SELECT T.tid, SQRT(SUM(I.idf*I.idf*T.tf*T.tf))

FROM R2IDF I, R2TF T

WHERE I.token = T.token

GROUP BY T.tid

GO









-- Create the Weights tables

CREATE TABLE [dbo].[R1Weights] (

[tid] int NOT NULL,

[token] varchar (80) NOT NULL,

[weight] float NOT NULL,

PRIMARY KEY (tid,token),

FOREIGN KEY (tid) REFERENCES R1,

FOREIGN KEY (token) REFERENCES R1IDF

)



CREATE TABLE [dbo].[R2Weights] (

[tid] int NOT NULL,

[token] varchar (80) NOT NULL,

[weight] float NOT NULL,

PRIMARY KEY (tid,token),

FOREIGN KEY (tid) REFERENCES R2,

FOREIGN KEY (token) REFERENCES R2IDF

)





INSERT INTO [dbo].[R1Weights]

SELECT T.tid, T.token, I.idf*T.tf/L.length

FROM R1IDF I, R1TF T, R1Length L

WHERE I.token = T.token AND T.tid = L.tid



INSERT INTO [dbo].[R2Weights]

SELECT T.tid, T.token, I.idf*T.tf/L.length

FROM R2IDF I, R2TF T, R2Length L

WHERE I.token = T.token AND T.tid = L.tid

GO







-- Create the Sum tables

CREATE TABLE [dbo].[R1Sum] (

[token] varchar (80) NOT NULL,

[total] float NOT NULL,

PRIMARY KEY (token),

FOREIGN KEY (token) REFERENCES R1IDF

)



CREATE TABLE [dbo].[R2Sum] (

[token] varchar (80) NOT NULL,

[total] float NOT NULL,

PRIMARY KEY (token),

FOREIGN KEY (token) REFERENCES R2IDF

)





INSERT INTO [dbo].[R1Sum]

SELECT R.token, SUM(R.weight)

FROM R1Weights R

GROUP BY R.token



INSERT INTO [dbo].[R2Sum]

SELECT R.token, SUM(R.weight)

FROM R2Weights R

GROUP BY R.token

GO









-- Create the RV tables (NOTE: obsolete)



CREATE TABLE [dbo].[R1V] (

[tid] int NOT NULL,

[Tv] float NOT NULL,

PRIMARY KEY (tid),

FOREIGN KEY (tid) REFERENCES R1

)



CREATE TABLE [dbo].[R2V] (

[tid] int NOT NULL,

[Tv] float NOT NULL,

PRIMARY KEY (tid),

FOREIGN KEY (tid) REFERENCES R2

)





INSERT INTO R1V(tid, Tv)

SELECT R.tid, SUM(R.weight*S.total)

FROM R1Weights R, R2Sum S

WHERE R.token = S.token

GROUP BY R.tid



INSERT INTO R2V(tid, Tv)

SELECT R.tid, SUM(R.weight*S.total)

FROM R2Weights R, R1Sum S

WHERE R.token = S.token

GROUP BY R.tid

GO