--##################################################### --##################################################### --##################################################### -- We will use the results from this table to calculate -- precision and recall inside the database. -- -- This version calculates the Baseline table using -- "block nested loops"-style evaluation. The standard -- query uses a lot of disk space and crashed for the -- qgram versions. (more than 20Gb of free disk space -- used and then it crashed) -- -- Then for precision we divide with the size of the join -- and for recall with the size of the baseline --##################################################### DROP TABLE [dbo].[Baseline] GO CREATE TABLE [dbo].[Baseline] ( [tid1] int NOT NULL, [tid2] int NOT NULL, [sim] float NOT NULL, PRIMARY KEY (tid1,tid2), FOREIGN KEY (tid1) REFERENCES R1, FOREIGN KEY (tid2) REFERENCES R2 ) GO DECLARE @ID int DECLARE @step int SET @step=200 SET @ID=@step WHILE @ID < 14000 BEGIN INSERT INTO Baseline(tid1, tid2, sim) SELECT T1.tid AS tid1, T2.tid AS tid2, SUM(T1.weight*T2.weight) AS sim FROM R1Weights T1, R2Weights T2 WHERE T1.token = T2.token AND T1.tid<=@ID AND T1.tid>@ID-@step GROUP BY T1.tid, T2.tid HAVING SUM(T1.weight*T2.weight) >= 0.1 SET @ID = @ID + @step END GO CREATE INDEX [similarities] ON [dbo].[Baseline] ([sim] desc ) WITH FILLFACTOR = 100