开发者

Interpreting a primary key from table data

开发者 https://www.devze.com 2023-02-23 18:36 出处:网络
I have a legacy database imported into sql server 2008 r2 that contains no indexes and no primary/foreign keys for hundreds of tables (some with hundreds of columns). The primary keys which I\'ve iden

I have a legacy database imported into sql server 2008 r2 that contains no indexes and no primary/foreign keys for hundreds of tables (some with hundreds of columns). The primary keys which I've identified manually (some are composite) are usually high up in the column ordinal order but I'm not going to spend weeks working them all out if I can help it.

Is there any tool or script which can be used to analyse the cardinality of the data to either suggest or script out likely primary keys?

So far I've found the following script but it's throwing some errors. I will debug it to see if I can work out what is going wrong but wondering if anyone has come across a similar problem and managed to get something working.

Script text from link is below

--Natural Key Finder
--blindman, 6/2005
--This script searches up to 63 columns of any table for permutations of fields that represent unique keys within the dataset.
--The search can be limited by the maximum columns per key, and by the maximum minutes for processing.

declare @TableName varchar(50)
declare @MaxColumns int
declare @MaxMinutes int

--Get a suitable test table from the current database
/*
set @TableName = 
        (select top 1 sysobjects.name
        from    sysobjects
            inner join sysindexes
                on sysobjects.id = sysindexes.id
                and indid in (0, 1)
            inner join syscolumns on sysobjects.id = syscolumns.id
        where   sysobjects.type = 'U'
        group by sysobjects.name,
            sysindexes.rowcnt
        having count(*) between 5 and 15
        order by rowcnt * count(*) desc)
*/

set @TableName = 'calendarEvents' --Enter the name of the table to analyze.  Do NOT enclose it in brackets: []!
set @MaxColumns = 63 --Set the maximum number of columns per key you want to search for.
set @MaxMinutes = 5 --Set a maximum time limit on the search process.

set nocount on

--Procedure variables
declare @RecordCount bigint --The number of records in the table, and the maximum possible cardinality.
declare @ColumnMask bigint --A bitmask representing the presence or absence of columns in a column set.
declare @ColumnString varchar(8000) --A comma-delimited string including all the column names in a column set.
declare @KeyLength int --The permutation size in columns currently being searched.
declare @StartTime datetime --Used to track execution time of the script.
declare @sqlstring varchar(8000) --Dynamically created sql statement.
declare @ProcessGUID char(32) --This random value will be used to name dynamically created database objects.
declare @SampleTableName varchar(50) --This value will hold the name of a dynamically created subset of the table data.
declare @SubMask1 bigint --Bitmask for storing temporary column mask.
declare @SubMask2 bigint --Bitmask for storing temporary column mask.
declare @SubMask3 bigint --Bitmask for storing temporary column mask.
declare @BitMask bigint --Temporary bitmask for stepping through a column mask to find active columns.
declare @Counter int --counts the number permutations analyzed

set @StartTime = getdate()
set @ProcessGUID = replace(newid(), '-', '') --Generate a random string.
set @SampleTableName = 'tbl' + @ProcessGUID --Generate a random table name.
set @Counter = 0
if  @MaxColumns > 63 set @MaxColumns = 63

--Create a temp table to hold the record count processed through dynamic SQL
set @RecordCount = 
    (select rowcnt
    from    sysobjects
        inner join sysindexes
            on sysobjects.id = sysindexes.id
    where   sysobjects.name = @TableName
        and sysindexes.indid in (1, 0))

--Create a table for listing and tracking permutations of columns.
create table #ColumnSets
    (ColumnSetID int identity,
    ColumnMask bigint, --a bitmask value reprenting the presence or absence of columns in a column set.
    ColumnCount int, --the number of columns in the set.
    Cardinality bigint,  --the number of unique values in the column set.
    IsUnique int, -- 0: not a naturalkey.  1: natural key.
    ColumnString varchar(4000)) --comma-delimited string listing all the columns in the column set.

exec ('CREATE CLUSTERED INDEX IX_' + @ProcessGUID + '_1 ON #ColumnSets (ColumnMask)')

--Create the parent records in the ColumnSets table
insert into #ColumnSets
    (ColumnMask,
    ColumnCount,
    Cardinality,
    ColumnString)
select  case when colid < 64 then power(cast(2 as bigint), colid-1) else 0 end, --Converts the colid value to a bitmap value.
    1,
    @RecordCount + 1, --Cardinality is unknown, so estimate 1 higher than the maximum possible cardinality.
    '[' + syscolumns.name + ']'
from    syscolumns
    inner join sysobjects on syscolumns.id = sysobjects.id
where   sysobjects.name = @TableName
    and syscolumns.prec is not null
    and colid < 64

--Informational message
select  'Analyzing table "' + @TableName + '" containing ' + cast(@RecordCount as varchar(20)) + ' records in ' + cast(count(*) as varchar(3)) + ' columns.'
from    #ColumnSets
where   ColumnCount = 1

--First we will search for large non-primary keys in a sample set of the records.  If we find any large permutations that
--are known not to be unique, then we can eliminate any subsets of these permutations from processing later on.
select cast(power(@MaxMinutes, 0.5) -1 as varchar(4)) + ' minutes will be alloted for pre-searching.'

declare @SampleSize int
set @SampleSize = power(@RecordCount, (0.5)) --Sample sqrt(RecordCount)
set @SQLString = 'select top ' + cast(@SampleSize as varchar(10)) + ' * into ' + @SampleTableName + ' from ' + @TableName
exec    (@SQLString)

--Start with a columnset including all columns
insert into #ColumnSets
    (ColumnMask,
    ColumnCount)
select  sum(ColumnMask),
    count(*)
from    #ColumnSets
where   ColumnCount = 1

set @KeyLength = (select count(*) from #ColumnSets where ColumnCount = 1)

while   exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
    begin
        --show status
        select  'Largest non-unique permutation found: ' + cast(isnull(max(ColumnCount), 0) as varchar(3)) + ' columns.'
        from    #ColumnSets
        where   IsUnique = 0

        --If the estimated cardinality is less than the record count, we know the column set cannot
        --possibly be unique, so mark it as non-unique.
        update  #ColumnSets
        set IsUnique = 0
        where   Cardinality < @RecordCount
            and IsUnique is null

        while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
            begin

                --set   @KeyLength = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)
                --Get the cardinality of all untested column sets
                set @Counter = @Counter + 
                    (Select Count(*)
                    from    #ColumnSets
                    where   IsUnique is null
                        and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1))

                declare CSCursor cursor for
                    Select  ColumnMask,
                        ColumnString
                    from    #ColumnSets
                    where   IsUnique is null
                        and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)

                Open    CSCursor
                Fetch next from CSCursor into @ColumnMask, @ColumnString
                while @@FETCH_STATUS = 0
                    begin
                        --Create the column string for the bitmask
                        select  @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString
                        from    #ColumnSets
                        where   ColumnCount = 1
                            and ColumnMask & @ColumnMask = ColumnMask
                        order by ColumnString

                        set @SQLString = 'Update #ColumnSets set ColumnString = ''' + @ColumnString + ''', Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @SampleTableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
                        exec (@SQLString)
                        fetch next from CSCursor into @ColumnMask, @ColumnString

                        --Break out of this loop if we have used more than half of the alloted processing time.
                        if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break
                    end
                Close   CSCursor
                Deallocate CSCursor

                --Any columnsets with a cardinality equal to the rowcount represent natural keys
                Update  #ColumnSets set IsUnique = 1 where Cardinality = @SampleSize and ColumnCount > 1
                if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break

                --Delete subsets of combinations known not to be unique.
                delete  CandidateSets
                from    #ColumnSets CandidateSets
                    inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
                where   SuperSets.IsUnique = 0
                    and CandidateSets.ColumnCount > 1
                    and CandidateSets.ColumnCount < SuperSets.ColumnCount
            end

        if dateadd(mi, power(@MaxMinutes, 0.5) -1, @StartTime) < getdate() break

        --Split non-minimal unique sets into three subsets.  We will shuffle these subset together
        --to creat new permutations for searching.
        declare CSCursor cursor for
            Select  ColumnMask
            from    #ColumnSets
            where   IsUnique = 1
                and ColumnCount >= 6

        Open    CSCursor
        Fetch next from CSCursor into @ColumnMask
        while @@FETCH_STATUS = 0
            begin
                set @BitMask = 1
                set @SubMask1 = 0
                set @SubMask2 = 0
                set @SubMask3 = 0

                --Note that this permutation has already been split
                update  #ColumnSets set IsUnique = 2 where ColumnMask = @ColumnMask

                --split the TargetMask into three distinct masks
                while   @BitMask < @ColumnMask
                    begin
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask1 = @SubMask1 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask2 = @SubMask2 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask3 = @SubMask3 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    end     

                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask1,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask1 = ColumnMask

                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask2,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask2 = ColumnMask

                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask3,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask3 = ColumnMask

                fetch next from CSCursor into @ColumnMask
            end
        Close   CSCursor
        Deallocate CSCursor

        --Create larger combinations of non-unique permutations for testing
        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality)
        select  distinct
            ColumnSetsA.ColumnMask | ColumnSetsB.ColumnMask,
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount,
            @RecordCount + 1
        from    #ColumnSets ColumnSetsA,
            #ColumnSets ColumnSetsB
        where   ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount > isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 0)
            and ColumnSetsA.IsUnique = 0
            and isnull(ColumnSetsB.IsUnique, 0) = 0
            and ColumnSetsA.ColumnMask <> ColumnSetsB.ColumnMask
            and ColumnSetsA.ColumnMa开发者_如何学Gosk & ColumnSetsB.ColumnMask = 0

        --Remove duplicates
        delete  #ColumnSets
        from    #ColumnSets
            left outer join --DistinctSets
                (select ColumnMask,
                    min(ColumnSetID) ColumnSetID
                from    #ColumnSets
                group by ColumnMask) DistinctSets
                on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
                and #ColumnSets.ColumnSetID = DistinctSets.ColumnSetID
        where   DistinctSets.ColumnSetID is null

        --Delete subsets of combinations known not to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
        where   SuperSets.IsUnique = 0
            and CandidateSets.ColumnCount > 1
            and CandidateSets.ColumnCount < SuperSets.ColumnCount
            and CandidateSets.IsUnique is null

        --Delete supersets of combinations already known to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
        where   UniqueSets.IsUnique > 0 
            and CandidateSets.ColumnCount > UniqueSets.ColumnCount
            and CandidateSets.IsUnique is null
    end     

delete  CandidateSets
from    #ColumnSets CandidateSets
    inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
where   SuperSets.IsUnique = 0
    and CandidateSets.ColumnCount > 1
    and CandidateSets.ColumnCount < SuperSets.ColumnCount

delete
from    #ColumnSets
where   (IsUnique <> 0 or IsUnique is null)
    and ColumnCount > 1

set @SQLString = 'drop table ' + @SampleTableName
exec    (@SQLString)


--Now start building permutations of columns and checking them for uniqueness.
set @KeyLength = 1
While @KeyLength <= @MaxColumns
    begin
        --If there are no more candidates left, then stop looking.
        if (select count(*) from #ColumnSets where IsUnique is null) = 0 break

        --If the estimated cardinality is less than the record count, we know the column set cannot possibly be unique.
        update  #ColumnSets
        set IsUnique = 0
        where   Cardinality < @RecordCount
            and IsUnique is null

        --show status
        select  ColumnCount,
            sum(case when IsUnique is null then 1 else 0 end) as Unknown,
            sum(case when IsUnique = 0 then 1 else 0 end) as NotUnique,
            sum(case when IsUnique = 1 then 1 else 0 end) as IsUnique
        from    #ColumnSets
        group by ColumnCount
        order by ColumnCount

        --Get the cardinality of all untested column sets
        set @Counter = @Counter + 
            (Select Count(*)
            from    #ColumnSets
            where   Cardinality >= @RecordCount
                and IsUnique is null)

        declare CSCursor cursor for
            Select  ColumnMask,
                ColumnString
            from    #ColumnSets
            where   Cardinality >= @RecordCount
                and IsUnique is null

        Open    CSCursor
        Fetch next from CSCursor into @ColumnMask, @ColumnString
        while @@FETCH_STATUS = 0
            begin
                set @SQLString = 'Update #ColumnSets set Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @TableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
                exec (@SQLString)
                fetch next from CSCursor into @ColumnMask, @ColumnString
                if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break
            end
        Close   CSCursor
        Deallocate CSCursor

        --Delete any ColumnSets with fewer than two distinct values, as they cannot be part of a natural key
        Delete from #ColumnSets where Cardinality < 2

        --Any columnsets with a cardinality equal to the rowcount represent natural keys.
        Update  #ColumnSets set IsUnique = 1 where Cardinality = @RecordCount
        if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break

        set @SQLString = 'select ColumnString as ''' + cast(@KeyLength as varchar(3)) + '-column keys found in ' + cast(datediff(second, @StartTime, getdate()) as varchar(10)) + ' seconds.'' from #ColumnSets where IsUnique = 1 and ColumnCount = ' + cast(@KeyLength as varchar(3)) + ' order by ColumnCount, ColumnString'
        exec (@SQLString)

        --Remove superfluous permutations (ColumnSet has same cardinality as one of its subsets)
        delete  ColumnSets
        from    #ColumnSets ColumnSets
            inner join #ColumnSets SubSets
                on ColumnSets.Cardinality = SubSets.Cardinality
        where   ColumnSets.ColumnCount > SubSets.ColumnCount
            and ColumnSets.ColumnCount = @KeyLength
            and ColumnSets.ColumnMask & SubSets.ColumnMask = SubSets.ColumnMask

        --Identify and insert combinations of non-unique permutations that have insufficient potential cardinality to be unique
        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
            IsUnique,
            ColumnString)
        select  ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask as ColumnMask,
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount as ColumnCount,
            min(ColumnSetsA.Cardinality * ColumnSetsB.Cardinality) as Cardinality,
            0 as IsUnique,
            min(ColumnSetsA.ColumnString + ', ' + ColumnSetsB.ColumnString) as ColumnString
        from    #ColumnSets ColumnSetsA
            inner join #ColumnSets ColumnSetsB on ColumnSetsB.ColumnCount < ColumnSetsA.ColumnCount
        where   ColumnSetsA.ColumnCount = @KeyLength
            and ColumnSetsA.Cardinality * ColumnSetsB.Cardinality <= @RecordCount
            and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0
            and ColumnSetsA.IsUnique = 0
            and ColumnSetsB.IsUnique = 0
        group by ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask,
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount

        --Insert new column sets
        set @KeyLength = @KeyLength + 1

        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
            ColumnString)
        select  ParentSets.ColumnMask | UnarySets.ColumnMask as ColumnMask,
            ParentSets.ColumnCount + UnarySets.ColumnCount as ColumnCount,
            ParentSets.Cardinality * UnarySets.Cardinality as Cardinality,
            ParentSets.ColumnString + ', ' + UnarySets.ColumnString as ColumnString
        from    #ColumnSets ParentSets
            inner join #ColumnSets UnarySets on ParentSets.IsUnique = UnarySets.IsUnique
        where   ParentSets.IsUnique = 0
            and ParentSets.ColumnCount = @KeyLength - 1
            and UnarySets.ColumnCount = 1
            and ParentSets.ColumnMask & UnarySets.ColumnMask = 0

        --Remove duplicates
        delete  #ColumnSets
        from    #ColumnSets
            left outer join --DistinctSets
                (select ColumnMask,
                    min(ColumnString) ColumnString
                from    #ColumnSets
                group by ColumnMask) DistinctSets
                on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
                and #ColumnSets.ColumnString = DistinctSets.ColumnString
        where   DistinctSets.ColumnString is null

        --Remove supersets of combinations already known to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
        where   UniqueSets.IsUnique = 1
            and CandidateSets.IsUnique is null

        --Remove subsets of combinations known not to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
        where   SuperSets.IsUnique = 0
            and CandidateSets.IsUnique is null
    end

-- Make sure that all natural keys present their columns in a uniform alphabetical order
declare CSCursor cursor for
    Select  ColumnMask
    from    #ColumnSets
    where   IsUnique = 1

Open    CSCursor
Fetch next from CSCursor into @ColumnMask
while @@FETCH_STATUS = 0
    begin
        set @ColumnString = null
        select  @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString
        from    #ColumnSets
        where   ColumnCount = 1
            and ColumnMask & @ColumnMask = ColumnMask
        order by ColumnString
        update  #ColumnSets set ColumnString = @ColumnString where ColumnMask = @ColumnMask
        fetch next from CSCursor into @ColumnMask
    end
Close   CSCursor
Deallocate CSCursor


--Final output:
select  'Found ' + cast(count(*) as varchar(10)) + ' natural keys with ' + cast(@MaxColumns as varchar(10)) + ' or fewer columns in ' + cast(datediff(minute, @StartTime, getdate()) as varchar(10)) + ' minutes.' from #ColumnSets where IsUnique = 1

select  ColumnString as NaturalKeys
from    #ColumnSets
where   IsUnique = 1
order by ColumnCount, ColumnString

select  convert(varchar(20), @Counter) + ' permutations tested.'

drop table #ColumnSets


If you are embracing the Microsoft Suite a possibility is to use SSIS. Intergration services has a data profiling task.

This site has the basics on how to use it: http://consultingblogs.emc.com/jamiethomson/archive/2008/03/04/ssis-data-profiling-task-part-8-candidate-key.aspx

As a caution it can take a very long time depending on table size and the number of columns to include in your candidate key. The output is very useful for identifying the strength of keys. In your case you would be looking for items that are 100%.

0

精彩评论

暂无评论...
验证码 换一张
取 消