how to improve the code (Delphi) for loading and searching in a dictionary?_问答_开发者

I'm a Delphi programmer. I have made a program who uses dictionaries with words and expressions (loaded in program as "array of string"). It uses a search algorithm based on their "checksum" (I hope this is the correct word). A string is transformed in integer based on this:

var
   FHashSize: Integer; //stores the value of GetHashSize
   HashTable, HashTableNoCase: array[Byte] of Longword;
   HashTableInit: Boolean = False;

const
   AnsiLowCaseLookup: array[AnsiChar] of AnsiChar = (
      #$00, #$01, #$02, #$03, #$04, #$05, #$06, #$07,
      #$08, #$09, #$0A, #$0B, #$0C, #$0D, #$0E, #$0F,
      #$10, #$11, #$12, #$13, #$14, #$15, #$16, #$17,
      #$18, #$19, #$1A, #$1B, #$1C, #$1D, #$1E, #$1F,
      #$20, #$21, #$22, #$23, #$24, #$25, #$26, #$27,
      #$28, #$29, #$2A, #$2B, #$2C, #$2D, #$2E, #$2F,
      #$30, #$31, #$32, #$33, #$34, #$35, #$36, #$37,
      #$38, #$39, #$3A, #$3B, #$3C, #$3D, #$3E, #$3F,
      #$40, #$61, #$62, #$63, #$64, #$65, #$66, #$67,
      #$68, #$69, #$6A, #$6B, #$6C, #$6D, #$6E, #$6F,
      #$70, #$71, #$72, #$73, #$74, #$75, #$76, #$77,
      #$78, #$79, #$7A, #$5B, #$5C, #$5D, #$5E, #$5F,
      #$60, #$61, #$62, #$63, #$64, #$65, #$66, #$67,
      #$68, #$69, #$6A, #$6B, #$6C, #$6D, #$6E, #$6F,
      #$70, #$71, #$72, #$73, #$74, #$75, #$76, #$77,
      #$78, #$79, #$7A, #$7B, #$7C, #$7D, #$7E, #$7F,
      #$80, #$81, #$82, #$83, #$84, #$85, #$86, #$87,
      #$88, #$89, #$8A, #$8B, #$8C, #$8D, #$8E, #$8F,
      #$90, #$91, #$92, #$93, #$94, #$95, #$96, #$97,
      #$98, #$99, #$9A, #$9B, #$9C, #$9D, #$9E, #$9F,
      #$A0, #$A1, #$A2, #$A3, #$A4, #$A5, #$A6, #$A7,
      #$A8, #$A9, #$AA, #$AB, #$AC, #$AD, #$AE, #$AF,
      #$B0, #$B1, #$B2, #$B3, #$B4, #$B5, #$B6, #$B7,
      #$B8, #$B9, #$BA, #$BB, #$BC, #$BD, #$BE, #$BF,
      #$C0, #$C1, #$C2, #$C3, #$C4, #$C5, #$C6, #$C7,
      #$C8, #$C9, #$CA, #$CB, #$CC, #$CD, #$CE, #$CF,
      #$D0, #$D1, #$D2, #$D3, #$D4, #$D5, #$D6, #$D7,
      #$D8, #$D9, #$DA, #$DB, #$DC, #$DD, #$DE, #$DF,
      #$E0, #$E1, #$E2, #$E3, #$E4, #$E5, #$E6, #$E7,
      #$E8, #$E9, #$EA, #$EB, #$EC, #$ED, #$EE, #$EF,
      #$F0, #$F1, #$F2, #$F3, #$F4, #$F5, #$F6, #$F7,
      #$F8, #$F9, #$FA, #$FB, #$FC, #$FD, #$FE, #$FF);

implementation

function GetHashSize(const Count: Integer): Integer;
begin
   if Count < 65 then
      Result := 256
   else
      Result := Round(IntPower(16, Ceil(Log10(Count div 4) / Log10(16))));
end;

function Hash(const Hash: LongWord; const Buf; const BufSize: Integer): LongWord;
var P: PByte;
   I: Integer;
begin
   P := @Buf;
   Result := Hash;
   for I := 1 to BufSize do
   begin
      Result := HashTable[Byte(Result) xor P^] xor (Result shr 8);
      Inc(P);
   end;
end;

function HashStrBuf(const StrBuf: Pointer; const StrLength: Integer; const Slots: LongWord): LongWord;
var P: PChar;
   I, J: Integer;
begin
   if not HashTableInit then
      InitHashTable;
   P := StrBuf;
   if StrLength <= 48 then // Hash all characters for short strings
      Result := Hash($FFFFFFFF, P^, StrLength)
   else
   begin
      // Hash first 16 bytes
      Result := Hash($FFFFFFFF, P^, 16);
      // Hash last 16 bytes
      Inc(P, StrLength - 16);
      Result := Hash(Result, P^, 16);
      // Hash 16 bytes sampled from rest of string
      I := (StrLength - 48) div 16;
      P := StrBuf;
      Inc(P, 16);
      for J := 1 to 16 do
      begin
         Result := HashTable[Byte(Result) xor Byte(P^)] xor (Result shr 8);
         Inc(P, I + 1);
      end;
   end;
  // Mod into slots
   if Slots <> 0 then
      Result := Result mod Slots;
end;

procedure InitHashTable;
var I, J: Byte;
   R: LongWord;
begin
   for I := $00 to $FF do
   begin
      R := I;
      for J := 8 downto 1 do
         if R a开发者_Python百科nd 1 <> 0 then
            R := (R shr 1) xor $EDB88320
         else
            R := R shr 1;
      HashTable[I] := R;
   end;
   Move(HashTable, HashTableNoCase, Sizeof(HashTable));
   for I := Ord('A') to Ord('Z') do
      HashTableNoCase[I] := HashTableNoCase[I or 32];
   HashTableInit := True;
end;

The result of the HashStrBuf is "and (FHashSize - 1)" and is used as index in an "array of array of Integer" (of FHashSize size) to store the index of the string from that "array of string". This way, when searches for a string, it's transformed in "checksum" and then the code searches in the "branch" with this index comparing this string with the strings from dictionary who have the same "checksum".

Ideally each string from dictionary should have unique checksum. But in the "real world" about 2/3 share the same "checksum" with other words. Because of that the search is not that fast. In these dictionaries strings are composed of this characters: ['a'..'z',#224..#246,#248..#254,#154,#156..#159,#179,#186,#191,#190,#185,'0'..'9', ''''] Is there any way to improve the "hashing" so the strings would have more unique "checksums"? Oh, one way is to increase the size of that "array of array of Integer" (FHashSize) but it cannot be increased too much because it takes a lot of Ram.

Another thing: these dictionaries are stored on HDD only as words/expressions (not the "checksums"). Their "checksum" is generated at program startup. But it takes a lot of seconds to do that... Is there any way to speed up the startup of the program? Maybe by improving the "hashing" function, maybe by storing the "checksums" on HDD and loading them from there...

Any input would be appreciated...

PS: here is the code to search:

function TDictionary.LocateKey(const Key: AnsiString): Integer;
var i, j, l, H: Integer;
   P, Q: PChar;
begin
   Result := -1;
   l := Length(Key);
   H := HashStrBuf(@Key[1], l, 0) and (FHashSize - 1);
   P := @Key[1];
   for i := 0 to High(FHash[H]) do  //FHash is that "array of array of integer"
   begin
      if l <> FKeys.ItemSize[FHash[H][i]] then //FKeys.ItemSize is an byte array with the lengths of strings from dictionary
         Continue;
      Q := FKeys.Pointer(FHash[H][i]); //pointer to string in dictionary
      for j := 0 to l - 1 do
         if (P + j)^ <> (Q + j)^ then
            Break;
      if j = l then
      begin
         Result := FHash[H][i];
         Exit;
      end;
   end;
end;

Don't reinvent the wheel!

IMHO your hashing is far from efficient, and your collision algorithm can be improved.

Take a look for instance at the IniFiles unit, and the THashedStringList. It's a bit old, but a good start for a string list using hashes.

There are a lot of good Delphi implementation of such, like in SuperObject and a lot of other code...

Take a look at our SynBigTable unit, which can handle arrays of data in memory or in file very fast, with full indexed searches. Or our latest TDynArray wrapper around any dynamic array of data, to implement TList-like methods to it, including fast binary search. I'm quite sure it could be faster than your hand-tuned code using hashing, if you use an ordered index then fast binary search.

Post-Scriptum:

About pure hashing speed of a string content, take a look at this function - rename RawByteString into AnsiString, PPtrInt into PPointer, and PtrInt into Integer for Delphi 7:

function Hash32(const Text: RawByteString): cardinal;
function SubHash(P: PCardinalArray): cardinal;
{$ifdef HASINLINE}inline;{$endif}
var s1,s2: cardinal;
    i, L: PtrInt;
const Mask: array[0..3] of cardinal = (0,$ff,$ffff,$ffffff);
begin
  if P<>nil then begin
    L := PPtrInt(PtrInt(P)-4)^; // fast lenght(Text)
    s1 := 0;
    s2 := 0;
    for i := 1 to L shr 4 do begin // 16 bytes (4 DWORD) by loop - aligned read
      inc(s1,P^[0]);
      inc(s2,s1);
      inc(s1,P^[1]);
      inc(s2,s1);
      inc(s1,P^[2]);
      inc(s2,s1);
      inc(s1,P^[3]);
      inc(s2,s1);
      inc(PtrUInt(P),16);
    end;
    for i := 1 to (L shr 2)and 3 do begin // 4 bytes (DWORD) by loop
      inc(s1,P^[0]);
      inc(s2,s1);
      inc(PtrUInt(P),4);
    end;
    inc(s1,P^[0] and Mask[L and 3]);      // remaining 0..3 bytes
    inc(s2,s1);
    result := s1 xor (s2 shl 16);
  end else
    result := 0;
end;
begin // use a sub function for better code generation under Delphi
  result := SubHash(pointer(Text));
end;

There is even a pure asm version, even faster, in our SynCommons.pas unit. I don't know any faster hashing function around (it's faster than crc32/adler32/IniFiles.hash...). It's based on adler32, but use DWORD aligned reading and summing for even better speed. This could be improved with SSE asm, of course, but here is a fast pure Delphi hash function.

Then don't forget to use "multiplication"/"binary and operation" for hash resolution, just like in IniFiles. It will reduce the number of iteration to your list of hashs.

But since you didn't provide the search source code, we are not able to know what could be improved here.

If you are using Delphi 7, consider using Julian Bucknall's lovely Delphi data types code, EzDsl (Easy Data Structures Library).

Now you don't have to reinvent the wheel as another wise person has also said.

You can download ezdsl, a version that I have made work with both Delphi 7, and recent unicode delphi versions, here.

In particular the unit name EHash contains a hash table implementation, which has various hashing algorithms plug-inable, or you can write your own plugin function that just does the hashing function of your choice.

As a word to the wise, if you are using a Unicode Delphi version; I would be careful about hashing your unicode strings with a code library like this, without checking how its hashing algorithms perform on your system. The OP here is using Delphi 7, so Unicode is not a factor for the original question.

I think you'll find a database (without checksums) a lot quicker. Maybe try sqlite which will give you a single file database. There are many Delphi Libraries available.