tricky string matching_问答_开发者_运维开发者技术经验分享

I want to find the first index of substrings in a larger string. I only want it to match whole words and I'd like it to be case-insensitive, except that I want it to treat CamelCase as separate words.

The code below does the trick, but it's slow. I'd like to speed it up. Any suggestions? I was trying some regex stuff, but couldn't find one that handled all the edge cases.

def word_start_index(text, seek_word):
    start_index = 0
    curr_word = ""
    def case_change():
        return curr_word and ch.isupper() and curr_word[-1].islower()
    def is_match():
        return curr_word.lower()开发者_Python百科 == seek_word.lower()
    for i, ch in enumerate(text):
        if case_change() or not ch.isalnum():
            if is_match():
                return start_index
            curr_word = ""
            start_index = None
        if ch.isalnum():
            if start_index is None:
                start_index = i
            curr_word += ch
    if is_match():
        return start_index

if __name__ == "__main__":
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]

    for word in test_words:
        match_start = word_start_index(test_text, word)
        print match_start, word

Output:

0 a
9 foo
12 bar
16 baz
20 golf
25 cart
None fred

word_emitter (below) takes a text string and yields lowercase "words" as they are found, one at a time (along with their positions).

It replaces all underscores with spaces. It then splits the text into a list. For example,

"a_foobar_FooBar baz golf_CART Foo"

becomes

['a', 'foobar', 'FooBar', 'baz', 'golf', 'CART', 'Foo']

Of course, you also want camelCase words to be treated as separate words. So for each piece in the above list, we use the regex pattern '(.*[a-z])(?=[A-Z])' to split camelCase words. This regex uses the re module's look-forward operator (?=...). Perhaps that is the trickiest part to the whole thing.

word_emitter then yields the words one at a time, along with their associated positions.

Once you have a function which splits the text into "words", the rest is easy.

I've also switch the order of your loops, so you only loop through the test_text once. This will speed things up if test_text is very long compared to test_words.

import re
import string
import itertools

nonspace=re.compile('(\S+)')
table = string.maketrans(
    '_.,!?;:"(){}@#$%^&*-+='+"'",
    '                       ',
    )

def piece_emitter(text):
    # This generator splits text into 2-tuples of (positions,pieces).
    # Given "a_foobar_FooBar" it returns
    # ((0,'a'),
    #  (2,'foobar'),
    #  (9,'FooBar'),
    #  )
    pos=0
    it=itertools.groupby(text,lambda w: w.isspace())
    for k,g in it:
        w=''.join(g)
        w=w.translate(table)
        it2=itertools.groupby(w,lambda w: w.isspace())
        for isspace,g2 in it2:
            word=''.join(g2)
            if not isspace:
                yield pos,word
            pos+=len(word)

def camel_splitter(word):
    # Given a word like 'FooBar', this generator yields
    # 'Foo', then 'Bar'.
    it=itertools.groupby(word,lambda w: w.isupper())
    for k,g in it:
        w=''.join(g)
        if len(w)==1:
            try:
                k1,g1=next(it)
                w+=''.join(g1)
            except StopIteration:
                pass
        yield w

def word_emitter(piece):
    # Given 'getFooBar', this generator yields in turn the elements of the sequence
    # ((0,'get'),
    #  (0,'getFoo'),
    #  (0,'getFooBar'),
    #  (3,'Foo'),
    #  (3,'FooBar'),
    #  (6,'Bar'), 
    #  )
    # In each 2-tuple, the number is the starting position of the string,
    # followed by the fragment of camelCase word generated by camel_splitter.
    words=list(camel_splitter(piece))
    num_words=len(words)
    for i in range(0,num_words+1):
        prefix=''.join(words[:i])
        for step in range(1,num_words-i+1):
            word=''.join(words[i:i+step])
            yield len(prefix),word

def camel_search(text,words):
    words=dict.fromkeys(words,False)
    for pos,piece in piece_emitter(text):        
        if not all(words[test_word] for test_word in words):
            for subpos,word in word_emitter(piece):
                for test_word in words:
                    if not words[test_word] and word.lower() == test_word.lower(): 
                        yield pos+subpos,word
                        words[test_word]=True
                        break
        else:
            break
    for word in words:
        if not words[word]:
            yield None,word

if __name__ == "__main__":    
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]
    for pos,word in camel_search(test_text,test_words):
        print pos,word.lower()

Here are the unit tests I used to check the program:

import unittest
import sys
import camel
import itertools

class Test(unittest.TestCase):
    def check(self,result,answer):
        for r,a in itertools.izip_longest(result,answer):
            if r!=a:
                print('%s != %s'%(r,a))
            self.assertTrue(r==a)

    def test_piece_emitter(self):
        tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
                ((0,'a'),
                 (2,'foobar'),
                 (9,'FooBar'),
                 (16,'baz'),
                 (21,'golf'),
                 (26,'CART'),
                 (31,'Foo'),
                 (36,'food'),
                 (42,'getFooBaz'),
                )
                ),
            )
        for text,answer in tests:
            result=list(camel.piece_emitter(text))
            print(result)
            self.check(result,answer)
    def test_camel_splitter(self):
        tests=(('getFooBar',('get','Foo','Bar')),
               ('getFOObar',('get','FOO','bar')),
               ('Foo',('Foo',)),
               ('getFoo',('get','Foo')),
               ('foobar',('foobar',)),
               ('fooBar',('foo','Bar')),
               ('FooBar',('Foo','Bar')),
               ('a',('a',)),
               ('fooB',('foo','B')),
               ('FooB',('Foo','B')),               
               ('FOOb',('FOO','b')),                              
               )
        for word,answer in tests:
            result=camel.camel_splitter(word)
            self.check(result,answer)            
    def test_word_emitter(self):
        tests=(("a",
                ((0,'a'),) ),
               ('getFooBar',
                ((0,'get'),
                 (0,'getFoo'),
                 (0,'getFooBar'),
                 (3,'Foo'),
                 (3,'FooBar'),
                 (6,'Bar'), 
                 )                
                )
            )
        for text,answer in tests:
            result=list(camel.word_emitter(text))
            print(result)
            self.check(result,answer)

    def test_camel_search(self):
        tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
                ("a", "foo", "bar", "baz", "golf", "cart", "fred", "food",
                  'FooBaz'),
                ((0,'a'),
                 (9,'Foo'),
                 (12,'Bar'),
                 (16,'baz'),
                 (21,'golf'),
                 (26,'CART'),
                 (36,'food'),
                 (45,'FooBaz'),
                 (None,'fred')
                )
                ),
               ("\"Foo\"",('Foo',),((1,'Foo'),)),
               ("getFooBar",('FooBar',),((3,'FooBar'),)),                              
            )
        for text,search_words,answer in tests:
            result=list(camel.camel_search(text,search_words))
            print(result)
            self.check(result,answer)

if __name__ == '__main__':
    unittest.main(argv = unittest.sys.argv + ['--verbose'])

If I were doing this with regular expressions I'd probably do it like this:

def word_start_index2(text, seek_word):
    camel_case = seek_word[0].upper() + seek_word[1:].lower()
    seek_word_i = ''.join('[' + c.lower() + c.upper() + ']'
                           for c in seek_word)
    regex1 = r'(?:(?<=[^a-zA-Z])|^)' + seek_word_i + r'(?=$|[^a-zA-Z])'
    regex2 = r'(?:(?<=[a-z]|[^A-Z])|^)' + camel_case + r'(?=$|[A-Z]|[^a-z])'
    regex = '%s|%s' % (regex1,  regex2)
    import re
    m = re.search(regex, text)
    if not m:
        return None
    else:
        return m.start()

I haven't performance tested this against your version though, but you could try it to see if it is better or worse and let us know.

My answer might give different output from yours on some edge cases but in your comments you said that you don't care about these cases.

Also, I tried to use the notation (?i) to mark part of the regex as case-insensitive but for some reason this fails to work correctly. I cannot explain why.

Final self-nitpick: the function needs to validate its arguments but this code is omitted for clarity. You should add checks at least for the following:

text should be a string
seek_word should be a string matching '[a-zA-Z]+'

With a index to speed up searching :-)

from collections import defaultdict

class IndexedText(object):
    """ a indexed text """
    def __init__(self, text):
        self.text = text
        self._index()


    def word_start_index(self, word):
        l = len(word)
        w = word.lower()
        return self.index[word]

    def _index(self):
        self.index = defaultdict( list )

        def index( word, pos):
            self.index[word.lower()].append( pos )

        start = 0
        it = enumerate(self.text)
        lpos, lchar = it.next()
        WS = (' ','_')

        for pos, char in it:
            if lchar in WS and char not in WS:
                index( self.text[start:lpos], start )
                start = pos
            elif lchar.islower() and char.isupper(): # camelcase
                index( self.text[start:pos], start )
                start = pos
            lpos, lchar = pos, char

        # last word is missing
        index( self.text[start:], start ) 

if __name__ == "__main__":
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]

    index = IndexedText( test_text )

    for word in test_words:
        match_start = index.word_start_index( word )
        print match_start, word