So my whole problem is that I have two files, one with the following format (for Python 2.6):
#comments
config = {
#comments
'name': 'hello',
'see?': 'world':'ABC',CLASS=3
}
This file has number of sections like this. Second file has format:
[23]
[config]
'name'='abc'
'see?'=
[23]
Now the requirement is that I need to compare both files and generate file as:
#comments
config = {
#comments
'name': 'abc',
'see?': 'world':'ABC',CLASS=3
}
So the result file will contain the values from the first file, unless the value for same attribute is there in second file, which will overwrite the value. Now my problem is how to manipulate these files using Python.
Thanks in advance, and thanks for your quick previous answers. I need to use Python 2.6.
Was unable to find a beautiful solution due to the comments. This is tested and works for me, but requires Python 3.1 or higher:
from collections import OrderedDict
# Indentation unit used by output() when printing nested sections.
indenting = '\t'
def almost_py_read(f):
    """Parse the pseudo-Python config file into an OrderedDict of sections.

    Comment lines become keys mapped to None (so they can be reproduced
    on output, in order).  A line ending in '{' (e.g. "config = {")
    opens a section; '}' closes it; "'key': value" lines become entries
    of the currently active section.

    f: any iterable of lines (an open file object works).
    Returns an OrderedDict mapping section names to OrderedDicts (and
    top-level comments to None).
    """
    sections = OrderedDict()
    active = sections
    for raw in f:
        line = raw.strip()
        if line.startswith('#'):
            # Preserve the comment in place within whatever scope is active.
            active[line] = None
        elif line.endswith('{'):
            # Section header such as "config = {" -- keep the name part.
            section_name = line.split('=')[0].strip()
            section = OrderedDict()
            sections[section_name] = section
            active = section
        elif line.endswith('}'):
            # End of section: subsequent entries belong to the top level.
            active = sections
        else:
            # Only the unpacking failure is expected here; lines with no
            # ':' or more than one ':' (blank lines, nested values) are
            # deliberately skipped.  The original used a bare except,
            # which would also have hidden real bugs.
            try:
                key, value = line.split(':')
            except ValueError:
                continue
            active[key.strip()] = value.strip()
    return sections
def almost_ini_read(f):
    """Parse the INI-style override file into an OrderedDict of sections.

    '[name]' lines start a section; "key=value" lines with a non-empty
    value become entries of the current section.  Pairs with an empty
    value, pairs seen before any section header, and lines that are not
    key=value pairs at all are ignored.

    f: any iterable of lines (an open file object works).
    Returns an OrderedDict mapping section names to OrderedDicts.
    """
    sections = OrderedDict()
    contents = None
    for raw in f:
        line = raw.strip()
        if line.startswith('[') and line.endswith(']'):
            # New section header; a repeated header simply resets the
            # section's contents (same as the original behaviour).
            contents = OrderedDict()
            sections[line[1:-1]] = contents
            continue
        try:
            key, value = line.split('=')
        except ValueError:
            # Not a single key=value pair (blank line, stray text): skip.
            continue
        value = value.strip()
        # An empty value means "keep the original value".  Also guard
        # against pairs appearing before the first header: the original
        # relied on a bare except to swallow the resulting TypeError.
        # (The stray debug print(sections) was removed.)
        if value and contents is not None:
            contents[key.strip()] = value
    return sections
def compilefiles(pyname, ininame):
    """Merge override values from the INI-style file into the base config.

    pyname: path of the pseudo-Python base file (parsed by almost_py_read).
    ininame: path of the override file (parsed by almost_ini_read).
    Returns the base sections with every override key applied on top;
    sections that only exist in the override file are created.
    """
    # Use 'with' so both files are closed even if parsing raises -- the
    # original opened them and never closed either handle.
    with open(pyname, 'rt') as base_file:
        sections = almost_py_read(base_file)
    with open(ininame, 'rt') as override_file:
        override_sections = almost_ini_read(override_file)
    for section_key, overrides in override_sections.items():
        target = sections.get(section_key)
        if not target:
            # Section missing (or empty/None) in the base file: start fresh.
            target = OrderedDict()
            sections[section_key] = target
        target.update(overrides)
    return sections
def output(d, indent=''):
    """Pretty-print the merged sections back in the original file format.

    Comment entries (value None) are echoed verbatim; string values are
    printed as "key: value,"; dict values are printed as nested
    "key = { ... }" blocks, one `indenting` level deeper.  Falsy
    non-None values (e.g. empty sections) are skipped, matching the
    original behaviour.
    """
    for key, value in d.items():
        if value is None:  # 'is None', not '== None' (PEP 8 identity test)
            print(indent + key)
        elif value:
            if isinstance(value, str):  # isinstance, not type(...) == str
                print(indent + key + ': ' + value + ',')
            else:
                print(indent + key + ' = {')
                output(value, indent + indenting)
                print(indent + '}')
# Merge b.ini's override values into a.txt's sections and print the result.
d = compilefiles('a.txt', 'b.ini')
output(d)
Output:
#comments
config = {
#comments
'name': 'abc',
'see?': 'world',
}
It took me a long time and a lot of effort to manage to write the following code.
I had difficulties to manage with commas. I wanted the updated file to have after the updating the same format as the file to update before the updating : lines end with a comma, except for the last one.
This code is crafted for the particular problem as exposed by the questioner and can't be used as-is for another type of problem. I know. It's the problem of using a code based on regex and not on a parser, I'm fully aware of that. But I think that it is a canvas that can be relatively easily adapted to other cases, by changing the regexes, which is a relatively readily process thanks to the malleability of regexes.
def file_updating(updating_filename,updating_data_extractor,filename_to_update):
# NOTE(review): Python 2-only code (generator .next() and dict.iteritems()
# below); the original indentation was also lost in this paste, so the
# snippet will not run exactly as shown here.
# The function passed as updating_data_extractor reads the file named by
# updating_filename and must return a tuple:
# ( updating dictionary , compiled regex )
updating_dico,pat = updating_data_extractor( updating_filename )
with open(filename_to_update,'r+') as f:
lines = f.readlines()
# Rewrite one line: if the regex finds a key present in the updating
# dictionary, substitute (and consume, via pop) its new value;
# otherwise keep the line, normalising it to end with a comma.
def jiji(line,dico = updating_dico ):
mat = pat.search(line.rstrip())
if mat and mat.group(3) in dico:
return '%s: %s,' % (mat.group(1),dico.pop(mat.group(3)))
else:
return line.rstrip(',') + ','
li = [jiji(line) for line in lines[0:-1] ] # [0:-1] because last line is '}'
# Leading whitespace (regex group 2) of the first matching data line,
# reused to indent the keys appended below.
front = (mit.group(2) for mit in ( pat.search(line) for line in lines ) if mit).next()
# Keys still left in the dictionary were absent from the file: append them.
li.extend(front + '%s: %s,' % item for item in updating_dico.iteritems() )
li[-1] = li[-1].rstrip(',')  # the last entry must not keep a trailing comma
li.append('}')
f.seek(0,0)
f.writelines( '\n'.join(li) )
f.truncate()  # drop any leftover tail when new content is shorter than old
Exemplifying code:
import re
# Sample "files to update" used by the demo below; bef2 contains a
# nested-dict value to exercise the line-by-line approach.
bef1 = '''#comments
config =
{
#comments
'name': 'hello',
'arctic':01011101,
'summu': 456,
'see?': 'world',
'armorique': 'bretagne'
}'''
bef2 = '''#comments
config =
{
#comments
'name': 'abc',
'see?': { 'world':'india':'jagdev'},
}'''
def one_extractor(data_containing_filename):
    """Extract override data from an INI-like updating file.

    The file must contain a block of the form [N] ... [config] ... [N].
    Returns a tuple (updating_data, pat):
      - updating_data: dict mapping "'key'" -> new value, built from the
        "key=value" lines inside the [config] block; pairs whose value
        is empty are dropped.
      - pat: the compiled regex used to match "'key': value" lines in
        the file that will be updated (group 2 = leading whitespace,
        group 3 = the key).
    Exits via SystemExit with a message if no valid block is found.
    """
    with open(data_containing_filename) as g:
        # \1 back-reference requires the closing tag to repeat the
        # opening number; raw strings keep the escapes literal.
        contg = re.search(r'\[(\d+)\].+\[config\](.*?)\[(\1)\]',
                          g.read(), re.DOTALL)
    if contg:
        updtgen = (re.match(r"([^=]+)=[ \f\t\v]*([^ \f\t\v].*|)", line.strip())
                   for line in contg.group(2).splitlines())
        # Keep only matched pairs with a non-empty value (group 2).
        updating_data = dict(mi.groups() for mi in updtgen if mi and mi.group(2))
    else:
        from sys import exit
        # Bug fix: the original referenced the undefined name
        # 'updating_filename' here, raising NameError instead of the
        # intended clean exit with a message.
        exit(data_containing_filename + " isn't a valid file for updating")
    pat = re.compile(r"(([ \t]*)([^:]+)):\s*(.+),?")
    return (updating_data, pat)
# Demonstration: write the sample files, show them, run the updater in
# place, and show the result.  NOTE(review): Python 2 print statements.
for bef in (bef1,bef2):
# file to update: rudu.txt
with open('rudu.txt','w') as jecr:
jecr.write(bef)
# updating data: renew_rudu.txt
with open('renew_rudu.txt','w') as jecr:
jecr.write('''[23]
[config]
'nuclear'= 'apocalypse'
'name'='abc'
'armorique'= 'BRETAGNE'
'arctic'=
'boloni'=7600
'see?'=
'summu'='tumulus'
[23]''')
print 'BEFORE ---------------------------------'
with open('rudu.txt') as lir:
print lir.read()
print '\nUPDATING DATA --------------------------'
with open('renew_rudu.txt') as lir:
print lir.read()
# Update rudu.txt in place using the data extracted by one_extractor.
file_updating('renew_rudu.txt',one_extractor,'rudu.txt')
print '\nAFTER ================================='
with open('rudu.txt','r') as f:
print f.read()
print '\n\nX#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#\n'
Result:
>>>
BEFORE ---------------------------------
#comments
config =
{
#comments
'name': 'hello',
'arctic':01011101,
'summu': 456,
'see?': 'world',
'armorique': 'bretagne'
}
UPDATING DATA --------------------------
[23]
[config]
'nuclear'= 'apocalypse'
'name'='abc'
'armorique'= 'BRETAGNE'
'arctic'=
'boloni'=7600
'see?'=
'summu'='tumulus'
[23]
AFTER =================================
#comments,
config =,
{,
#comments,
'name': 'abc',
'arctic':01011101,
'summu': 'tumulus',
'see?': 'world',
'armorique': 'BRETAGNE',
'boloni': 7600,
'nuclear': 'apocalypse'
}
X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#
BEFORE ---------------------------------
#comments
config =
{
#comments
'name': 'abc',
'see?': { 'world':'india':'jagdev'},
}
UPDATING DATA --------------------------
[23]
[config]
'nuclear'= 'apocalypse'
'name'='abc'
'armorique'= 'BRETAGNE'
'arctic'=
'boloni'=7600
'see?'=
'summu'='tumulus'
[23]
AFTER =================================
#comments,
config =,
{,
#comments,
'name': 'abc',
'see?': { 'world':'india':'jagdev'},
'armorique': 'BRETAGNE',
'boloni': 7600,
'summu': 'tumulus',
'nuclear': 'apocalypse'
}
X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#X#
>>>
.
EDIT:
I have improved the code because I was still dissatisfied. Now the "variable" front catches the blank characters ( ' '
or '\t'
) at the beginning of the data-containing lines in the file to be updated.
I had also forgot the instruction f.truncate()
which is very important to not keep a tail of undesired characters.
I am satisfied to see that my code works well even with the following file, in which a value is a dictionary, as presented by Jagdev:
#comments
config =
{
#comments
'name': 'abc',
'see?': { 'world':'india':'jagdev'},
}
That confirms me in my choice to process line after line , and not trying to run through the entire file with a regex.
.
EDIT 2:
I again changed the code. The updating is performed by a function that takes as arguments :
the name of the updating file (the file containing the data used to udpdate another file)
and the function that is suited to extract the data from this particular updating file
Hence, it is possible to update a given file with data from various updating files. That makes the code more generic.
Very roughly (i.e. this hasn't been tested at all, and there are numerous improvements that could be made, such as the use of regex and/or pretty-printing):
# NOTE(review): the author flags this snippet as completely untested;
# several defects are visible and annotated below, and the original
# indentation was lost in this paste.  It also eval()s/exec()s file
# content, which is a code-injection hazard on untrusted input.
dicts = []
with open('file1') as file1:
try:
file1content = file1.read()
# NOTE(review): evaluates raw file content -- only valid if file1 holds
# a Python expression, and unsafe; the result is also discarded.
eval(file1content )
# NOTE(review): str.strip returns a new string; both results below are
# discarded, so these two calls have no effect.
file1content.strip(' ')
file1content.strip('\t')
for line in file1content.splitlines():
if '={' in line:
# NOTE(review): line.split('={') returns a list, and lists have no
# .strip() method -- this raises AttributeError when reached.
dicts.append(line.split('={').strip())
except:
print 'file1 not valid'
with open('file2') as file2:
filelines = file2.readlines()
while filelines:
# Skip forward to the next [23] delimiter, then consume it.
while filelines and '[23]' not in filelines[0]:
filelines.pop(0)
if filelines:
filelines.pop(0)
# The next line is the section header, e.g. "[config]".
dictname = filelines.pop(0).split('[')[1].split(']')[0]
if dictname not in dicts:
dicts.append(dictname)
exec(dictname + ' = {}')
# Read key=value pairs until the closing [23] delimiter.
while filelines and '[23]' not in filelines[0]:
line = filelines.pop(0)
[k,v] = line.split('=')
# NOTE(review): strip results discarded again here -- no effect.
k.strip()
v.strip()
if v:
exec(dictname + '[k] = v')
with open('file3', 'w') as file3:
# NOTE(review): backquote repr `...` is Python 2-only syntax.
file3content = '\n'.join([`eval(dictname)` for dictname in dicts])
file3.write(file3content)
(Featured comments)