开发者

How to determine File Format? DOS/Unix/MAC

开发者 https://www.devze.com 2023-01-04 01:56 出处:网络
I have written the following method to detemine whether file in question is formatted with DOS/ MAC, or UNIX line endings.

I have written the following method to detemine whether file in question is formatted with DOS/ MAC, or UNIX line endings.

I see at least 1 obvious issue: 1. i am hoping that i will get the EOL on the first run, say within first 1000 bytes. This may or may not happen.

I ask you to review this and suggest improvements which will lead to hardening the code and making it more generic.

THANK YOU.

new FileFormat().discover(fileName, 0, 1000);

and then

public void discover(String fileName, int offset, int depth) throws IOException {

    BufferedInputStream in = new BufferedInputStream(new FileInputStream(fileName));
    FileReader a = new FileReader(new File(fileName));

    byte[] bytes = new byte[(int) depth];
    in.read(bytes, off开发者_JAVA百科set, depth);

    a.close();
    in.close();
    int thisByte;
    int nextByte;

    boolean isDos = false;
    boolean isUnix = false;
    boolean isMac = false;

    for (int i = 0; i < (bytes.length - 1); i++) {
        thisByte = bytes[i];
        nextByte = bytes[i + 1];
    if (thisByte == 10 && nextByte != 13) {
            isDos = true;
            break;
        } else if (thisByte == 13) {
            isUnix = true;
            break;
        } else if (thisByte == 10) {
            isMac = true;
            break;
        }
    }
    if (!(isDos || isMac || isUnix)) {
            discover(fileName, offset + depth, depth + 1000);
    } else {
        // do something clever
    }
}


Your method seems unnecessarily complicated. Why not:

public class FileFormat {
    public enum FileType { WINDOWS, UNIX, MAC, UNKNOWN }

    private static final char CR = '\r';
    private static final char LF = '\n';

    public static FileType discover(String fileName) throws IOException {    

        Reader reader = new BufferedReader(new FileReader(fileName));
        FileType result = discover(reader);
        reader.close();
        return result;
    }

    private static FileType discover(Reader reader) throws IOException {
        int c;
        while ((c = reader.read()) != -1) {
            switch(c) {        
            case LF: return FileType.UNIX;
            case CR: {
                if (reader.read() == LF) return FileType.WINDOWS;
                return FileType.MAC;
            }
            default: continue;
            }
        }
        return FileType.UNKNOWN;
    }
}

Which puts this in a static method that you can then call and use as:

switch(FileFormat.discover(fileName) {
case WINDOWS: ...
case MAC: ...
case UNKNOWN: ...
}


Here's a rough implementation that guesses the line ending type based on a simple majority and falls back on unknown in a worst-case scenario:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.EnumMap;
import java.util.Map;
import java.util.Scanner;

class LineEndings
{
    private enum ExitState
    {
        SUCCESS, FAILURE;
    }

    public enum LineEndingType
    {
        DOS("Windows"), MAC("Mac OS Classic"), UNIX("Unix/Linux/Mac OS X"), UNKNOWN("Unknown");

        private final String name;
        private LineEndingType(String name)
        {
            this.name = name;
        }

        public String toString()
        {
            if (null == this.name) {
                return super.toString();
            }
            else {
                return this.name;
            }
        }
    }

    public static void main(String[] arguments)
    {
        ExitState exitState = ExitState.SUCCESS;

        File inputFile = getInputFile();

        if (null == inputFile) {
            exitState = ExitState.FAILURE;

            System.out.println("Error: No input file specified.");
        }
        else {
            System.out.println("Determining line endings for: " + inputFile.getName());

            try {
                LineEndingType lineEndingType = getLineEndingType(inputFile);

                System.out.println("Determined line endings: " + lineEndingType);
            }
            catch (java.io.IOException exception) {
                exitState = ExitState.FAILURE;

                System.out.println("Error: " + exception.getMessage());
            }
        }

        switch (exitState) {
        case SUCCESS:
            System.exit(0);
            break;
        case FAILURE:
            System.exit(1);
            break;
        }
    }

    private static File getInputFile()
    {
        File inputFile = null;
        Scanner stdinScanner = new Scanner(System.in);

        while (true) {
            System.out.println("Enter the input file name:");
            System.out.print(">> ");

            if (stdinScanner.hasNext()) {
                String inputFileName = stdinScanner.next();

                inputFile = new File(inputFileName);

                if (!inputFile.exists()) {
                    System.out.println("File not found.\n");
                }
                else if (!inputFile.canRead()) {
                    System.out.println("Could not read file.\n");
                }
                else {
                    break;
                }
            }
            else {
                inputFile = null;
                break;
            }
        }

        System.out.println();

        return inputFile;
    }

    private static LineEndingType getLineEndingType(File inputFile)
        throws java.io.IOException, java.io.FileNotFoundException
    {
        EnumMap<LineEndingType, Integer> lineEndingTypeCount =
            new EnumMap<LineEndingType, Integer>(LineEndingType.class);

        BufferedReader inputReader = new BufferedReader(new FileReader(inputFile));

        LineEndingType currentLineEndingType = null;

        while (inputReader.ready()) {
            int token = inputReader.read();

            if ('\n' == token) {
                currentLineEndingType = LineEndingType.UNIX;
            }
            else if ('\r' == token) {
                if (inputReader.ready()) {
                    int nextToken = inputReader.read();

                    if ('\n' == nextToken) {
                        currentLineEndingType = LineEndingType.DOS;
                    }
                    else {
                        currentLineEndingType = LineEndingType.MAC;
                    }
                }
            }

            if (null != currentLineEndingType) {
                incrementLineEndingType(lineEndingTypeCount, currentLineEndingType);

                currentLineEndingType = null;
            }
        }

        return getMostFrequentLineEndingType(lineEndingTypeCount);
    }

    private static void incrementLineEndingType(Map<LineEndingType, Integer> lineEndingTypeCount, LineEndingType targetLineEndingType)
    {
        Integer targetLineEndingCount = lineEndingTypeCount.get(targetLineEndingType);

        if (null == targetLineEndingCount) {
            targetLineEndingCount = 0;
        }
        else {
            targetLineEndingCount++;
        }

        lineEndingTypeCount.put(targetLineEndingType, targetLineEndingCount);
    }

    private static LineEndingType getMostFrequentLineEndingType(Map<LineEndingType, Integer> lineEndingTypeCount)
    {
        Integer maximumEntryCount = Integer.MIN_VALUE;

        Map.Entry<LineEndingType, Integer> mostFrequentEntry = null;

        for (Map.Entry<LineEndingType, Integer> entry : lineEndingTypeCount.entrySet()) {
            int entryCount = entry.getValue();

            if (entryCount > maximumEntryCount) {
                mostFrequentEntry = entry;
                maximumEntryCount = entryCount;
            }
        }

        if (null != mostFrequentEntry) {
            return mostFrequentEntry.getKey();
        }
        else {
            return LineEndingType.UNKNOWN;
        }
    }
}


There is a whole lot wrong with this. You need to understand the FileInputStream class better. Note that read is not guaranteed to read all the bytes you requested. offset is the offset into the array, not the file. And so on.

0

精彩评论

暂无评论...
验证码 换一张
取 消

关注公众号