001    /*
002     Copyright (c) 2012, Regents of the University of Colorado
003     All rights reserved.
004    
005     Redistribution and use in source and binary forms, with or without modification, 
006     are permitted provided that the following conditions are met:
007    
008     * Redistributions of source code must retain the above copyright notice, this 
009        list of conditions and the following disclaimer.
010       
011     * Redistributions in binary form must reproduce the above copyright notice, 
012        this list of conditions and the following disclaimer in the documentation 
013        and/or other materials provided with the distribution.
014       
015     * Neither the name of the University of Colorado nor the names of its 
016        contributors may be used to endorse or promote products derived from this 
017        software without specific prior written permission.
018    
019     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
020     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
021     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
022     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023     ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
024     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
025     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
026     ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
027     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
028     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029     */
030    package edu.ucdenver.ccp.medline;
031    
032    import java.io.File;
033    import java.util.Arrays;
034    import java.util.Collections;
035    import java.util.Comparator;
036    import java.util.List;
037    import java.util.regex.Matcher;
038    import java.util.regex.Pattern;
039    
040    import edu.ucdenver.ccp.common.string.StringUtil;
041    
/**
 * Orders the files found in a directory according to the four-digit index that is part of a
 * Medline file name, e.g. the 0123 in medline12n0123.xml.gz
 * 
 * @author Center for Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu
 * 
 */
public class MedlineFileOrderer {

	/**
	 * Direction in which Medline files are ordered by their file-name index.
	 */
	public enum FileOrder {
		/** Increasing index: medline12n0001, medline12n0002, ... */
		INC,
		/** Decreasing index: ..., medline12n0002, medline12n0001 */
		DEC
	}

	/**
	 * Matches a Medline file name and captures its four-digit index in group 1. Note that the
	 * trailing ".", "g" and "z" are each optional on their own, so the suffix check is lenient
	 * (both medline12n0123.xml and medline12n0123.xml.gz match).
	 */
	private static final Pattern MEDLINE_FILE_PATTERN = Pattern
			.compile("^medline\\d\\dn(\\d\\d\\d\\d)\\.xml\\.?g?z?$");

	/**
	 * Orders the files in the specified directory by increasing Medline file index.
	 * 
	 * @param directory
	 *            the directory whose files are to be ordered
	 * @return the files in the directory, in increasing index order; see
	 *         {@link #getOrderedMedlineFileIterable(File, FileOrder)} for details
	 * @throws IllegalArgumentException
	 *             if the contents of the directory cannot be listed
	 */
	public static Iterable<File> getOrderedMedlineFileIterable(File directory) {
		return getOrderedMedlineFileIterable(directory, FileOrder.INC);
	}

	/**
	 * Orders the files in the specified directory by the index in their Medline file names, e.g.
	 * medline12n0123.xml.gz, medline12n0124.xml.gz, medline12n0125.xml.gz, and so on.
	 * <p>
	 * Every entry of the directory is returned, not only the Medline files. Entries whose names do
	 * not look like Medline file names are placed after all Medline files, regardless of the
	 * requested order.
	 * 
	 * @param directory
	 *            the directory whose files are to be ordered
	 * @param fileOrder
	 *            {@link FileOrder#DEC} for decreasing index order; any other value (including
	 *            <code>null</code>) results in increasing index order
	 * @return an {@link Iterable} over the {@link File} objects in the directory, ordered as
	 *         described above
	 * @throws IllegalArgumentException
	 *             if the contents of the directory cannot be listed, e.g. because it does not
	 *             exist or is not a directory
	 */
	public static Iterable<File> getOrderedMedlineFileIterable(File directory, final FileOrder fileOrder) {
		File[] directoryListing = directory.listFiles();
		if (directoryListing == null) {
			// listFiles() signals "not a directory" or an I/O error with null; fail with a
			// meaningful message instead of a bare NullPointerException further down
			throw new IllegalArgumentException("Unable to list the files in directory: " + directory);
		}

		// resolved once, outside of the comparator; compareTo() on Integer only ever returns
		// -1, 0 or 1 so multiplying by the sign cannot overflow
		final int orderSign = (fileOrder == FileOrder.DEC) ? -1 : 1;

		List<File> files = Arrays.asList(directoryListing);
		Collections.sort(files, new Comparator<File>() {

			@Override
			public int compare(File f1, File f2) {
				Integer f1Number = extractFileIndex(f1);
				Integer f2Number = extractFileIndex(f2);

				// files without an index always sort after files that have one
				if (f1Number == null) {
					return (f2Number == null) ? 0 : 1;
				}
				if (f2Number == null) {
					return -1;
				}
				return orderSign * f1Number.compareTo(f2Number);
			}
		});
		return files;
	}

	/**
	 * Extracts the index from the name of a Medline file.
	 * 
	 * @param file
	 *            the file whose name is inspected
	 * @return the index in the file name, e.g. 123 for medline12n0123.xml.gz, or
	 *         <code>null</code> if the name does not match the Medline file name pattern
	 */
	private static Integer extractFileIndex(File file) {
		Matcher m = MEDLINE_FILE_PATTERN.matcher(file.getName());
		if (!m.find()) {
			return null;
		}
		// decimal parsing accepts leading zeros, so "0123" -> 123 and "0000" -> 0
		return Integer.valueOf(m.group(1));
	}

}