001 /*
002 Copyright (c) 2012, Regents of the University of Colorado
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without modification,
006 are permitted provided that the following conditions are met:
007
008 * Redistributions of source code must retain the above copyright notice, this
009 list of conditions and the following disclaimer.
010
011 * Redistributions in binary form must reproduce the above copyright notice,
012 this list of conditions and the following disclaimer in the documentation
013 and/or other materials provided with the distribution.
014
015 * Neither the name of the University of Colorado nor the names of its
016 contributors may be used to endorse or promote products derived from this
017 software without specific prior written permission.
018
019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030 package edu.ucdenver.ccp.medline;
031
032 import java.io.File;
033 import java.util.Arrays;
034 import java.util.Collections;
035 import java.util.Comparator;
036 import java.util.List;
037 import java.util.regex.Matcher;
038 import java.util.regex.Pattern;
039
040 import edu.ucdenver.ccp.common.string.StringUtil;
041
042 /**
043 * Iterates over a collection of Medline files according to the indexes that are part of their file
044 * names, e.g. medline12n0123.xml.gz
045 *
046 * @author Center for Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu
047 *
048 */
public class MedlineFileOrderer {

    /**
     * Direction in which files are ordered by the numeric index embedded in their names.
     */
    public enum FileOrder {
        /** Smallest index first. */
        INC,
        /** Largest index first. */
        DEC
    }

    /**
     * Recognizes Medline file names such as medline12n0123.xml.gz; group 1 captures the
     * four-digit index. Note that the suffix after ".xml" is permissive: each of the characters
     * '.', 'g' and 'z' is individually optional.
     */
    private static final Pattern MEDLINE_FILE_PATTERN = Pattern
            .compile("^medline\\d\\dn(\\d\\d\\d\\d)\\.xml\\.?g?z?$");

    /**
     * Lists the entries of the given directory ordered by increasing Medline file index.
     * 
     * @param directory
     *            the directory whose entries are listed
     * @return the directory entries, ordered as described for
     *         {@link #getOrderedMedlineFileIterable(File, FileOrder)} with {@link FileOrder#INC}
     * @throws IllegalArgumentException
     *             if the directory is null or its contents cannot be listed
     */
    public static Iterable<File> getOrderedMedlineFileIterable(File directory) {
        return getOrderedMedlineFileIterable(directory, FileOrder.INC);
    }

    /**
     * Lists the entries of the given directory ordered by the index in their file names, e.g.
     * medline12n0123.xml.gz, medline12n0124.xml.gz, medline12n0125.xml.gz, and so on.
     * <p>
     * Every entry returned by {@link File#listFiles()} is included in the result. Entries whose
     * names do not match the Medline naming pattern are not removed; they are placed after all
     * matching entries regardless of the requested order.
     * 
     * @param directory
     *            the directory whose entries are listed
     * @param fileOrder
     *            {@link FileOrder#DEC} for decreasing index order; any other value (including
     *            null) yields increasing index order
     * @return a fixed-size list of the directory entries in the requested order
     * @throws IllegalArgumentException
     *             if the directory is null or its contents cannot be listed (for example because
     *             the path does not denote a directory)
     */
    public static Iterable<File> getOrderedMedlineFileIterable(File directory, final FileOrder fileOrder) {
        // File.listFiles() signals "not a directory" or an I/O error by returning null.
        File[] entries = (directory == null) ? null : directory.listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("Unable to list files; not a readable directory: " + directory);
        }
        List<File> files = Arrays.asList(entries);
        Collections.sort(files, new MedlineIndexComparator(fileOrder == FileOrder.DEC));
        return files;
    }

    /**
     * Extracts the numeric index from a Medline file name.
     * 
     * @param file
     *            the file whose name is inspected
     * @return the index, or null if the name does not match the Medline naming pattern
     */
    private static Integer extractIndex(File file) {
        Matcher m = MEDLINE_FILE_PATTERN.matcher(file.getName());
        if (!m.find()) {
            return null;
        }
        // Decimal parsing accepts leading zeros, so no stripping is needed ("0000" parses to 0).
        return Integer.valueOf(m.group(1));
    }

    /**
     * Orders files by their Medline index; files without an index always sort last.
     */
    private static final class MedlineIndexComparator implements Comparator<File> {

        private final boolean decreasing;

        MedlineIndexComparator(boolean decreasing) {
            this.decreasing = decreasing;
        }

        @Override
        public int compare(File f1, File f2) {
            Integer index1 = extractIndex(f1);
            Integer index2 = extractIndex(f2);

            if (index1 == null) {
                // two unindexed files are considered equivalent
                return (index2 == null) ? 0 : 1;
            }
            if (index2 == null) {
                return -1;
            }

            int byIndex = index1.compareTo(index2);
            return decreasing ? -byIndex : byIndex;
        }
    }

}