Class IOUtilFunctions


  • public class IOUtilFunctions
    extends Object
    • Constructor Detail

      • IOUtilFunctions

        public IOUtilFunctions()
    • Method Detail

      • getFileSystem

        public static org.apache.hadoop.fs.FileSystem getFileSystem​(org.apache.hadoop.fs.Path fname)
                                                             throws IOException
        Throws:
        IOException
      • getFileSystem

        public static org.apache.hadoop.fs.FileSystem getFileSystem​(org.apache.hadoop.conf.Configuration conf)
                                                             throws IOException
        Throws:
        IOException
      • getFileSystem

        public static org.apache.hadoop.fs.FileSystem getFileSystem​(org.apache.hadoop.fs.Path fname,
                                                                    org.apache.hadoop.conf.Configuration conf)
                                                             throws IOException
        Throws:
        IOException
      • isSameFileScheme

        public static boolean isSameFileScheme​(org.apache.hadoop.fs.Path path1,
                                               org.apache.hadoop.fs.Path path2)
      • isObjectStoreFileScheme

        public static boolean isObjectStoreFileScheme​(org.apache.hadoop.fs.Path path)
      • getPartFileName

        public static String getPartFileName​(int pos)
      • closeSilently

        public static void closeSilently​(Closeable io)
      • closeSilently

        public static void closeSilently​(org.apache.hadoop.mapred.RecordReader<?,​?> rr)
      • checkAndRaiseErrorCSVEmptyField

        public static void checkAndRaiseErrorCSVEmptyField​(String row,
                                                           boolean fill,
                                                           boolean emptyFound)
                                                    throws IOException
        Throws:
        IOException
      • checkAndRaiseErrorCSVNumColumns

        public static void checkAndRaiseErrorCSVNumColumns​(org.apache.hadoop.mapred.InputSplit split,
                                                           String line,
                                                           String[] parts,
                                                           long ncol)
                                                    throws IOException
        Throws:
        IOException
      • split

        public static String[] split​(String str,
                                     String delim)
        Splits a string by a specified delimiter into all tokens, including empty. NOTE: This method is meant as a faster drop-in replacement for the regular string split.
        Parameters:
        str - string to split
        delim - delimiter
        Returns:
        string array
      • splitCSV

        public static String[] splitCSV​(String str,
                                        String delim)
        Splits a string by a specified delimiter into all tokens, including empty while respecting the rules for quotes and escapes defined in RFC4180, with robustness for various special cases.
        Parameters:
        str - string to split
        delim - delimiter
        Returns:
        string array of tokens
      • splitCSV

        public static String[] splitCSV​(String str,
                                        String delim,
                                        String[] cache)
        Splits a string by a specified delimiter into all tokens, including empty while respecting the rules for quotes and escapes defined in RFC4180, with robustness for various special cases.
        Parameters:
        str - string to split
        delim - delimiter
        cache - cached return array
        Returns:
        string array of tokens
      • splitCSV

        public static String[] splitCSV​(String str,
                                        String delim,
                                        String[] tokens,
                                        Set<String> naStrings)
        Splits a string by a specified delimiter into all tokens, including empty while respecting the rules for quotes and escapes defined in RFC4180, with robustness for various special cases.
        Parameters:
        str - string to split
        delim - delimiter
        tokens - array for tokens, length needs to match the number of tokens
        naStrings - the strings to map to a null value
        Returns:
        string array of tokens
      • countTokensCSV

        public static int countTokensCSV​(String str,
                                         String delim)
        Counts the number of tokens defined by the given delimiter, respecting the rules for quotes and escapes defined in RFC4180, with robustness for various special cases.
        Parameters:
        str - string to split
        delim - delimiter
        Returns:
        number of tokens split by the given delimiter
      • readMatrixMarketHeader

        public static String[] readMatrixMarketHeader​(String filename)
      • countNnz

        public static int countNnz​(String[] cols)
        Returns the number of non-zero entries but avoids the expensive string to double parsing. This function is guaranteed to never underestimate.
        Parameters:
        cols - string array
        Returns:
        number of non-zeros
      • countNnz

        public static int countNnz​(String[] cols,
                                   int pos,
                                   int len)
        Returns the number of non-zero entries but avoids the expensive string to double parsing. This function is guaranteed to never underestimate.
        Parameters:
        cols - string array
        pos - starting array index
        len - ending array index
        Returns:
        number of non-zeros
      • getUTFSize

        public static int getUTFSize​(String value)
        Returns the serialized size in bytes of the given string value, following the modified UTF-8 specification as used by Java's DataInput/DataOutput. See the Java API docs: docs/api/java/io/DataInput.html#modified-utf-8
        Parameters:
        value - string value
        Returns:
        string size for modified UTF-8 specification
      • sortInputSplits

        public static org.apache.hadoop.mapred.InputSplit[] sortInputSplits​(org.apache.hadoop.mapred.InputSplit[] splits)
      • countNumColumnsCSV

        public static int countNumColumnsCSV​(org.apache.hadoop.mapred.InputSplit[] splits,
                                             org.apache.hadoop.mapred.InputFormat informat,
                                             org.apache.hadoop.mapred.JobConf job,
                                             String delim)
                                      throws IOException
        Counts the number of columns in a given collection of csv file splits. This primitive stops scanning as soon as a row with more than 0 columns is found and hence is robust against empty file splits etc.
        Parameters:
        splits - input splits
        informat - input format
        job - job configuration
        delim - delimiter
        Returns:
        the number of columns in the collection of csv file splits
        Throws:
        IOException - if IOException occurs
      • getSequenceFilePaths

        public static org.apache.hadoop.fs.Path[] getSequenceFilePaths​(org.apache.hadoop.fs.FileSystem fs,
                                                                       org.apache.hadoop.fs.Path file)
                                                                throws IOException
        Throws:
        IOException
      • getMetadataFilePaths

        public static org.apache.hadoop.fs.Path[] getMetadataFilePaths​(org.apache.hadoop.fs.FileSystem fs,
                                                                       org.apache.hadoop.fs.Path file)
                                                                throws IOException
        Throws:
        IOException
      • deleteCrcFilesFromLocalFileSystem

        public static void deleteCrcFilesFromLocalFileSystem​(org.apache.hadoop.mapred.JobConf job,
                                                             org.apache.hadoop.fs.Path path)
                                                      throws IOException
        Throws:
        IOException
      • deleteCrcFilesFromLocalFileSystem

        public static void deleteCrcFilesFromLocalFileSystem​(org.apache.hadoop.fs.FileSystem fs,
                                                             org.apache.hadoop.fs.Path path)
                                                      throws IOException
        Delete the CRC files from the local file system associated with a particular file and its metadata file.
        Parameters:
        fs - the file system
        path - the path to a file
        Throws:
        IOException - thrown if an error occurs while attempting to delete CRC files
      • baToShort

        public static int baToShort​(byte[] ba,
                                    int off)
      • baToInt

        public static int baToInt​(byte[] ba,
                                  int off)
      • baToLong

        public static long baToLong​(byte[] ba,
                                    int off)
      • shortToBa

        public static void shortToBa​(int val,
                                     byte[] ba,
                                     int off)
      • intToBa

        public static void intToBa​(int val,
                                   byte[] ba,
                                   int off)
      • longToBa

        public static void longToBa​(long val,
                                    byte[] ba,
                                    int off)
      • getBytes

        public static byte[] getBytes​(ByteBuffer buff)
      • get

        public static <T> T get​(Future<T> in)
      • isFileCPReadable

        public static boolean isFileCPReadable​(String path)
      • getSeqWriter

        public static org.apache.hadoop.io.SequenceFile.Writer getSeqWriter​(org.apache.hadoop.fs.Path path,
                                                                            org.apache.hadoop.conf.Configuration job,
                                                                            int replication)
                                                                     throws IOException
        Throws:
        IOException
      • getSeqWriterFrame

        public static org.apache.hadoop.io.SequenceFile.Writer getSeqWriterFrame​(org.apache.hadoop.fs.Path path,
                                                                                 org.apache.hadoop.conf.Configuration job,
                                                                                 int replication)
                                                                          throws IOException
        Throws:
        IOException
      • getSeqWriterTensor

        public static org.apache.hadoop.io.SequenceFile.Writer getSeqWriterTensor​(org.apache.hadoop.fs.Path path,
                                                                                  org.apache.hadoop.conf.Configuration job,
                                                                                  int replication)
                                                                           throws IOException
        Throws:
        IOException
      • getSeqWriterCell

        public static org.apache.hadoop.io.SequenceFile.Writer getSeqWriterCell​(org.apache.hadoop.fs.Path path,
                                                                                org.apache.hadoop.conf.Configuration job,
                                                                                int replication)
                                                                         throws IOException
        Throws:
        IOException
      • getCompressionEncodingType

        public static org.apache.hadoop.io.SequenceFile.CompressionType getCompressionEncodingType()
      • getCompressionCodec

        public static org.apache.hadoop.io.compress.CompressionCodec getCompressionCodec()