I'm trying to read in a fairly large csv file (~160 MEGS).
Code:
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.*;
/**
* @author Lawrence
*/
public class RegexFormat {
public static void main(String args[]) {
String s1 = " & "; //To " and "
String s2 = "&"; //To " and "
String s3 = "Tampa"; //To Boston
String s4 = "Florida"; //To Massachustts
String s5 = "Carolinas"; //To New England
String s6 = "Richmond"; //To Providence
String s7 = "Southern"; //To Northern
String s8 = "Energy"; //To Power
String f1 = " and ";
String f2 = " and ";
String f3 = "Boston";
String f4 = "Massachusetts";
String f5 = "New England";
String f6 = "Providence";
String f7 = "Northern";
String f8 = "Power";
//Array of Replacements
String[] ListToFind = {s1, s2, s3, s4, s5, s6, s7, s8};
String[] ListToReplace = {f1, f2, f3, f4, f5, f6, f7, f8};
//Patterns
Pattern p1 = Pattern.compile(s1, Pattern.CASE_INSENSITIVE); //To " and "
Pattern p2 = Pattern.compile(s2, Pattern.CASE_INSENSITIVE); //To " and "
Pattern p3 = Pattern.compile(s3, Pattern.CASE_INSENSITIVE); //To Boston
Pattern p4 = Pattern.compile(s4, Pattern.CASE_INSENSITIVE); //To Massachustts
Pattern p5 = Pattern.compile(s5, Pattern.CASE_INSENSITIVE); //To New England
Pattern p6 = Pattern.compile(s6, Pattern.CASE_INSENSITIVE); //To Providence
Pattern p7 = Pattern.compile(s7, Pattern.CASE_INSENSITIVE); //To Northern
Pattern p8 = Pattern.compile(s8, Pattern.CASE_INSENSITIVE); //To Power
Pattern[] toFindArray = {p1, p2, p3, p4, p5, p6, p7, p8};
String testInput = "I am going to tampa, florida which is near Carolinas & Richmond. Paul&I will be going to the Southern part to research Energy";
//Init first matcher
//Matcher toMatch = toFindArray[0].matcher(testInput);
String strInput = "x";
try{
byte[] FileInput = ReadFile(args[0]);
// make a backup copy
WriteFile(args[0]+".backup.copy",FileInput);
strInput = new String(FileInput);
//loop to clean data
for (int i = 0; i < 8; i++) {
Matcher toMatch;
toMatch = toFindArray[i].matcher(strInput);
strInput = toMatch.replaceAll(ListToReplace[i]);
}
WriteFile(args[0],strInput.getBytes());
}
catch(Exception e){
System.out.println(e.getMessage());
}
Console console = System.console();
System.out.println(strInput); //System.out.println("The element at [0][2] is " + replaceArray[4][1]);
}
//To Read a File into a Byte Array
static public final byte[] ReadFile(String strFile) throws IOException {
int nSize = 32768;
// open the input file stream
BufferedInputStream inStream = new BufferedInputStream(new FileInputStream(strFile), nSize);
byte[] pBuffer = new byte[nSize];
int nPos = 0;
// read bytes into a buffer
nPos += inStream.read(pBuffer, nPos, nSize - nPos);
// while the buffer is filled, double the buffer size and read more
while (nPos == nSize) {
byte[] pTemp = pBuffer;
nSize *= 2;
pBuffer = new byte[nSize];
System.arraycopy(pTemp, 0, pBuffer, 0, nPos);
nPos += inStream.read(pBuffer, nPos, nSize - nPos);
}
// close the input stream
inStream.close();
if (nPos == 0) {
return "".getBytes();
}
// return data read into the buffer as a byte array
byte[] pData = new byte[nPos];
System.arraycopy(pBuffer, 0, pData, 0, nPos);
return pData;
}
//To Write to File
static public final void WriteFile(String strFile, byte[] pData) throws IOException {
BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(strFile), 32768);
if (pData.length > 0) {
outStream.write(pData, 0, pData.length);
}
outStream.close();
}
}
This is the error i get when i try to run 'java RegexFormat myfile.csv'
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at RegexFormat.ReadFile(regexformat.java:105)
at RegexFormat.main(regexformat.java:70)
i tried to run with -Xmx512m and got this error...
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.lang.StringCoding$StringDecoder.decode(String Coding.java:133)
at java.lang.StringCoding.decode(StringCoding.java:17 3)
at java.lang.StringCoding.decode(StringCoding.java:18 5)
at java.lang.String.<init>(String.java:571)
at java.lang.String.<init>(String.java:594)
at RegexFormat.main(regexformat.java:73)

