Tabula-java is skipping one row from Table
Hi, I am using tabula-java to parse a table in a PDF file, but it is skipping rows: only every alternate row is extracted properly. I have attached my PDF file, named murree_ren.pdf. murree_ren.pdf
This is the code I have used:
/**
 * Loads the PDF at {@code D:/Pdfs/murree_ren.pdf}, runs the
 * {@link SpreadsheetExtractionAlgorithm} on every page and prints, for each
 * detected table, its dimensions followed by one line per row with the cell
 * texts joined by {@code |}.
 *
 * <p>Any exception raised while loading or extracting is reported on standard
 * output and not rethrown, so the method itself never throws.
 */
public void parse() {
    System.out.println("TabulaPdfParser.parse-----------------------------------");
    File file = new File("D:/Pdfs/murree_ren.pdf");
    // try-with-resources closes the extractor, the document and the stream (in
    // that order) even when extraction fails part-way through.
    // NOTE(review): relies on ObjectExtractor being Closeable, which the
    // original @SuppressWarnings("resource") on this call implied — confirm
    // against the tabula-java version in use.
    try (FileInputStream inputStream = new FileInputStream(file);
         PDDocument document = PDDocument.load(inputStream);
         ObjectExtractor extractor = new ObjectExtractor(document)) {
        System.out.println("TabulaPdfParser.parse--------------------document loaded---------------");
        SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
        PageIterator pi = extractor.extract();
        // iterate over the pages of the document
        while (pi.hasNext()) {
            Page page = pi.next();
            List<Table> tables = sea.extract(page);
            System.out.println("TabulaPdfParser.parse------------------||||-----------------table size=" + tables.size());
            // iterate over the tables of the page
            for (Table table : tables) {
                int rowCount = table.getRowCount();
                int colCount = table.getColCount();
                System.out.println("TabulaPdfParser.parse------------getRowCount=" + rowCount + " colcount=" + colCount);
                for (int i = 0; i < rowCount; i++) {
                    StringBuilder rowText = new StringBuilder();
                    for (int j = 0; j < colCount; j++) {
                        // Separator goes between cells only, never after the last one.
                        if (j > 0) {
                            rowText.append('|');
                        }
                        // Carriage returns inside a cell become spaces so each row stays on one line.
                        rowText.append(table.getCell(i, j).getText().replace("\r", " "));
                    }
                    System.out.println("RowText:----------row no=" + i + " str=" + rowText);
                }
            }
        }
    } catch (Exception ex) {
        // Print the exception itself (type + message): getMessage() alone is
        // frequently null and hides what actually went wrong.
        System.out.println("Exception:---------------------------------------=" + ex);
        ex.printStackTrace(System.out);
    }
}
Here is the output:
TabulaPdfParser.parse----------------------------------- TabulaPdfParser.parse--------------------document loaded--------------- TabulaPdfParser.parse------------------||||-----------------table size=109 TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=2 colcount=7 RowText:----------row no=0 str=1|39167 24/11/2019|MUHAMMAD MATLOOB|MUHAMMAD TAJ|VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|6/6/1976|F.A RowText:----------row no=1 str=|||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| 
TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=2 colcount=7 RowText:----------row no=0 str=3|455 18/06/2020|WAHEED ANWAR|ABDUL QADOUS|P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB|12/9/1954|MATRIC RowText:----------row no=1 str=|||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=WAHEED ANWAR| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=ABDUL QADOUS RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. 
AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=5 RowText:----------row no=0 str=|||| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=5|61134 2020-12-08|AZRA PARVEEN|MUHAMMAD TALIB|V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|30/10/1966|MATRIC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=61134 2020-12-08 RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=61134 2020-12-08 RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=AZRA PARVEEN| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| 
TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=8 RowText:----------row no=0 str=7|60305 25/01/2021|AZRA NAHEED|MANZOOR HUSSAIN|BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|17/03/1973|MATRIC| RowText:----------row no=1 str=||||||| RowText:----------row no=2 str=||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MANZOOR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MANZOOR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB| 
RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=3 colcount=9 RowText:----------row no=0 str=9|59463 31/07/2021|MUNAWAR HUSSAIN|MUHAMMAD ABDULLAH|H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|01/03/1974|MATRIC|| RowText:----------row no=1 str=|||||||| RowText:----------row no=2 str=|||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUNAWAR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUHAMMAD ABDULLAH RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 
str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=11|58306 10/01/2022|MUBASHAR ISHAQ QAMAR|MUHAMMAD ISHAQ|VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|26/02/1992|FSC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUBASHAR ISHAQ QAMAR RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row 
no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=FSC RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=12 RowText:----------row no=0 str=13|49597 19/06/2022|TAHIR MEHBOOB|MUHAMMAD MEHBOOB|GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|15/7/1988|ICS , FSC HOMEO||||| RowText:----------row no=1 str=||||||||||| RowText:----------row no=2 str=||||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=TAHIR MEHBOOB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=TAHIR MEHBOOB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=TAHIR MEHBOOB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=MUHAMMAD MEHBOOB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=ICS , FSC HOMEO RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=15|53404 19/03/2023|MUHAMMAD AJMAL MALIK|MAHMOOD AHMED MALIK|VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|26/2/1961|MATRIC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=MUHAMMAD AJMAL MALIK|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MAHMOOD AHMED MALIK RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=||