從 PDF 中讀取表格數據並寫入 DataTable

首先引入第三方組件 Tabula。這是一個開源組件,基於 PdfPig 組件實現。

        /// <summary>
        /// Extracts every table found on a range of pages of a PDF file and
        /// converts each table into a <see cref="DataTable"/> of string columns.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file to open.</param>
        /// <param name="startNumber">First page to scan (1-based, inclusive).</param>
        /// <param name="endNumber">
        /// Last page to scan (inclusive). A value beyond the end of the document
        /// is clamped to the last page. If it is smaller than
        /// <paramref name="startNumber"/> no page is scanned and an empty list is returned.
        /// </param>
        /// <returns>
        /// One <see cref="DataTable"/> per detected table, in page order. Columns are
        /// named "0", "1", ... and every value is the text of the corresponding cell.
        /// </returns>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="startNumber"/> is less than 1 or greater than the number of pages.
        /// </exception>
        private List<DataTable> ExtractTables(string pdfPath, int startNumber, int endNumber)
        {
            // No try/catch here: the previous "catch (Exception ex) { throw ex; }" only
            // reset the stack trace. Letting exceptions propagate keeps the original trace.
            using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(pdfPath, new ParsingOptions() { ClipPaths = true }))
            {
                int pageCount = document.NumberOfPages;

                // The start page must exist in the document.
                if (startNumber < 1 || startNumber > pageCount)
                {
                    throw new IndexOutOfRangeException("頁碼超出範圍!");
                }

                // An end page past the document is clamped rather than rejected, so that
                // callers can ask for "everything from startNumber onwards".
                if (endNumber > pageCount)
                {
                    endNumber = pageCount;
                }

                ObjectExtractor extractor = new ObjectExtractor(document);
                IExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
                List<DataTable> dtList = new List<DataTable>();

                for (int pageNumber = startNumber; pageNumber <= endNumber; pageNumber++)
                {
                    PageArea page = extractor.Extract(pageNumber);
                    List<Table> tables = algorithm.Extract(page);

                    foreach (Table table in tables)
                    {
                        DataTable dt = new DataTable();
                        int columnCount = table.ColumnCount;

                        // Columns are named after their zero-based index.
                        for (int col = 0; col < columnCount; col++)
                        {
                            dt.Columns.Add(col.ToString(), typeof(string));
                        }

                        foreach (IReadOnlyList<Cell> row in table.Rows)
                        {
                            DataRow dr = dt.NewRow();

                            // Guard against a row that is shorter than the table width;
                            // any missing cells are left as DBNull.
                            int cellCount = row.Count < columnCount ? row.Count : columnCount;
                            for (int col = 0; col < cellCount; col++)
                            {
                                // Store the cell's text, not the Cell object itself:
                                // assigning the object would go through Cell.ToString().
                                dr[col] = row[col].GetText();
                            }

                            dt.Rows.Add(dr);
                        }

                        dtList.Add(dt);
                    }
                }

                return dtList;
            }
        }

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章