Repository URL to install this package:
|
Version:
24.8.8 ▾
|
| .. |
| .vscode |
| lib |
| node_modules |
| test |
| .editorconfig |
| .gitlab-ci.yml |
| .jsbeautifyrc |
| CHANGELOG |
| CONTRIBUTING.md |
| LICENSE |
| NOTES.md |
| QUICKSTART.js |
| README.md |
| index.js |
| package.json |
Pure JavaScript cross-platform module to extract text from PDFs.
npm install pdf-parse
const fs = require('fs'); const pdf = require('pdf-parse'); let dataBuffer = fs.readFileSync('path to PDF file...'); pdf(dataBuffer).then(function(data) { // number of pages console.log(data.numpages); // number of rendered pages console.log(data.numrender); // PDF info console.log(data.info); // PDF metadata console.log(data.metadata); // PDF.js version // check https://mozilla.github.io/pdf.js/getting_started/ console.log(data.version); // PDF text console.log(data.text); });
You can use crawler-request, which uses pdf-parse.
const fs = require('fs'); const pdf = require('pdf-parse'); let dataBuffer = fs.readFileSync('path to PDF file...'); pdf(dataBuffer).then(function(data) { // use data }) .catch(function(error){ // handle exceptions })
// default render callback function render_page(pageData) { //check documents https://mozilla.github.io/pdf.js/ let render_options = { //replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`. normalizeWhitespace: false, //do not attempt to combine same line TextItem's. The default value is `false`. disableCombineTextItems: false } return pageData.getTextContent(render_options) .then(function(textContent) { let lastY, text = ''; for (let item of textContent.items) { if (lastY == item.transform[5] || !lastY){ text += item.str; } else{ text += '\n' + item.str; } lastY = item.transform[5]; } return text; }); } let options = { pagerender: render_page } let dataBuffer = fs.readFileSync('path to PDF file...'); pdf(dataBuffer,options).then(function(data) { //use new format });
const DEFAULT_OPTIONS = { // internal page parser callback // you can set this option, if you need another format except raw text pagerender: render_page, // max page number to parse max: 0, //check https://mozilla.github.io/pdf.js/getting_started/ version: 'v1.10.100' }
Set this option if you need an output format other than raw text.
Maximum number of pages to parse. If the value is less than or equal to 0, the parser renders all pages.
check pdf.js
Supported values: 'default', 'v1.9.426', 'v1.10.100', 'v1.10.88', 'v2.0.550'. 'default' uses version v1.10.100.
mozilla.github.io/pdf.js
Run mocha or npm test. I use this package actively myself, so it has my top priority. You can chat on WhatsApp about any information, ideas and suggestions.
If you find a bug or a mistake, you can help by submitting an issue to the GitLab repository.
GitLab calls it a merge request instead of a pull request.
MIT licensed, and all its dependencies are MIT or BSD licensed.