I would like to extract the text of each page of a PDF file, but the PDF crates I have looked at are very poorly documented, and I don't see a way to do that.
I've tried the following:
let y = file::File::open(s).unwrap();
let y = file::File::pages(&y);
for i in y {
println!("{:?}",i);
}
The problem is that this gives me an iterator over each page's information rather than over each page's text.
What should I do to get just the text of each page?
A typical page, as printed by this code, looks like this:
Ok(PageRc(RcRef { inner: PlainRef { id: 5, gen: 0 }, data: Leaf(Page { parent: PagesRc(RcRef { inner: PlainRef { id: 1, gen: 0 }, data: Tree(PageTree { parent: None, kids: [Ref(3), Ref(5), Ref(7)], count: 3, resources: None, media_box: Some(Rect { left: 0.0, bottom: 0.0, right: 595.28, top: 841.89 }), crop_box: None }) }), resources: Some(Indirect(RcRef { inner: PlainRef { id: 2, gen: 0 }, data: Resources { graphics_states: {"GS1": GraphicsStateParameters {
line_width: None, line_cap: None, line_join: None, miter_limit: None, rendering_intent: None, font: None, _other: {
BM: /Normal
CA: 1
Type: /ExtGState
ca: 1
} }}, color_spaces: {}, xobjects: {"I3": Ref(18), "I1": Ref(16), "I2": Ref(17)}, fonts: {"F1": Ref(13), "F2": Ref(14), "F3": Ref(15)}, properties: {} } })), media_box: Some(Rect { left: 0.0, bottom: 0.0, right: 595.28, top: 841.89
}), crop_box: None, trim_box: Some(Rect { left: 0.0, bottom: 0.0, right: 595.28, top: 841.89 }), contents: Some(Content { operations: [Operation { operator: "J", operands: [Integer(2)] }, Operation { operator: "w", operands: [Number(0.567)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F1"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "g", operands: [Number(1.0)] }, Operation { operator: "G", operands: [Number(0.0)] }, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "w", operands: [Number(0.567)]
}, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "d", operands: [Array([]), Integer(0)] }, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F3"), Number(16.5)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation
{ operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation {
operator: "Td", operands: [Number(56.693), Number(741.078)] }, Operation { operator: "Tj", operands: [String("Artificial Intelligence: A Modern Approach (4th Edition) PDF")] }, Operation { operator: "ET", operands: [] }, Operation
{ operator: "Q", operands: [] }, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator:
"Tf", operands: [Name("F1"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F3"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation
{ operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(683.076)] }, Operation { operator: "Tj", operands: [String("Artificial Intelligence: A Modern Approach (4th Edition) by by Stuart Russell, Peter Norvig")] }, Operation { operator: "ET", operands: [] }, Operation
{ operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] },
Operation { operator: "Tf", operands: [Name("F2"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.094)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.441)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(668.521)] }, Operation { operator: "Tj", operands: [String("This Artificial Intelligence: A Modern Approach (4th Edition) book is not really ordinary book, you")] }, Operation { operator:
"ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation {
operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.288)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.98)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(654.025)] }, Operation { operator: "Tj", operands: [String("have it then the world is in your hands. The benefit you get by reading this book is actually")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator:
"Tc", operands: [Number(0.056)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.266)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation
{ operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(639.528)] }, Operation { operator: "Tj", operands: [String("information inside this reserve incredible fresh, you will get information
which is getting deeper an")] }, Operation { operator: "ET", operands: [] },
Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.272)] }, Operation
{ operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(1.161)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(625.032)] }, Operation { operator: "Tj", operands: [String("individual read a lot of information you will get. This kind of Artificial Intelligence: A Modern")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.174)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.721)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(610.536)] }, Operation { operator: "Tj", operands: [String("Approach (4th Edition) without we recognize teach the one who looking at it become critical
in")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.128)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.677)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(596.04)] }, Operation { operator: "Tj", operands:
[String("imagining and analyzing. Don\x92t be worry Artificial Intelligence:
A Modern Approach (4th Edition)")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.055)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands:
[] }, Operation { operator: "Tw", operands: [Number(0.193)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(581.543)] }, Operation { operator: "Tj", operands: [String("can bring any time you are and not make your tote space or bookshelves\x92 grow to be full because")] }, Operation
{ operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] },
Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.165)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.704)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(567.047)] }, Operation { operator: "Tj", operands: [String("you can have it inside your lovely laptop even cell phone. This Artificial Intelligence: A Modern")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Number(0.165)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Number(0.688)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(552.551)] }, Operation { operator: "Tj",
operands: [String("Approach (4th Edition) having great arrangement in word and layout, so you will not really feel")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tc", operands: [Integer(0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tw", operands: [Integer(0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "g", operands: [Number(0.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(538.055)] }, Operation { operator: "Tj", operands: [String("uninterested in reading.")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "Q", operands: []
}, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F1"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F2"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "rg", operands: [Number(0.0), Number(0.0), Number(1.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(511.238)] }, Operation { operator: "Tj", operands: [String("->>>Download: Artificial Intelligence: A Modern Approach (4th Edition) PDF")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "RG", operands: [Number(0.0), Number(0.0), Number(1.0)] }, Operation { operator: "w", operands: [Number(0.55)] }, Operation { operator: "j", operands: [Integer(0)] }, Operation { operator: "J", operands: [Integer(0)]
}, Operation { operator: "m", operands: [Number(56.693), Number(509.863)] },
Operation { operator: "l", operands: [Number(423.202), Number(509.863)] }, Operation { operator: "S", operands: [] }, Operation { operator: "w", operands: [Number(0.567)] }, Operation { operator: "j", operands: [Integer(2)] }, Operation { operator: "J", operands: [Integer(2)] }, Operation { operator: "g", operands: [Number(1.0)] }, Operation { operator: "Q", operands: [] }, Operation { operator: "gs", operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F1"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "gs",
operands: [Name("GS1")] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Tf", operands: [Name("F2"), Number(11.0)] }, Operation { operator: "ET", operands: [] }, Operation { operator: "q", operands: [] }, Operation { operator: "rg", operands: [Number(0.0), Number(0.0), Number(1.0)] }, Operation { operator: "Tr", operands: [Integer(0)] }, Operation { operator: "BT", operands: [] }, Operation { operator: "Td", operands: [Number(56.693), Number(484.422)] }, Operation { operator: "Tj", operands: [String("->>>Read Online: Artificial Intelligence: A Modern Approach (4th Edition) PDF")] }, Operation { operator: "ET", operands: [] }, Operation { operator: "RG", operands:
[Number(0.0), Number(0.0), Number(1.0)] }, Operation { operator: "w", operands: [Number(0.55)] }, Operation { operator: "j", operands: [Integer(0)] }, Operation { operator: "J", operands: [Integer(0)] }, Operation { operator: "m",
operands: [Number(56.693), Number(483.047)] }, Operation { operator: "l", operands: [Number(435.434), Number(483.047)] }, Operation { operator: "S", operands: [] }, Operation { operator: "w", operands: [Number(0.567)] }, Operation
{ operator: "j", operands: [Integer(2)] }, Operation { operator: "J", operands: [Integer(2)] }, Operation { operator: "g", operands: [Number(1.0)] }, Operation { operator: "Q", operands: [] }, Operation { operator: "gs", operands:
[Name("GS1")] }] }) }) }))
UPD:
I have found a way to get a page's content, but the result is strange: besides the text, it contains many extra symbols.
let y = file::File::open(s).unwrap();
//let y = file::File::open(&y);
//let how_many = y.get_page(n);
//println!("{:?}",parser::parse(y));
//for page in y{
// println!("{:?}",page);
//}
let mut doc = lopdf::Document::load("pdf_experiment.pdf").unwrap();
println!("{:?}",doc.get_pages());
//for i in doc.get_pages(){
let c = doc.get_page_content((7,0));
println!("{:?}",std::str::from_utf8(&c.unwrap()[0..1000]));
The output I get:
{1: (3, 0), 2: (5, 0), 3: (7, 0)}
Ok(" 2 J\n0.567 w\nBT /F1 11.000 Tf ET\n0.567 w\n0.000 G\n0 Tr\n1.000 g\n[] 0 d\n/GS1 gs\nBT /F3 12.870 Tf ET\nq 0.000 g 0 Tr BT 56.693 747.067 Td (Artificial Intelligence: A Modern Approach \\(4th Edition\\) Review) Tj ET Q\n/GS1 gs\nBT /F1 11.000 Tf ET\n/GS1 gs\nBT /F2 11.000 Tf ET\nBT 0.094 Tc ET\nBT 0.441 Tw ET\nq 0.000 g 0 Tr BT 56.693 707.253 Td (This Artificial Intelligence: A Modern Approach \\(4th Edition\\) book is not really ordinary book, you) Tj ET Q\nBT 0.288 Tc ET\nBT 0.980 Tw ET\nq 0.000 g 0 Tr BT 56.693 692.757 Td (have it then the world is in your hands. The benefit you get by reading this book is actually) Tj ET
Q\nBT 0.056 Tc ET\nBT 0.266 Tw ET\nq 0.000 g 0 Tr BT 56.693 678.261 Td (information inside this reserve incredible fresh, you will get information which is getting deeper an) Tj ET Q\nBT 0.272 Tc ET\nBT 1.161 Tw ET\nq 0.000 g 0 Tr BT 56.693 663.764 Td (individual read a lot of information you will get. This kind of Artificial Intelligence: A
Modern) Tj ET Q\nBT 0.174 Tc ET\nBT 0.721 T")
What should I do to get the plain text? I know about the extract-pdf crate, but I would like to iterate over the file page by page, whereas extract-pdf just reads the contents of the whole PDF file.