From ec460f85df37f2dec67021c526756484948c05c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20V=C3=B6gele?=
Date: Tue, 29 Mar 2022 10:22:01 +0200
Subject: [PATCH] Initial commit

---
 .editorconfig |  7 ++++
 .gitignore    |  1 +
 Cargo.lock    | 42 ++++++++++++++++++++++++
 Cargo.toml    |  9 ++++++
 rustfmt.toml  |  1 +
 src/main.rs   | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 150 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 rustfmt.toml
 create mode 100644 src/main.rs

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..35c5fb2
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,7 @@
+root = true
+
+[*]
+end_of_line = lf
+indent_style = tab
+insert_final_newline = true
+trim_trailing_whitespace = true
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..a8a1b99
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,42 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "memchr"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
+
+[[package]]
+name = "pdfextractor"
+version = "0.1.0"
+dependencies = [
+ "regex",
+]
+
+[[package]]
+name = "regex"
+version = "1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "pdfextractor"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+regex = "1.5.5"
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,90 @@
+//! Extracts the raw image streams embedded in a PDF file.
+//!
+//! Usage: `pdfextractor <file>`. Every stream found is written unchanged to
+//! `out/<n>`, where `<n>` numbers the streams from 0 in file order.
+
+use std::{
+	env::args,
+	fs::{self, File},
+	io::Read,
+	path::Path,
+};
+
+use regex::bytes::Regex;
+
+/// Keyword line expected between an object's dictionary and its stream data.
+const STREAM_KEYWORD: &[u8] = b"stream\n";
+
+fn main() {
+	let path = args().nth(1).expect("no file given");
+	let mut data = Vec::new();
+	File::open(path)
+		.expect("Cannot open file")
+		.read_to_end(&mut data)
+		.expect("Cannot read file");
+
+	// `fs::write` does not create missing parent directories.
+	let out_dir = Path::new("out");
+	fs::create_dir_all(out_dir)
+		.expect("Cannot create output directory");
+	for (index, stream) in image_streams(&data).into_iter().enumerate() {
+		fs::write(out_dir.join(index.to_string()), stream)
+			.expect("Cannot write image file");
+	}
+}
+
+/// Returns the stream bytes of every image XObject in `data`, in file order.
+///
+/// An object is only recognised when its dictionary is the single line after
+/// the `<id> <gen> obj` header, names `/Image` and `/Type /XObject`, and is
+/// followed by a `stream` line. Objects that do not fit are skipped with a
+/// note on stderr rather than a panic.
+fn image_streams(data: &[u8]) -> Vec<&[u8]> {
+	let re_obj_begin = Regex::new(r"\d+ \d+ obj\n").unwrap();
+	// NOTE(review): an indirect length such as `/Length 12 0 R` also matches
+	// and is read as 12; references are not resolved here.
+	let re_len = regex::Regex::new(r"/Length (\d+)").unwrap();
+
+	let mut streams = Vec::new();
+	let mut rest = data;
+	while let Some(header) = re_obj_begin.find(rest) {
+		rest = &rest[header.end()..];
+		// Split off the line that follows the object header.
+		let line_len = match rest.iter().position(|&byte| byte == b'\n') {
+			Some(newline) => newline + 1,
+			None => break,
+		};
+		let (line, tail) = rest.split_at(line_len);
+		rest = tail;
+		// Lossy decoding: the line may hold bytes that are not valid UTF-8.
+		let line = String::from_utf8_lossy(line);
+		if !line.contains("/Image") || !line.contains("/Type /XObject") {
+			continue;
+		}
+		let length = re_len
+			.captures(&line)
+			.and_then(|caps| caps[1].parse::<usize>().ok());
+		let length = match length {
+			Some(length) => length,
+			None => {
+				eprintln!("skipping image object without a usable /Length");
+				continue;
+			}
+		};
+		let body = match rest.strip_prefix(STREAM_KEYWORD) {
+			Some(body) => body,
+			None => {
+				eprintln!("skipping image object without a stream line");
+				continue;
+			}
+		};
+		if length > body.len() {
+			eprintln!("image stream runs past the end of the file; stopping");
+			break;
+		}
+		let (stream, tail) = body.split_at(length);
+		streams.push(stream);
+		rest = tail;
+	}
+	streams
+}