Initial commit

This commit is contained in:
2022-03-29 10:22:01 +02:00
commit ec460f85df
6 changed files with 120 additions and 0 deletions

7
.editorconfig Normal file
View File

@@ -0,0 +1,7 @@
root = true
[*]
end_of_line = lf
indent_style = tab
insert_final_newline = true
trim_trailing_whitespace = true

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/target

42
Cargo.lock generated Normal file
View File

@@ -0,0 +1,42 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
[[package]]
name = "memchr"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "pdfextractor"
version = "0.1.0"
dependencies = [
"regex",
]
[[package]]
name = "regex"
version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"

9
Cargo.toml Normal file
View File

@@ -0,0 +1,9 @@
[package]
name = "pdfextractor"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Constrained to the 1.x series (>= 1.5) instead of the wildcard "*", which
# would also accept any future semver-incompatible major release.
regex = "1.5"

1
rustfmt.toml Normal file
View File

@@ -0,0 +1 @@
hard_tabs = true

60
src/main.rs Normal file
View File

@@ -0,0 +1,60 @@
use std::{
env::args,
fs::File,
io::{Read, Write},
path::PathBuf,
};
use regex::bytes::Regex;
/// Entry point: extracts image XObject streams from the PDF named by the first
/// command-line argument and writes each one to `out/<index>`.
///
/// Any fatal error (missing argument, unreadable input, unwritable output) is
/// reported on stderr and the process exits with status 1 instead of panicking.
fn main() {
	if let Err(err) = run() {
		eprintln!("error: {}", err);
		std::process::exit(1);
	}
}

/// Scans the input file for `N G obj` headers, and for every object whose
/// dictionary line mentions both `/Image` and `/Type /XObject`, copies the
/// `/Length` bytes that follow the `stream` keyword into `out/<index>`.
///
/// Malformed objects (no `/Length`, no `stream` keyword, or a length that runs
/// past the end of the file) are skipped with a warning rather than aborting.
///
/// # Errors
///
/// Returns an error when no file argument was given, the input cannot be
/// opened or read, or an output file or the `out` directory cannot be written.
fn run() -> Result<(), Box<dyn std::error::Error>> {
	let path = args().nth(1).ok_or("no file given")?;

	let mut data = Vec::new();
	File::open(&path)
		.map_err(|e| format!("Cannot open file {}: {}", path, e))?
		.read_to_end(&mut data)
		.map_err(|e| format!("Cannot read file {}: {}", path, e))?;

	let re_obj_begin = Regex::new(r"\d+ \d+ obj\n")?;
	let re_line = Regex::new(r"[^\n]*\n")?;
	let re_len = regex::Regex::new(r"/Length (\d+)")?;

	// `rest` is the not-yet-scanned tail of the file; it only ever shrinks.
	let mut rest = &data[..];
	let mut filecount: usize = 0;

	while let Some(header) = re_obj_begin.find(rest) {
		rest = &rest[header.end()..];

		// The object dictionary is expected on the single line after the header.
		// An object header on the very last line has nothing after it to inspect.
		let dict = match re_line.find(rest) {
			Some(m) => m,
			None => break,
		};
		// Borrows when the bytes are valid UTF-8, so no copy is made in that case.
		let line = String::from_utf8_lossy(dict.as_bytes());
		rest = &rest[dict.end()..];

		if !line.contains("/Image") || !line.contains("/Type /XObject") {
			continue;
		}

		// NOTE(review): an indirect length such as `/Length 12 0 R` would be read
		// as the literal 12 here — confirm whether such files must be supported.
		let length = re_len
			.captures(&line)
			.and_then(|caps| caps.get(1))
			.and_then(|m| m.as_str().parse::<usize>().ok());
		let length = match length {
			Some(n) => n,
			None => {
				eprintln!("warning: image object without a usable /Length, skipping");
				continue;
			}
		};

		// Only consume the keyword when it is really there; it may be terminated
		// by either CR LF or a lone LF.
		let body = match rest
			.strip_prefix(b"stream\r\n")
			.or_else(|| rest.strip_prefix(b"stream\n"))
		{
			Some(body) => body,
			None => {
				eprintln!("warning: image object is not followed by a stream, skipping");
				continue;
			}
		};

		// A declared length larger than the remaining input must not panic.
		let stream = match body.get(..length) {
			Some(bytes) => bytes,
			None => {
				eprintln!(
					"warning: /Length {} exceeds the remaining {} bytes, skipping",
					length,
					body.len()
				);
				rest = body;
				continue;
			}
		};
		rest = &body[length..];

		// Create the output directory lazily so nothing is created when the
		// input holds no images.
		if filecount == 0 {
			std::fs::create_dir_all("out")
				.map_err(|e| format!("Cannot create directory out: {}", e))?;
		}
		let out_path = PathBuf::from("out").join(filecount.to_string());
		File::create(&out_path)
			.and_then(|mut file| file.write_all(stream))
			.map_err(|e| format!("Cannot write {}: {}", out_path.display(), e))?;
		filecount += 1;
	}

	Ok(())
}