Initial commit
This commit is contained in:
7
.editorconfig
Normal file
7
.editorconfig
Normal file
@@ -0,0 +1,7 @@
|
||||
root = true
|
||||
|
||||
[*]
|
||||
end_of_line = lf
|
||||
indent_style = tab
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/target
|
||||
42
Cargo.lock
generated
Normal file
42
Cargo.lock
generated
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
|
||||
|
||||
[[package]]
|
||||
name = "pdfextractor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "pdfextractor"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
regex = "*"
|
||||
1
rustfmt.toml
Normal file
1
rustfmt.toml
Normal file
@@ -0,0 +1 @@
|
||||
hard_tabs = true
|
||||
60
src/main.rs
Normal file
60
src/main.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use std::{
|
||||
env::args,
|
||||
fs::File,
|
||||
io::{Read, Write},
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use regex::bytes::Regex;
|
||||
|
||||
fn main() {
|
||||
let path = args().nth(1).expect("no file given");
|
||||
let mut data = Vec::new();
|
||||
File::open(path)
|
||||
.expect("Cannot open file")
|
||||
.read_to_end(&mut data)
|
||||
.unwrap();
|
||||
|
||||
let re_obj_begin = Regex::new(r"\d+ \d+ obj\n").unwrap();
|
||||
let re_line = Regex::new(r"[^\n]*\n").unwrap();
|
||||
let re_len = regex::Regex::new(r"/Length (\d+)").unwrap();
|
||||
|
||||
let mut data = &data[..];
|
||||
let mut filecount: usize = 0;
|
||||
loop {
|
||||
let m = match re_obj_begin.find(&data) {
|
||||
Some(m) => m,
|
||||
None => break,
|
||||
};
|
||||
data = &data[m.end()..];
|
||||
let m = re_line.find(&data).unwrap();
|
||||
let line = String::from_utf8(m.as_bytes().to_owned()).unwrap();
|
||||
data = &data[m.end()..];
|
||||
if !line.contains("/Image") {
|
||||
continue;
|
||||
}
|
||||
if !line.contains("/Type /XObject") {
|
||||
continue;
|
||||
}
|
||||
let length = re_len
|
||||
.captures(&line)
|
||||
.unwrap()
|
||||
.get(1)
|
||||
.unwrap()
|
||||
.as_str()
|
||||
.parse()
|
||||
.unwrap();
|
||||
data = &data[7..];
|
||||
let stream = &data[..length];
|
||||
data = &data[length..];
|
||||
File::create(
|
||||
["out".to_string(), filecount.to_string()]
|
||||
.iter()
|
||||
.collect::<PathBuf>(),
|
||||
)
|
||||
.unwrap()
|
||||
.write_all(stream)
|
||||
.unwrap();
|
||||
filecount += 1;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user