Browse Source

First commit

master
Stephen 8 months ago
commit
15748dc9f4
12 changed files with 1040 additions and 0 deletions
  1. +2
    -0
      .gitignore
  2. +306
    -0
      Cargo.lock
  3. +16
    -0
      Cargo.toml
  4. +16
    -0
      README.md
  5. +48
    -0
      src/bin/explorer.rs
  6. +112
    -0
      src/bin/longestpath.rs
  7. +125
    -0
      src/bin/wiki2graph.rs
  8. +128
    -0
      src/extractor.rs
  9. +9
    -0
      src/lib.rs
  10. +142
    -0
      src/minivec.rs
  11. +115
    -0
      src/process.rs
  12. +21
    -0
      src/wiki.rs

+ 2
- 0
.gitignore View File

@ -0,0 +1,2 @@
/target
/.idea/

+ 306
- 0
Cargo.lock View File

@ -0,0 +1,306 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "aho-corasick"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
[[package]]
name = "bincode"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f30d3a39baa26f9651f17b375061f3233dde33424a8b72b0dbe93a68a0bc896d"
dependencies = [
"byteorder",
"serde",
]
[[package]]
name = "byteorder"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
[[package]]
name = "cfg-if"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "crossbeam-deque"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
"maybe-uninit",
]
[[package]]
name = "crossbeam-epoch"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"lazy_static",
"maybe-uninit",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-queue"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570"
dependencies = [
"cfg-if",
"crossbeam-utils",
"maybe-uninit",
]
[[package]]
name = "crossbeam-utils"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8"
dependencies = [
"autocfg",
"cfg-if",
"lazy_static",
]
[[package]]
name = "either"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "hermit-abi"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3deed196b6e7f9e44a2ae8d94225d80302d81208b1bb673fd21fe634645c85a9"
dependencies = [
"libc",
]
[[package]]
name = "itoa"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2f02823cf78b754822df5f7f268fb59822e7296276d3e069d8e8cb26a14bd10"
[[package]]
name = "maybe-uninit"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "memchr"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
[[package]]
name = "memoffset"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c198b026e1bbf08a937e94c6c60f9ec4a2267f5b0d2eec9c1b21b061ce2be55f"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "proc-macro2"
version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280"
dependencies = [
"crossbeam-deque",
"crossbeam-queue",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "regex"
version = "1.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
name = "regex-syntax"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
[[package]]
name = "ryu"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "serde"
version = "1.0.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0be94b04690fbaed37cddffc5c134bf537c8e3329d53e982fe04c374978f8e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "164eacbdb13512ec2745fb09d51fd5b22b0d65ed294a1dcf7285a360c80a675c"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "syn"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4cdb98bcb1f9d81d07b536179c269ea15999b5d14ea958196413869445bb5250"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static",
]
[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "wikigraph"
version = "0.1.0"
dependencies = [
"bincode",
"fnv",
"lazy_static",
"rayon",
"regex",
"serde",
"serde_json",
"xml-rs",
]
[[package]]
name = "xml-rs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"

+ 16
- 0
Cargo.toml View File

@ -0,0 +1,16 @@
[package]
name = "wikigraph"
version = "0.1.0"
authors = ["Stephen <stephen@stephendownward.ca>"]
edition = "2018"
[dependencies]
xml-rs="0.8.3"
rayon="1.3"
serde = { version = "1.0", features = ["derive"] }
serde_json="1.0"
regex = "1.3"
lazy_static = "1.4"
bincode = "1.3"
fnv = "1.0"

+ 16
- 0
README.md View File

@ -0,0 +1,16 @@
# Wikigraph
Some utilities designed to take a wikipedia xml dump and perform some analysis. These are not production-grade tools - they are basically just experiments.
This software is very RAM-intensive! I have 32GB of RAM and I found myself using all of it at some points.
## Applications
This repo contains 3 applications:
- `wiki2graph`: Takes an XML file and converts it to a collection of nodes and edges (a graph), which is saved as a file.
- `explorer`: Loads a graph and, given the name of a source article, lists all links out of that article. This is primarily used for debugging.
- `longestpath`: Takes a graph and finds the longest path from an article to a target article. The target article is specified in the source code. The default is "Vacuum Tube".

+ 48
- 0
src/bin/explorer.rs View File

@ -0,0 +1,48 @@
extern crate wikigraph;
use wikigraph::process::Graph;
use std::env;
use std::io;
use std::io::prelude::*;
/// Interactive explorer: loads a bincode-serialized graph, then repeatedly
/// prompts for an article title and prints the titles of every article it
/// links to. Exits when stdin is closed.
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        println!("Usage: ./{} bincode_file", args[0]);
        return;
    }
    println!("Loading graph...");
    let graph = Graph::load_bin(&args[1]);
    println!("Done. {} nodes loaded.", graph.nodes.len());
    // Print all article titles
    // graph.nodes.iter().for_each(|(_, v)| { println!("{}", v) });
    loop {
        print!("Enter an article name: ");
        // print! does not flush, so force the prompt out before blocking on stdin.
        io::stdout().flush().unwrap();
        let mut buffer = String::new();
        match io::stdin().read_line(&mut buffer) {
            Ok(_) => {
                let buffer = buffer.trim();
                // Lookup id from article name. O(n) time — `nodes` maps
                // id -> title, so a reverse lookup is a full scan.
                // NOTE(review): wiki2graph stores titles lower-cased, so the
                // user presumably has to type the lower-cased form — confirm.
                match graph.nodes.iter().find(|(_, v)| { v == &&buffer }) {
                    Some((&id, _)) => {
                        // Lookup edges: linear scan of the whole edge list
                        // for edges leaving this article.
                        graph.edges.iter()
                            .filter(|edge| { edge.from == id })
                            .map(|edge| { edge.to })
                            .for_each(|target_id| {
                                println!("-> {}", graph.nodes[&target_id]);
                            });
                    }
                    None => {
                        println!("Could not find that article!");
                        continue;
                    }
                }
            }
            // stdin closed or read error: leave the prompt loop.
            Err(_) => { break; }
        }
    }
}

+ 112
- 0
src/bin/longestpath.rs View File

@ -0,0 +1,112 @@
extern crate wikigraph;
use wikigraph::{ process::Graph, minivec::MiniVec };
use rayon::prelude::*;
use fnv::FnvHashMap; // TODO use this more! HashMaps should all be removed
use std::collections::HashMap;
use std::env;
// Calculates articles that are farthest away from "Vacuum Tube"
// in the direction of (farthest article) --> Vacuum Tube
/// Loads a graph, reverses every edge, then runs Dijkstra's algorithm from
/// "vacuum tube" to find the articles whose shortest path *to* it is longest.
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        println!("Usage: ./{} bincode_file", args[0]);
        return;
    }
    println!("Loading graph...");
    let mut graph = Graph::load_bin(&args[1]);
    println!("Done.\r\nReversing direction of edges...");
    // Flip every edge so a forward search from the target explores the
    // articles that can reach it.
    graph.edges.par_iter_mut().for_each(|mut e| {
        let tmp = e.from;
        e.from = e.to;
        e.to = tmp;
    });
    println!("Done.\r\nPreparing for Dijkstra's algorithm...");
    //Since the graph is reversed, this is really the end point
    let start = *graph.nodes.par_iter()
        .find_any(|(_, v)| { v == &&"vacuum tube".to_string() })
        .unwrap().0;
    // Distances: every node starts at "infinity" (usize::MAX); source at 0.
    let mut distances: FnvHashMap<usize, usize> = graph.nodes.par_iter().map(|(k, _)| {
        (*k, std::usize::MAX)
    }).collect();
    distances.insert(start, 0);
    // Neighbours: adjacency list keyed by source id; values are
    // (target, weight) pairs.
    let mut all_neighbours: FnvHashMap<usize, Vec<(usize, usize)>> = FnvHashMap::default();
    for e in &graph.edges {
        if !all_neighbours.contains_key(&e.from) {
            all_neighbours.insert(e.from, vec![(e.to, e.weight)]);
        }
        else {
            // TODO we can speed this up by getting the mutable directly
            // (entry API) instead of remove + push + reinsert
            let mut v = all_neighbours.remove(&e.from).unwrap();
            v.push((e.to, e.weight));
            all_neighbours.insert(e.from, v);
        }
    }
    graph.edges = vec![]; // We no longer need the edges, so let's save some RAM
    // Make sure every node is in all_neighbours, so the indexing inside the
    // main loop never panics on a node with no outgoing (reversed) edges.
    for (&n, _) in &graph.nodes {
        if !all_neighbours.contains_key(&n) {
            all_neighbours.insert(n, vec![]);
        }
    }
    //Unvisited nodes (used as a set; the () value is ignored)
    let mut unvisited_nodes: FnvHashMap<usize, ()> = graph.nodes.par_iter().map(|(k, _)| { (*k, ()) }).collect();
    // Frontier of tentatively-reached nodes, ordered by distance (MiniVec
    // supports cheap minimum extraction).
    let mut slightly_visited_nodes: MiniVec = MiniVec::new(HashMap::new());
    println!("Done.\r\nPerforming Dijkstra's algorithm...");
    let mut last_print = std::usize::MAX;
    let mut cur_node = start;
    while unvisited_nodes.len() > 0 {
        let neighbours = &all_neighbours[&cur_node];
        neighbours.iter().for_each(|(n, weight)| {
            if !unvisited_nodes.contains_key(n) { return; }
            // NOTE(review): a previous comment claimed "unweighted, so
            // distance + 1", but this adds the edge weight (the extractor
            // emits weight 0 for redirects/transclusions, 1 for links).
            let new_dist = distances[&cur_node] + weight;
            let old_dist = distances[n];
            if new_dist < old_dist {
                distances.insert(*n, new_dist);
                // Add into slightly_visited_nodes, while maintaining order
                slightly_visited_nodes.insert_element(*n, new_dist);
            }
        });
        unvisited_nodes.remove(&cur_node).unwrap();
        // Node is now *too* visited, so remove it from the frontier
        slightly_visited_nodes.remove_element(cur_node);
        // Progress report roughly every 1000 finalized nodes.
        if unvisited_nodes.len() + 1000 <= last_print {
            last_print = unvisited_nodes.len();
            println!("{} nodes remaining.", last_print);
        }
        // Empty frontier: everything left is unreachable, so stop early.
        if slightly_visited_nodes.len() == 0 { break; }
        cur_node = slightly_visited_nodes.get_min_distance();
    }
    println!("Done! Grabbing distances...");
    // Report reachable articles more than 5 hops away, skipping
    // non-article namespaces.
    for (id, dist) in distances {
        if dist > 5 && dist < std::usize::MAX {
            let s = &graph.nodes[&id];
            if !s.starts_with("category:") && !s.starts_with("wikipedia:") && !s.starts_with("template:") {
                println!("{} away: {}", dist, s);
            }
        }
    }
}

+ 125
- 0
src/bin/wiki2graph.rs View File

@ -0,0 +1,125 @@
extern crate wikigraph;
use wikigraph::process::{ GraphIntermediate, process_graph, save_graph };
use wikigraph::wiki::WikiPage;
use rayon::ThreadPoolBuilder;
use std::fs::File;
use std::io::BufReader;
use std::sync::{ Arc, Mutex };
use std::env;
use xml::reader::{ EventReader, XmlEvent };
// Where are we in the file? Tracks the streaming XML parser's position so
// character data can be routed to the right field of the page being built.
enum FileState {
    InRoot,      // top level of the dump, between pages
    InTitle,     // inside the dump-wide <sitename> element
    InPage,      // inside a <page>, outside its title/text
    InPageTitle, // inside a page's <title> element
    InPageBody   // inside a page's <text> element
}
/// Streams a Wikipedia XML dump, extracts the link graph page-by-page on a
/// rayon thread pool, and saves the result as a bincode file.
fn main() {
    // Process command line args
    let args: Vec<String> = env::args().collect();
    if args.len() != 3 {
        println!("Usage: ./{} wikipedia_xml_file output_bincode_file", args[0]);
        return;
    }
    let input_filename = &args[1];
    let output_filename = &args[2];
    println!("Converting {} to {}", input_filename, output_filename);
    let file = File::open(input_filename).unwrap();
    let file = BufReader::new(file);
    let parser = EventReader::new(file);
    let pool = ThreadPoolBuilder::new().build().unwrap();
    // Shared accumulator: spawned worker tasks merge each parsed page into it.
    let graph = Arc::new(Mutex::new(GraphIntermediate::new()));
    pool.install(|| {
        rayon::scope(|scope| {
            // Streaming state machine over the XML events.
            let mut state: FileState = FileState::InRoot;
            let mut cur_page = WikiPage::new();
            for e in parser {
                // Keeps the job queue from overflowing
                // Amazingly, Rayon has absolutely no way of dealing with this
                // There isn't even a method to get the current number of jobs in the queue ):
                while pool.current_thread_has_pending_tasks().unwrap() { }
                match e {
                    Ok(XmlEvent::StartElement { name, .. }) => {
                        let name = name.local_name;
                        // Advance the state machine on opening tags.
                        state = match (&state, name.as_str()) {
                            (FileState::InRoot, "sitename") => {
                                FileState::InTitle
                            }
                            (FileState::InRoot, "page") => {
                                FileState::InPage
                            }
                            (FileState::InPage, "title") => {
                                FileState::InPageTitle
                            }
                            (FileState::InPage, "text") => {
                                FileState::InPageBody
                            }
                            // Any other tag leaves the state unchanged.
                            _ => { state }
                        }
                    }
                    Ok(XmlEvent::EndElement { name, .. }) => {
                        let name = name.local_name;
                        match (&state, name.as_str()) {
                            (FileState::InTitle, "sitename") => {
                                state = FileState::InRoot;
                            }
                            (FileState::InPage, "page") => {
                                state = FileState::InRoot;
                                // Process page: hand the finished page to a
                                // worker task (the closure takes ownership of
                                // cur_page; the shared graph is cloned Arc).
                                scope.spawn(|_| { process_graph(graph.clone(), cur_page); });
                                // Reset page
                                cur_page = WikiPage::new();
                            }
                            (FileState::InPageTitle, "title") => {
                                state = FileState::InPage;
                            }
                            (FileState::InPageBody, "text") => {
                                state = FileState::InPage;
                                // Freeze text so that we don't accidentally combine revisions
                                cur_page.frozen = true;
                            }
                            _ => {}
                        }
                    }
                    Ok(XmlEvent::Characters(data)) => {
                        // Route character data according to the current state.
                        match state {
                            FileState::InTitle => {
                                println!("Title: {}", data);
                            }
                            FileState::InPageTitle => {
                                cur_page.title += &data;
                            }
                            FileState::InPageBody => {
                                cur_page.add_content(&data);
                            }
                            _ => { }
                        }
                    }
                    Err(e) => {
                        println!("Error: {}", e);
                        break;
                    }
                    _ => { }
                }
            }
        });
    });
    println!("Done! Saving...");
    // The scope has joined all workers, so this should be the only Arc
    // reference; unwrap it and write the graph to disk.
    if let Ok(graph) = Arc::try_unwrap(graph) {
        save_graph(graph.into_inner().unwrap(), output_filename);
    }
}

+ 128
- 0
src/extractor.rs View File

@ -0,0 +1,128 @@
// Extracts links from mediawiki markdown
use regex::Regex;
/// Extracts every link target from a page's mediawiki markup.
///
/// Returns (target, weight) pairs: weight 1 for ordinary `[[...]]` links,
/// weight 0 for redirect targets and `{{...}}` transclusions. A page that
/// is a redirect contributes exactly one weight-0 entry.
pub fn extract_links(input: &str) -> Vec<(String, usize)> {
    lazy_static! {
        // Text wrapped in <nowiki>...</nowiki> must not be scanned for links.
        static ref NOWIKI_RE: Regex = Regex::new("<nowiki>.*</nowiki>").unwrap();
        // [[target#anchor|caption]] — capture 1 is the target. The
        // alternation also matches an empty target or a leading '#'
        // (same-page anchor); those yield empty capture-1 text and are
        // filtered out below.
        static ref LINK_RE: Regex = Regex::new(r#"\[\[(()|([^#].*?))(#.*?)?(\|(.*?))?\]\]"#).unwrap();
        // "#redirect [[target]]" — tested against the lower-cased text so
        // any capitalization matches.
        static ref REDIRECT_RE: Regex = Regex::new(r"(\s*)#redirect(\s*)\[\[(.*)\]\](\s*)").unwrap();
        // {{template|args}} transclusions; capture 1 is the template name.
        static ref TRANSCLUDE_RE: Regex = Regex::new(r"\{\{(.*?)(\|(.*?))?\}\}").unwrap();
        // "WP:Foo" shortcut; named group <b> is the part after the prefix.
        static ref WP_RE: Regex = Regex::new(r"(?i)WP:(\s*)(?P<b>.*)").unwrap();
    }
    // Remove nowiki text from input
    let new_text = NOWIKI_RE.replace_all(input, "");
    let new_text = new_text.trim();
    // Are we a redirect? If so, the first [[...]] is the redirect target.
    if REDIRECT_RE.is_match(&new_text.to_lowercase()) {
        match LINK_RE.captures_iter(&new_text).next() {
            Some(x) => { return vec![(x[1].trim().to_string(), 0)]; }
            None => { println!("ERROR: Something went wrong with {}", input) }
        }
    }
    // Ordinary links: weight 1. A leading ':' (e.g. [[:Category:Foo]]) is
    // stripped; empty targets and special/media/file links are dropped.
    let mut ret: Vec<(String, usize)> = LINK_RE.captures_iter(&new_text).map(|cap| {
        let mut s = cap[1].trim();
        if s.starts_with(":") { s = remove_first(s).unwrap(); }
        (s.to_string(), 1)
    }).filter(|x| {
        let s = x.0.to_lowercase();
        s != "" &&
        !s.starts_with("special:") &&
        !s.starts_with("media:") &&
        !s.starts_with("file:")
    }).collect();
    // Transclusions: weight 0. "WP:x" normalizes to "Wikipedia:x", a
    // leading ':' means main namespace, otherwise "Template:" is assumed.
    ret.extend(TRANSCLUDE_RE.captures_iter(&new_text).map(|cap| {
        let mut s = cap[1].trim().to_string();
        if s.to_lowercase().starts_with("wp:") { s = WP_RE.replace_all(&s, "Wikipedia:$b").to_string(); }
        else if s.starts_with(":") { s = remove_first(&s).unwrap().to_string() }
        else { s = format!("Template:{}", s); }
        (s, 0)
    }).collect::<Vec<(String, usize)>>());
    ret
}
/// Drops the first character of `s`, returning the remainder of the slice,
/// or `None` when `s` is empty. Multi-byte UTF-8 characters are handled
/// correctly (the whole first character is removed).
fn remove_first(s: &str) -> Option<&str> {
    let mut chars = s.chars();
    chars.next()?;
    Some(chars.as_str())
}
#[cfg(test)]
mod tests {
    //! Unit tests for `extract_links`: plain links, captions, anchors,
    //! <nowiki> escaping, namespace filtering, redirects and transclusions.
    use crate::extractor::extract_links;
    #[test]
    fn no_links() {
        assert_eq!(Vec::<(String, usize)>::new(), extract_links("Here is some text that contains no links"));
    }
    #[test]
    fn three_links() {
        // Surrounding whitespace inside the brackets is trimmed.
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the [[beta ]] [[ gamma ]]"));
    }
    #[test]
    fn links_with_captions() {
        // The caption after '|' is discarded; only the target survives.
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the [[beta | lazy]] [[gamma|dog]]"));
    }
    #[test]
    fn escaped_links() {
        // Links inside <nowiki> blocks are ignored entirely.
        assert_eq!(Vec::<(String, usize)>::new(), extract_links("<nowiki> this is a [[test]]</nowiki>"));
    }
    #[test]
    fn empty_link() {
        assert_eq!(Vec::<(String, usize)>::new(),
            extract_links("I'm not even sure if this is legal [[]] [[ ]]"));
    }
    #[test]
    fn no_caption() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha|]] fox jumped over the lazy dog [[beta | ]]"));
    }
    #[test]
    fn no_page_links() {
        // Same-page anchors ([[#...]]) produce no link.
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha|]] fox jumped over [[#table of contents]] the lazy dog [[beta | ]]"));
    }
    #[test]
    fn anchor_on_other_page() {
        // The "#anchor" suffix is stripped from cross-page links.
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha#table of contents]] fox jumped over the lazy dog [[beta#test | The beta page]]"));
    }
    #[test]
    fn media_and_special() {
        // Special: and Media: namespace links are filtered out.
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the lazy dog [[beta]] [[Special: delta]] [[Media: epsilon]]"));
    }
    #[test]
    fn test_redirects() {
        // Redirects are case-insensitive and yield a single weight-0 entry.
        assert_eq!(vec![("Dentistry".to_string(), 0)], extract_links("\r\n #rEdIrEcT [[Dentistry]]"));
    }
    #[test]
    fn test_transclusion() {
        // Transclusions come after regular links in the result; WP: maps to
        // Wikipedia:, a leading ':' to the main namespace, and everything
        // else gets a Template: prefix.
        assert_eq!(vec![("just a regular link".to_string(), 1),
            ("Template:a template".to_string(), 0),
            ("another Template".to_string(), 0),
            ("Wikipedia:last one".to_string(), 0),
            ("Template:the actual last one".to_string(), 0)],
            extract_links("{{a template}} {{:another Template}} [[just a regular link]] normal text {{ wP: last one }} (: {{ the actual last one | whatever }}"));
    }
    #[test]
    fn test_no_files() {
        // File: links are filtered case-insensitively.
        assert_eq!(vec![("alpha".to_string(), 1)],
            extract_links("test [[alpha]] whatever [[FiLe: beta]] abc [[ file:Gamma]]"));
    }
}

+ 9
- 0
src/lib.rs View File

@ -0,0 +1,9 @@
//! Crate root for `wikigraph`: utilities for turning a Wikipedia XML dump
//! into a link graph and analysing it.
// NOTE(review): `map_first_last` gates `BTreeMap::first_key_value` (used in
// minivec.rs). `vec_remove_item` does not appear to be used anywhere in this
// crate and was later removed from nightly — consider dropping it.
#![feature(vec_remove_item, map_first_last)]
extern crate xml;
// lazy_static's macro is invoked in extractor.rs.
#[macro_use]
extern crate lazy_static;
pub mod wiki;
pub mod process;
pub mod extractor;
pub mod minivec;

+ 142
- 0
src/minivec.rs View File

@ -0,0 +1,142 @@
use std::collections::{BTreeMap, HashMap};
/// A priority structure for Dijkstra's algorithm: holds (id, distance)
/// pairs and supports insert, update, removal, and cheap lookup of an id
/// with the minimum distance. Internally each insertion is tagged with a
/// monotonically increasing `idx`, and a BTreeMap keyed by distance keeps
/// the buckets ordered.
#[derive(Debug)]
pub struct MiniVec {
    next_idx: usize, // Keeps track of what idx to use next
    sorted_by_id: HashMap<usize, usize>, // id -> idx
    sorted_by_distance: BTreeMap<usize, HashMap<usize, ()>>, // distance -> set of idx
    idx_to_id: HashMap<usize, usize>, // idx -> id
    id_to_distance: HashMap<usize, usize> // id -> distance
}
impl MiniVec {
    /// Builds a MiniVec pre-populated from an (id -> distance) map.
    pub fn new(hsh: HashMap<usize, usize>) -> Self {
        let mut built = Self {
            next_idx: 0,
            sorted_by_id: HashMap::new(),
            sorted_by_distance: BTreeMap::new(),
            idx_to_id: HashMap::new(),
            id_to_distance: HashMap::new()
        };
        for (id, dist) in hsh {
            built.insert_element(id, dist);
        }
        built
    }
    /// Inserts `id` with distance `dist`, replacing any previous entry for
    /// the same id. Each call consumes a fresh idx, even on replacement.
    pub fn insert_element(&mut self, id: usize, dist: usize) {
        if self.sorted_by_id.contains_key(&id) {
            self.remove_element(id);
        }
        let idx = self.next_idx;
        self.next_idx += 1;
        self.sorted_by_id.insert(id, idx);
        self.idx_to_id.insert(idx, id);
        self.id_to_distance.insert(id, dist);
        self.insert_idx_into_dist(idx, dist);
    }
    /// Removes `id` from all four maps; a no-op when `id` is absent.
    pub fn remove_element(&mut self, id: usize) {
        let idx = match self.sorted_by_id.remove(&id) {
            Some(found) => found,
            None => return,
        };
        let dist = self.id_to_distance.remove(&id).unwrap();
        self.idx_to_id.remove(&idx);
        let bucket = self.sorted_by_distance.get_mut(&dist).unwrap();
        bucket.remove(&idx);
        // Drop the bucket entirely once it is empty, so the BTreeMap's
        // first key always corresponds to a live entry.
        if bucket.is_empty() {
            self.sorted_by_distance.remove(&dist);
        }
    }
    /// Returns the id of some element with the minimum distance.
    /// Panics when the structure is empty.
    pub fn get_min_distance(&self) -> usize {
        let (_, bucket) = self.sorted_by_distance.first_key_value().unwrap();
        match bucket.keys().next() {
            Some(idx) => self.idx_to_id[idx],
            None => panic!("ERR: Something went wrong! {:?}", self)
        }
    }
    /// Number of elements currently stored.
    pub fn len(&self) -> usize {
        self.idx_to_id.len()
    }
    /// Adds `idx` to the bucket for `dist`, creating the bucket on demand.
    fn insert_idx_into_dist(&mut self, idx: usize, dist: usize) {
        self.sorted_by_distance
            .entry(dist)
            .or_insert_with(HashMap::new)
            .insert(idx, ());
    }
}
#[cfg(test)]
mod tests {
    use crate::minivec::MiniVec;
    use std::collections::{ BTreeMap, HashMap };
    // End-to-end, white-box exercise of MiniVec: construction from a map,
    // min lookup, removal, re-insertion, and draining back to empty
    // (including internal-state checks on the four maps).
    #[test]
    fn basic_test() {
        let mut hm: HashMap<usize, usize> = HashMap::new();
        hm.insert(0, 1);
        hm.insert(2, 3);
        hm.insert(5, 3);
        hm.insert(3, 4);
        hm.insert(12, 15);
        let mut mv = MiniVec::new(hm);
        // id 0 has the unique minimum distance (1).
        assert_eq!(0, mv.get_min_distance());
        mv.remove_element(0);
        mv.remove_element(5);
        mv.insert_element(0, 3); // Re-inserting a previously removed id
        // Both distance-3 entries (ids 0 and 2) should now be present.
        let mut ids: Vec<usize> = mv.sorted_by_distance[&3]
            .keys()
            .map(|idx| {
                mv.idx_to_id[idx]
            })
            .collect();
        ids.sort();
        assert_eq!(vec![0, 2], ids);
        let min_dist = mv.get_min_distance();
        assert!(min_dist == 0 || min_dist == 2); // Could be either, since they're the same dist
        mv.remove_element(0);
        mv.insert_element(10, 2);
        mv.remove_element(3);
        mv.remove_element(2);
        mv.insert_element(12, 0); // update: id 12 moves from distance 15 to 0
        assert_eq!(12, mv.get_min_distance());
        mv.remove_element(10);
        mv.remove_element(12);
        mv.remove_element(12); // removing a missing id must be a no-op
        let empty_hm: HashMap<usize, usize> = HashMap::new();
        let empty_bt: BTreeMap<usize, HashMap<usize, ()>> = BTreeMap::new();
        // Five initial ids + three later inserts = eight idx allocations.
        assert_eq!(8, mv.next_idx);
        assert_eq!(empty_hm, mv.sorted_by_id);
        assert_eq!(empty_bt, mv.sorted_by_distance);
        assert_eq!(empty_hm, mv.idx_to_id);
        assert_eq!(empty_hm, mv.id_to_distance);
    }
}
}

+ 115
- 0
src/process.rs View File

@ -0,0 +1,115 @@
use crate::extractor::extract_links;
use crate::wiki::WikiPage;
use serde::{Serialize, Deserialize};
use std::fs::File;
use std::collections::HashMap;
use std::io::BufWriter;
use std::io::prelude::*;
use std::sync::{Arc, Mutex};
use rayon::prelude::*;
/// A directed, weighted edge between two node identifiers.
#[derive(Serialize, Deserialize, Clone, Copy)]
pub struct MyEdge<T> {
    /// Source node id.
    pub from: T,
    /// Destination node id.
    pub to: T,
    /// Edge weight (the extractor emits 1 for ordinary links, 0 for
    /// redirects and transclusions).
    pub weight: usize
}
impl<T> MyEdge<T> {
    /// Builds an edge running `from -> to` with the given `weight`.
    fn new(from: T, to: T, weight: usize) -> Self {
        MyEdge { from, to, weight }
    }
}
/// The final, serializable link graph.
#[derive(Serialize, Deserialize)]
pub struct Graph {
    /// Directed edges between node ids.
    pub edges: Vec<MyEdge<usize>>,
    /// Node id -> article title (titles are lower-cased by process_graph).
    pub nodes: HashMap<usize, String>
}
impl Graph {
    /// Assembles a graph from pre-built node and edge collections.
    fn new(nodes: HashMap<usize, String>, edges: Vec<MyEdge<usize>>) -> Self {
        Self {
            edges,
            nodes
        }
    }
    /// Loads a bincode-serialized `Graph` from `filename`.
    ///
    /// Panics if the file cannot be read or does not contain a valid
    /// bincode-encoded graph.
    pub fn load_bin(filename: &str) -> Self {
        // fs::read sizes the buffer from file metadata in one allocation;
        // deserialize only needs an immutable byte slice (the original
        // passed a needless `&mut`).
        let buffer = std::fs::read(filename).unwrap();
        bincode::deserialize(&buffer).unwrap()
    }
    /// Returns the ids of every node directly linked from `node`.
    /// Linear scan over the whole edge list, parallelized with rayon.
    pub fn neighbours(&self, node: usize) -> Vec<usize> {
        self.edges.par_iter()
            .filter(|e| e.from == node)
            .map(|e| e.to)
            .collect()
    }
}
/// Mutable accumulator used while parsing the XML dump: maps article
/// titles to dense ids and collects the edges between those ids.
pub struct GraphIntermediate {
    pub edges: Vec<MyEdge<usize>>,
    pub nodes: HashMap<String, usize>
}
impl GraphIntermediate {
    /// Creates an empty accumulator.
    pub fn new() -> Self {
        GraphIntermediate {
            nodes: HashMap::new(),
            edges: Vec::new(),
        }
    }
}
pub fn process_graph(graph: Arc<Mutex<GraphIntermediate>>, page: WikiPage) {
let links = extract_links(&page.body);
let mut graph = graph.lock().unwrap();
links.iter().for_each(|(target, weight)| {
// Check if we have an existing node. If not, add one
let target_lwr = target.to_lowercase();
if !graph.nodes.contains_key(&page.title.to_lowercase()) {
let len = graph.nodes.len();
graph.nodes.insert(page.title.to_lowercase(), len);
}
if !graph.nodes.contains_key(&target_lwr) {
let len = graph.nodes.len();
graph.nodes.insert(target_lwr.clone(), len);
}
let start_key = *graph.nodes.get(&page.title.to_lowercase()).unwrap();
let end_key = *graph.nodes.get(&target_lwr).unwrap();
graph.edges.push(MyEdge::new(start_key, end_key, *weight));
// println!("Target: {}", target_lwr);
if graph.edges.len() % 10000 == 0 /*|| graph.edges.len() > 58730000*/ {
println!("{} nodes, {} edges.", graph.nodes.len(), graph.edges.len());
println!("{}", page.title);
}
});
}
pub fn save_graph(graph_og: GraphIntermediate, filename: &str) {
//Create a graph from our graph intermediate
let graph = Graph::new(graph_og.nodes.iter().map(|(k, v)|{
(*v, k.clone())
}).collect::<HashMap<usize, String>>(),
graph_og.edges.clone());
std::mem::drop(graph_og); // Save some RAM
let mut buffer = BufWriter::new(File::create(filename).unwrap());
/* BINCODE */
buffer.write_all(&bincode::serialize(&graph).unwrap()).unwrap();
/* JSON */
//buffer.write_all(serde_json::to_string_pretty(&graph).unwrap().as_bytes()).unwrap();
buffer.flush().unwrap();
}

+ 21
- 0
src/wiki.rs View File

@ -0,0 +1,21 @@
/// One article from the dump, built up incrementally by the XML parser.
pub struct WikiPage {
    pub title: String,
    pub body: String,
    /// Once set, further `add_content` calls are ignored (used to stop
    /// accumulating text after the first revision's <text> element closes).
    pub frozen: bool
}
impl WikiPage {
    /// Creates an empty, unfrozen page.
    pub fn new() -> Self {
        WikiPage {
            title: String::new(),
            body: String::new(),
            frozen: false
        }
    }
    /// Appends `data` to the page body unless the page has been frozen.
    pub fn add_content(&mut self, data: &str) {
        if self.frozen {
            return;
        }
        self.body.push_str(data);
    }
}

Loading…
Cancel
Save