@ -0,0 +1,2 @@ | |||
/target | |||
/.idea/ |
@ -0,0 +1,306 @@ | |||
# This file is automatically @generated by Cargo. | |||
# It is not intended for manual editing. | |||
[[package]] | |||
name = "aho-corasick" | |||
version = "0.7.13" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" | |||
dependencies = [ | |||
"memchr", | |||
] | |||
[[package]] | |||
name = "autocfg" | |||
version = "1.0.0" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" | |||
[[package]] | |||
name = "bincode" | |||
version = "1.3.1" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "f30d3a39baa26f9651f17b375061f3233dde33424a8b72b0dbe93a68a0bc896d" | |||
dependencies = [ | |||
"byteorder", | |||
"serde", | |||
] | |||
[[package]] | |||
name = "byteorder" | |||
version = "1.3.4" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" | |||
[[package]] | |||
name = "cfg-if" | |||
version = "0.1.10" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" | |||
[[package]] | |||
name = "crossbeam-deque" | |||
version = "0.7.3" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" | |||
dependencies = [ | |||
"crossbeam-epoch", | |||
"crossbeam-utils", | |||
"maybe-uninit", | |||
] | |||
[[package]] | |||
name = "crossbeam-epoch" | |||
version = "0.8.2" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" | |||
dependencies = [ | |||
"autocfg", | |||
"cfg-if", | |||
"crossbeam-utils", | |||
"lazy_static", | |||
"maybe-uninit", | |||
"memoffset", | |||
"scopeguard", | |||
] | |||
[[package]] | |||
name = "crossbeam-queue" | |||
version = "0.2.3" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" | |||
dependencies = [ | |||
"cfg-if", | |||
"crossbeam-utils", | |||
"maybe-uninit", | |||
] | |||
[[package]] | |||
name = "crossbeam-utils" | |||
version = "0.7.2" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" | |||
dependencies = [ | |||
"autocfg", | |||
"cfg-if", | |||
"lazy_static", | |||
] | |||
[[package]] | |||
name = "either" | |||
version = "1.5.3" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" | |||
[[package]] | |||
name = "fnv" | |||
version = "1.0.7" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" | |||
[[package]] | |||
name = "hermit-abi" | |||
version = "0.1.15" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "3deed196b6e7f9e44a2ae8d94225d80302d81208b1bb673fd21fe634645c85a9" | |||
dependencies = [ | |||
"libc", | |||
] | |||
[[package]] | |||
name = "itoa" | |||
version = "0.4.6" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" | |||
[[package]] | |||
name = "lazy_static" | |||
version = "1.4.0" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" | |||
[[package]] | |||
name = "libc" | |||
version = "0.2.74" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "a2f02823cf78b754822df5f7f268fb59822e7296276d3e069d8e8cb26a14bd10" | |||
[[package]] | |||
name = "maybe-uninit" | |||
version = "2.0.0" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" | |||
[[package]] | |||
name = "memchr" | |||
version = "2.3.3" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" | |||
[[package]] | |||
name = "memoffset" | |||
version = "0.5.5" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "c198b026e1bbf08a937e94c6c60f9ec4a2267f5b0d2eec9c1b21b061ce2be55f" | |||
dependencies = [ | |||
"autocfg", | |||
] | |||
[[package]] | |||
name = "num_cpus" | |||
version = "1.13.0" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" | |||
dependencies = [ | |||
"hermit-abi", | |||
"libc", | |||
] | |||
[[package]] | |||
name = "proc-macro2" | |||
version = "1.0.19" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12" | |||
dependencies = [ | |||
"unicode-xid", | |||
] | |||
[[package]] | |||
name = "quote" | |||
version = "1.0.7" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" | |||
dependencies = [ | |||
"proc-macro2", | |||
] | |||
[[package]] | |||
name = "rayon" | |||
version = "1.3.1" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080" | |||
dependencies = [ | |||
"autocfg", | |||
"crossbeam-deque", | |||
"either", | |||
"rayon-core", | |||
] | |||
[[package]] | |||
name = "rayon-core" | |||
version = "1.7.1" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280" | |||
dependencies = [ | |||
"crossbeam-deque", | |||
"crossbeam-queue", | |||
"crossbeam-utils", | |||
"lazy_static", | |||
"num_cpus", | |||
] | |||
[[package]] | |||
name = "regex" | |||
version = "1.3.9" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" | |||
dependencies = [ | |||
"aho-corasick", | |||
"memchr", | |||
"regex-syntax", | |||
"thread_local", | |||
] | |||
[[package]] | |||
name = "regex-syntax" | |||
version = "0.6.18" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" | |||
[[package]] | |||
name = "ryu" | |||
version = "1.0.5" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" | |||
[[package]] | |||
name = "scopeguard" | |||
version = "1.1.0" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" | |||
[[package]] | |||
name = "serde" | |||
version = "1.0.114" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3" | |||
dependencies = [ | |||
"serde_derive", | |||
] | |||
[[package]] | |||
name = "serde_derive" | |||
version = "1.0.114" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "2a0be94b04690fbaed37cddffc5c134bf537c8e3329d53e982fe04c374978f8e" | |||
dependencies = [ | |||
"proc-macro2", | |||
"quote", | |||
"syn", | |||
] | |||
[[package]] | |||
name = "serde_json" | |||
version = "1.0.57" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "164eacbdb13512ec2745fb09d51fd5b22b0d65ed294a1dcf7285a360c80a675c" | |||
dependencies = [ | |||
"itoa", | |||
"ryu", | |||
"serde", | |||
] | |||
[[package]] | |||
name = "syn" | |||
version = "1.0.36" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "4cdb98bcb1f9d81d07b536179c269ea15999b5d14ea958196413869445bb5250" | |||
dependencies = [ | |||
"proc-macro2", | |||
"quote", | |||
"unicode-xid", | |||
] | |||
[[package]] | |||
name = "thread_local" | |||
version = "1.0.1" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" | |||
dependencies = [ | |||
"lazy_static", | |||
] | |||
[[package]] | |||
name = "unicode-xid" | |||
version = "0.2.1" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" | |||
[[package]] | |||
name = "wikigraph" | |||
version = "0.1.0" | |||
dependencies = [ | |||
"bincode", | |||
"fnv", | |||
"lazy_static", | |||
"rayon", | |||
"regex", | |||
"serde", | |||
"serde_json", | |||
"xml-rs", | |||
] | |||
[[package]] | |||
name = "xml-rs" | |||
version = "0.8.3" | |||
source = "registry+https://github.com/rust-lang/crates.io-index" | |||
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" |
@ -0,0 +1,16 @@ | |||
[package] | |||
name = "wikigraph" | |||
version = "0.1.0" | |||
authors = ["Stephen <stephen@stephendownward.ca>"] | |||
edition = "2018" | |||
[dependencies] | |||
xml-rs="0.8.3" | |||
rayon="1.3" | |||
serde = { version = "1.0", features = ["derive"] } | |||
serde_json="1.0" | |||
regex = "1.3" | |||
lazy_static = "1.4" | |||
bincode = "1.3" | |||
fnv = "1.0" | |||
@ -0,0 +1,16 @@ | |||
# Wikigraph | |||
Some utilities designed to take a Wikipedia XML dump and perform some analysis. These are not production-grade tools - they are basically just experiments.
This software is very RAM-intensive! I have 32GB of RAM and I found myself using all of it at some points. | |||
## Applications | |||
This repo contains 3 applications: | |||
- `wiki2graph`: Takes an XML file and converts it to a collection of nodes and edges (a graph), which is saved as a file. | |||
- `explorer`: Takes in a graph and allows seeing all links from a source article, given the name of that source article. This is primarily used for debugging. | |||
- `longestpath`: Takes a graph and finds the longest path from an article to a target article. The target article is specified in the source code. The default is "Vacuum Tube". | |||
@ -0,0 +1,48 @@ | |||
extern crate wikigraph; | |||
use wikigraph::process::Graph; | |||
use std::env; | |||
use std::io; | |||
use std::io::prelude::*; | |||
fn main() { | |||
let args: Vec<String> = env::args().collect(); | |||
if args.len() != 2 { | |||
println!("Usage: ./{} bincode_file", args[0]); | |||
return; | |||
} | |||
println!("Loading graph..."); | |||
let graph = Graph::load_bin(&args[1]); | |||
println!("Done. {} nodes loaded.", graph.nodes.len()); | |||
// Print all article titles | |||
// graph.nodes.iter().for_each(|(_, v)| { println!("{}", v) }); | |||
loop { | |||
print!("Enter an article name: "); | |||
io::stdout().flush().unwrap(); | |||
let mut buffer = String::new(); | |||
match io::stdin().read_line(&mut buffer) { | |||
Ok(_) => { | |||
let buffer = buffer.trim(); | |||
// Lookup id from article name. O(n) time | |||
match graph.nodes.iter().find(|(_, v)| { v == &&buffer }) { | |||
Some((&id, _)) => { | |||
// Lookup edges | |||
graph.edges.iter() | |||
.filter(|edge| { edge.from == id }) | |||
.map(|edge| { edge.to }) | |||
.for_each(|target_id| { | |||
println!("-> {}", graph.nodes[&target_id]); | |||
}); | |||
} | |||
None => { | |||
println!("Could not find that article!"); | |||
continue; | |||
} | |||
} | |||
} | |||
Err(_) => { break; } | |||
} | |||
} | |||
} |
@ -0,0 +1,112 @@ | |||
extern crate wikigraph; | |||
use wikigraph::{ process::Graph, minivec::MiniVec }; | |||
use rayon::prelude::*; | |||
use fnv::FnvHashMap; // TODO use this more! HashMaps should all be removed | |||
use std::collections::HashMap; | |||
use std::env; | |||
// Calculates articles that are farthest away from "Vacuum Tube" | |||
// in the direction of (farthest article) --> Vacuum Tube | |||
fn main() { | |||
let args: Vec<String> = env::args().collect(); | |||
if args.len() != 2 { | |||
println!("Usage: ./{} bincode_file", args[0]); | |||
return; | |||
} | |||
println!("Loading graph..."); | |||
let mut graph = Graph::load_bin(&args[1]); | |||
println!("Done.\r\nReversing direction of edges..."); | |||
graph.edges.par_iter_mut().for_each(|mut e| { | |||
let tmp = e.from; | |||
e.from = e.to; | |||
e.to = tmp; | |||
}); | |||
println!("Done.\r\nPreparing for Dijkstra's algorithm..."); | |||
//Since the graph is reversed, this is really the end point | |||
let start = *graph.nodes.par_iter() | |||
.find_any(|(_, v)| { v == &&"vacuum tube".to_string() }) | |||
.unwrap().0; | |||
// Distances | |||
let mut distances: FnvHashMap<usize, usize> = graph.nodes.par_iter().map(|(k, _)| { | |||
(*k, std::usize::MAX) | |||
}).collect(); | |||
distances.insert(start, 0); | |||
// Neighbours | |||
let mut all_neighbours: FnvHashMap<usize, Vec<(usize, usize)>> = FnvHashMap::default(); | |||
for e in &graph.edges { | |||
if !all_neighbours.contains_key(&e.from) { | |||
all_neighbours.insert(e.from, vec![(e.to, e.weight)]); | |||
} | |||
else { | |||
// TODO we can speed this up by getting the mutable directly | |||
let mut v = all_neighbours.remove(&e.from).unwrap(); | |||
v.push((e.to, e.weight)); | |||
all_neighbours.insert(e.from, v); | |||
} | |||
} | |||
graph.edges = vec![]; // We no longer need the edges, so let's save some RAM | |||
// Make sure every node is in all_neighbours. | |||
for (&n, _) in &graph.nodes { | |||
if !all_neighbours.contains_key(&n) { | |||
all_neighbours.insert(n, vec![]); | |||
} | |||
} | |||
//Unvisited nodes | |||
let mut unvisited_nodes: FnvHashMap<usize, ()> = graph.nodes.par_iter().map(|(k, _)| { (*k, ()) }).collect(); | |||
let mut slightly_visited_nodes: MiniVec = MiniVec::new(HashMap::new()); | |||
println!("Done.\r\nPerforming Dijkstra's algorithm..."); | |||
let mut last_print = std::usize::MAX; | |||
let mut cur_node = start; | |||
while unvisited_nodes.len() > 0 { | |||
//println!("Finding neighbours.."); | |||
let neighbours = &all_neighbours[&cur_node]; | |||
//println!("Executing neighbours..."); | |||
neighbours.iter().for_each(|(n, weight)| { | |||
if !unvisited_nodes.contains_key(n) { return; } | |||
let new_dist = distances[&cur_node] + weight; // Graph is unweighted, so distance + 1 | |||
let old_dist = distances[n]; | |||
if new_dist < old_dist { | |||
distances.insert(*n, new_dist); | |||
// Add into slightly_visited_nodes, while maintaing order | |||
//println!("slightly_visited_insert"); | |||
slightly_visited_nodes.insert_element(*n, new_dist); | |||
//println!("ok"); | |||
} | |||
}); | |||
//println!("removing some stuff"); | |||
unvisited_nodes.remove(&cur_node).unwrap(); | |||
// Node is now *too* visited, so we should remove it | |||
slightly_visited_nodes.remove_element(cur_node); | |||
if unvisited_nodes.len() + 1000 <= last_print { | |||
last_print = unvisited_nodes.len(); | |||
println!("{} nodes remaining.", last_print); | |||
} | |||
if slightly_visited_nodes.len() == 0 { break; } | |||
cur_node = slightly_visited_nodes.get_min_distance(); | |||
//println!("Done"); | |||
} | |||
println!("Done! Grabbing distances..."); | |||
for (id, dist) in distances { | |||
if dist > 5 && dist < std::usize::MAX { | |||
let s = &graph.nodes[&id]; | |||
if !s.starts_with("category:") && !s.starts_with("wikipedia:") && !s.starts_with("template:") { | |||
println!("{} away: {}", dist, s); | |||
} | |||
} | |||
} | |||
} |
@ -0,0 +1,125 @@ | |||
extern crate wikigraph; | |||
use wikigraph::process::{ GraphIntermediate, process_graph, save_graph }; | |||
use wikigraph::wiki::WikiPage; | |||
use rayon::ThreadPoolBuilder; | |||
use std::fs::File; | |||
use std::io::BufReader; | |||
use std::sync::{ Arc, Mutex }; | |||
use std::env; | |||
use xml::reader::{ EventReader, XmlEvent }; | |||
//Where are we in the file? | |||
//Where are we in the file?
// State machine positions for the XML pull parser in main().
enum FileState {
    InRoot,      // top level, outside any <page>
    InTitle,     // inside the site-wide <sitename> element
    InPage,      // inside a <page>, between its child elements
    InPageTitle, // inside a page's <title> element
    InPageBody   // inside a page's <text> element
}
/// Streams a Wikipedia XML dump, extracting each page and handing it to
/// `process_graph` on a rayon pool; saves the finished graph when done.
fn main() {
    //Process command line args
    let args: Vec<String> = env::args().collect();
    if args.len() != 3 {
        println!("Usage: ./{} wikipedia_xml_file output_bincode_file", args[0]);
        return;
    }
    let input_filename = &args[1];
    let output_filename = &args[2];
    println!("Converting {} to {}", input_filename, output_filename);
    // Stream the dump through a buffered pull parser — the file is far too
    // large to load whole.
    let file = File::open(input_filename).unwrap();
    let file = BufReader::new(file);
    let parser = EventReader::new(file);
    let pool = ThreadPoolBuilder::new().build().unwrap();
    // Graph under construction; pool tasks merge parsed pages into it.
    let graph = Arc::new(Mutex::new(GraphIntermediate::new()));
    pool.install(|| {
        rayon::scope(|scope| {
            // Current position in the XML structure; see FileState.
            let mut state: FileState = FileState::InRoot;
            // Page currently being accumulated from title/text events.
            let mut cur_page = WikiPage::new();
            for e in parser {
                // Keeps the job queue from overflowing
                // Amazingly, Rayon has absolutely no way of dealing with this
                // There isn't even a method to get the current number of jobs in the queue ):
                // NOTE(review): this is a busy-wait spin loop on the parsing thread.
                while pool.current_thread_has_pending_tasks().unwrap() { }
                match e {
                    Ok(XmlEvent::StartElement { name, .. }) => {
                        let name = name.local_name;
                        // println!("{}", name);
                        // Opening tags advance the state machine.
                        state = match (&state, name.as_str()) {
                            (FileState::InRoot, "sitename") => {
                                FileState::InTitle
                            }
                            (FileState::InRoot, "page") => {
                                FileState::InPage
                            }
                            (FileState::InPage, "title") => {
                                FileState::InPageTitle
                            }
                            (FileState::InPage, "text") => {
                                FileState::InPageBody
                            }
                            // Any other tag leaves the state unchanged.
                            _ => { state }
                        }
                    }
                    Ok(XmlEvent::EndElement { name, .. }) => {
                        let name = name.local_name;
                        // Closing tags step back out and trigger page processing.
                        match (&state, name.as_str()) {
                            (FileState::InTitle, "sitename") => {
                                state = FileState::InRoot;
                            }
                            (FileState::InPage, "page") => {
                                state = FileState::InRoot;
                                // Process page
                                // rayon::scope guarantees spawned tasks finish
                                // before the scope returns.
                                scope.spawn(|_| { process_graph(graph.clone(), cur_page); });
                                // Reset page
                                cur_page = WikiPage::new();
                            }
                            (FileState::InPageTitle, "title") => {
                                state = FileState::InPage;
                            }
                            (FileState::InPageBody, "text") => {
                                state = FileState::InPage;
                                // Freeze text so that we don't accidentally combine revisions
                                cur_page.frozen = true;
                            }
                            _ => {}
                        }
                    }
                    Ok(XmlEvent::Characters(data)) => {
                        // Route character data according to where we are.
                        match state {
                            FileState::InTitle => {
                                println!("Title: {}", data);
                            }
                            FileState::InPageTitle => {
                                cur_page.title += &data;
                            }
                            FileState::InPageBody => {
                                cur_page.add_content(&data);
                            }
                            _ => { }
                        }
                    }
                    Err(e) => {
                        println!("Error: {}", e);
                        break;
                    }
                    _ => { }
                }
            }
        });
    });
    println!("Done! Saving...");
    // try_unwrap succeeds because every task's Arc clone was dropped when
    // the scope above finished.
    if let Ok(graph) = Arc::try_unwrap(graph) {
        save_graph(graph.into_inner().unwrap(), output_filename);
    }
}
@ -0,0 +1,128 @@ | |||
// Extracts links from mediawiki markdown | |||
use regex::Regex; | |||
/// Extracts article links from mediawiki markup.
/// Returns (target, weight) pairs: weight 1 for ordinary `[[...]]` links,
/// weight 0 for redirect targets and `{{...}}` transclusions.
pub fn extract_links(input: &str) -> Vec<(String, usize)> {
    lazy_static! {
        // Content inside <nowiki> must not be scanned for links.
        static ref NOWIKI_RE: Regex = Regex::new("<nowiki>.*</nowiki>").unwrap();
        // [[target#anchor|caption]] — group 1 is the target; same-page
        // anchors ([[#...]]) capture an empty target and get filtered out.
        static ref LINK_RE: Regex = Regex::new(r#"\[\[(()|([^#].*?))(#.*?)?(\|(.*?))?\]\]"#).unwrap();
        // "#redirect [[target]]" pages (matched against lowercased text).
        static ref REDIRECT_RE: Regex = Regex::new(r"(\s*)#redirect(\s*)\[\[(.*)\]\](\s*)").unwrap();
        // {{template|args}} transclusions — group 1 is the template name.
        static ref TRANSCLUDE_RE: Regex = Regex::new(r"\{\{(.*?)(\|(.*?))?\}\}").unwrap();
        // "WP:x" shorthand for the Wikipedia namespace (case-insensitive).
        static ref WP_RE: Regex = Regex::new(r"(?i)WP:(\s*)(?P<b>.*)").unwrap();
    }
    // Remove nowiki text from input
    let new_text = NOWIKI_RE.replace_all(input, "");
    let new_text = new_text.trim();
    // Are we a redirect? A redirect page contributes only its target (weight 0).
    if REDIRECT_RE.is_match(&new_text.to_lowercase()) {
        match LINK_RE.captures_iter(&new_text).next() {
            Some(x) => { return vec![(x[1].trim().to_string(), 0)]; }
            // If the target can't be parsed, log and fall through to
            // ordinary extraction below.
            None => { println!("ERROR: Something went wrong with {}", input) }
        }
    }
    // Ordinary links, excluding empty targets and non-article namespaces.
    let mut ret: Vec<(String, usize)> = LINK_RE.captures_iter(&new_text).map(|cap| {
        let mut s = cap[1].trim();
        // A leading ':' escapes a namespace (e.g. [[:File:x]]); strip it.
        if s.starts_with(":") { s = remove_first(s).unwrap(); }
        (s.to_string(), 1)
    }).filter(|x| {
        let s = x.0.to_lowercase();
        s != "" &&
        !s.starts_with("special:") &&
        !s.starts_with("media:") &&
        !s.starts_with("file:")
    }).collect();
    // Transclusions, normalized into their proper namespace (weight 0).
    ret.extend(TRANSCLUDE_RE.captures_iter(&new_text).map(|cap| {
        let mut s = cap[1].trim().to_string();
        if s.to_lowercase().starts_with("wp:") { s = WP_RE.replace_all(&s, "Wikipedia:$b").to_string(); }
        else if s.starts_with(":") { s = remove_first(&s).unwrap().to_string() }
        else { s = format!("Template:{}", s); }
        (s, 0)
    }).collect::<Vec<(String, usize)>>());
    ret
}
/// Returns `s` with its first character removed (UTF-8 aware),
/// or `None` if the string is empty.
fn remove_first(s: &str) -> Option<&str> {
    let mut chars = s.chars();
    chars.next()?;
    Some(chars.as_str())
}
// Unit tests covering link extraction, caption/anchor handling, namespace
// filtering, redirects, and transclusions.
#[cfg(test)]
mod tests {
    use crate::extractor::extract_links;
    // Plain text without wiki markup yields no links.
    #[test]
    fn no_links() {
        assert_eq!(Vec::<(String, usize)>::new(), extract_links("Here is some text that contains no links"));
    }
    // Ordinary [[...]] links are extracted and trimmed, each with weight 1.
    #[test]
    fn three_links() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the [[beta ]] [[ gamma ]]"));
    }
    // [[target|caption]] keeps only the target.
    #[test]
    fn links_with_captions() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the [[beta | lazy]] [[gamma|dog]]"));
    }
    // Links inside <nowiki> are ignored entirely.
    #[test]
    fn escaped_links() {
        assert_eq!(Vec::<(String, usize)>::new(), extract_links("<nowiki> this is a [[test]]</nowiki>"));
    }
    // Empty or whitespace-only targets are filtered out.
    #[test]
    fn empty_link() {
        assert_eq!(Vec::<(String, usize)>::new(),
            extract_links("I'm not even sure if this is legal [[]] [[ ]]"));
    }
    // An empty caption after '|' still yields the target.
    #[test]
    fn no_caption() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha|]] fox jumped over the lazy dog [[beta | ]]"));
    }
    // Same-page anchors ([[#...]]) are not links to another article.
    #[test]
    fn no_page_links() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha|]] fox jumped over [[#table of contents]] the lazy dog [[beta | ]]"));
    }
    // Anchors on other pages resolve to the page itself.
    #[test]
    fn anchor_on_other_page() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha#table of contents]] fox jumped over the lazy dog [[beta#test | The beta page]]"));
    }
    // Special: and Media: namespace links are dropped.
    #[test]
    fn media_and_special() {
        assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
            extract_links("The quick brown [[alpha]] fox jumped over the lazy dog [[beta]] [[Special: delta]] [[Media: epsilon]]"));
    }
    // #redirect is case-insensitive and yields only the target with weight 0.
    #[test]
    fn test_redirects() {
        assert_eq!(vec![("Dentistry".to_string(), 0)], extract_links("\r\n #rEdIrEcT [[Dentistry]]"));
    }
    // Transclusions get namespace-normalized and weight 0.
    #[test]
    fn test_transclusion() {
        assert_eq!(vec![("just a regular link".to_string(), 1),
                        ("Template:a template".to_string(), 0),
                        ("another Template".to_string(), 0),
                        ("Wikipedia:last one".to_string(), 0),
                        ("Template:the actual last one".to_string(), 0)],
            extract_links("{{a template}} {{:another Template}} [[just a regular link]] normal text {{ wP: last one }} (: {{ the actual last one | whatever }}"));
    }
    // File: links are dropped, case-insensitively.
    #[test]
    fn test_no_files() {
        assert_eq!(vec![("alpha".to_string(), 1)],
            extract_links("test [[alpha]] whatever [[FiLe: beta]] abc [[ file:Gamma]]"));
    }
}
@ -0,0 +1,9 @@ | |||
// Crate root for the wikigraph utilities (XML ingestion, link extraction,
// graph construction, and the Dijkstra frontier structure).
// The feature gates below require a nightly toolchain; `map_first_last`
// covers BTreeMap::first_key_value used in minivec.
// NOTE(review): `vec_remove_item` is not used in the modules visible here —
// confirm whether it can be dropped.
#![feature(vec_remove_item, map_first_last)]
extern crate xml;
// lazy_static's macro is used by the regexes in `extractor`.
#[macro_use]
extern crate lazy_static;
pub mod wiki;      // WikiPage accumulator for parsed articles
pub mod process;   // Graph types, page processing, (de)serialization
pub mod extractor; // mediawiki link extraction
pub mod minivec;   // distance-ordered frontier for Dijkstra's algorithm
@ -0,0 +1,142 @@ | |||
use std::collections::{BTreeMap, HashMap}; | |||
/// A map-backed priority structure used as the frontier in Dijkstra's
/// algorithm: tracks node ids with tentative distances and returns a node
/// with the minimum distance in O(log n).
///
/// Every insertion is given a fresh internal `idx`, so an updated node
/// (removed and reinserted) can never collide with a stale entry.
#[derive(Debug)]
pub struct MiniVec {
    next_idx: usize,                                         // next internal index to hand out
    sorted_by_id: HashMap<usize, usize>,                     // node id -> internal idx
    sorted_by_distance: BTreeMap<usize, HashMap<usize, ()>>, // distance -> set of internal idx
    idx_to_id: HashMap<usize, usize>,                        // internal idx -> node id
    id_to_distance: HashMap<usize, usize>                    // node id -> distance
}
impl MiniVec {
    /// Builds a MiniVec from an initial map of node id -> distance.
    pub fn new(hsh: HashMap<usize, usize>) -> Self {
        let mut ret = Self {
            next_idx: 0,
            sorted_by_id: HashMap::new(),
            sorted_by_distance: BTreeMap::new(),
            idx_to_id: HashMap::new(),
            id_to_distance: HashMap::new()
        };
        // HashMap keys are unique, so each call is a plain insert; reusing
        // insert_element keeps the bookkeeping in one place.
        for (k, v) in hsh {
            ret.insert_element(k, v);
        }
        ret
    }
    /// Inserts `id` with distance `dist`, replacing any existing entry.
    pub fn insert_element(&mut self, id: usize, dist: usize) {
        if self.sorted_by_id.contains_key(&id) {
            self.remove_element(id);
        }
        let idx = self.next_idx;
        self.next_idx += 1;
        self.sorted_by_id.insert(id, idx);
        self.idx_to_id.insert(idx, id);
        self.insert_idx_into_dist(idx, dist);
        self.id_to_distance.insert(id, dist);
    }
    /// Removes `id` from all indexes; a no-op if it is not present.
    pub fn remove_element(&mut self, id: usize) {
        if let Some(idx) = self.sorted_by_id.remove(&id) {
            let dist = self.id_to_distance.remove(&id).unwrap();
            self.idx_to_id.remove(&idx);
            // Drop the idx from its distance bucket, and delete the bucket
            // once empty so min lookups never see a stale distance.
            let bucket = self.sorted_by_distance.get_mut(&dist).unwrap();
            bucket.remove(&idx);
            if bucket.is_empty() {
                self.sorted_by_distance.remove(&dist);
            }
        }
    }
    /// Returns the *id* of a node with the minimum distance.
    /// Panics if the structure is empty.
    pub fn get_min_distance(&self) -> usize {
        match self.sorted_by_distance.first_key_value().unwrap().1.keys().next() {
            Some(x) => self.idx_to_id[x],
            // Empty buckets are removed eagerly, so this can only fire if
            // an internal invariant was broken.
            None => { panic!("ERR: Something went wrong! {:?}", self) }
        }
    }
    /// Number of nodes currently tracked.
    pub fn len(&self) -> usize {
        self.idx_to_id.len()
    }
    /// Adds `idx` to the bucket for `dist`, creating the bucket on demand.
    /// entry() performs one lookup instead of contains + get_mut/insert.
    fn insert_idx_into_dist(&mut self, idx: usize, dist: usize) {
        self.sorted_by_distance.entry(dist).or_insert_with(HashMap::new).insert(idx, ());
    }
}
// Exercises the full MiniVec lifecycle: seeding, min lookup, updates,
// removals, and the final drained state.
#[cfg(test)]
mod tests {
    use crate::minivec::MiniVec;
    use std::collections::{ BTreeMap, HashMap };
    #[test]
    fn basic_test() {
        // Seed with five id -> distance pairs.
        let mut hm: HashMap<usize, usize> = HashMap::new();
        hm.insert(0, 1);
        hm.insert(2, 3);
        hm.insert(5, 3);
        hm.insert(3, 4);
        hm.insert(12, 15);
        let mut mv = MiniVec::new(hm);
        // Id 0 holds the smallest distance (1).
        assert_eq!(0, mv.get_min_distance());
        mv.remove_element(0);
        mv.remove_element(5);
        mv.insert_element(0, 3); // Updating an existing node
        // Ids 0 and 2 should now share the distance-3 bucket.
        let mut ids: Vec<usize> = mv.sorted_by_distance[&3]
            .keys()
            .map(|idx| {
                mv.idx_to_id[idx]
            })
            .collect();
        ids.sort();
        assert_eq!(vec![0, 2], ids);
        let min_dist = mv.get_min_distance();
        assert!(min_dist == 0 || min_dist == 2); // Could be either, since they're the same dist
        mv.remove_element(0);
        mv.insert_element(10, 2);
        mv.remove_element(3);
        mv.remove_element(2);
        mv.insert_element(12, 0);
        assert_eq!(12, mv.get_min_distance());
        mv.remove_element(10);
        mv.remove_element(12);
        // Removing an already-removed id must be a harmless no-op.
        mv.remove_element(12);
        // Fully drained: all maps empty, but next_idx keeps counting
        // (5 seed inserts + 3 later inserts = 8).
        let empty_hm: HashMap<usize, usize> = HashMap::new();
        let empty_bt: BTreeMap<usize, HashMap<usize, ()>> = BTreeMap::new();
        assert_eq!(8, mv.next_idx);
        assert_eq!(empty_hm, mv.sorted_by_id);
        assert_eq!(empty_bt, mv.sorted_by_distance);
        assert_eq!(empty_hm, mv.idx_to_id);
        assert_eq!(empty_hm, mv.id_to_distance);
    }
}
@ -0,0 +1,115 @@ | |||
use crate::extractor::extract_links; | |||
use crate::wiki::WikiPage; | |||
use serde::{Serialize, Deserialize}; | |||
use std::fs::File; | |||
use std::collections::HashMap; | |||
use std::io::BufWriter; | |||
use std::io::prelude::*; | |||
use std::sync::{Arc, Mutex}; | |||
use rayon::prelude::*; | |||
#[derive(Serialize, Deserialize, Clone, Copy)] | |||
pub struct MyEdge<T> { | |||
pub from: T, | |||
pub to: T, | |||
pub weight: usize | |||
} | |||
impl<T> MyEdge<T> { | |||
fn new(from: T, to: T, weight: usize) -> Self { | |||
Self { from, to, weight } | |||
} | |||
} | |||
#[derive(Serialize, Deserialize)] | |||
pub struct Graph { | |||
pub edges: Vec<MyEdge<usize>>, | |||
pub nodes: HashMap<usize, String> | |||
} | |||
impl Graph { | |||
fn new(nodes: HashMap<usize, String>, edges: Vec<MyEdge<usize>>) -> Self { | |||
Self { | |||
edges, | |||
nodes | |||
} | |||
} | |||
pub fn load_bin(filename: &str) -> Self { | |||
let mut buffer = Vec::new(); | |||
File::open(filename).unwrap().read_to_end(&mut buffer).unwrap(); | |||
bincode::deserialize(&mut buffer).unwrap() | |||
} | |||
pub fn neighbours(&self, node: usize) -> Vec<usize> { | |||
self.edges.par_iter() | |||
.filter(|e| { e.from == node }) | |||
.map(|e| { e.to }) | |||
.collect() | |||
} | |||
} | |||
pub struct GraphIntermediate { | |||
pub edges: Vec<MyEdge<usize>>, | |||
pub nodes: HashMap<String, usize> | |||
} | |||
impl GraphIntermediate { | |||
pub fn new() -> Self { | |||
Self { | |||
edges: Vec::new(), | |||
nodes: HashMap::new(), | |||
} | |||
} | |||
} | |||
pub fn process_graph(graph: Arc<Mutex<GraphIntermediate>>, page: WikiPage) { | |||
let links = extract_links(&page.body); | |||
let mut graph = graph.lock().unwrap(); | |||
links.iter().for_each(|(target, weight)| { | |||
// Check if we have an existing node. If not, add one | |||
let target_lwr = target.to_lowercase(); | |||
if !graph.nodes.contains_key(&page.title.to_lowercase()) { | |||
let len = graph.nodes.len(); | |||
graph.nodes.insert(page.title.to_lowercase(), len); | |||
} | |||
if !graph.nodes.contains_key(&target_lwr) { | |||
let len = graph.nodes.len(); | |||
graph.nodes.insert(target_lwr.clone(), len); | |||
} | |||
let start_key = *graph.nodes.get(&page.title.to_lowercase()).unwrap(); | |||
let end_key = *graph.nodes.get(&target_lwr).unwrap(); | |||
graph.edges.push(MyEdge::new(start_key, end_key, *weight)); | |||
// println!("Target: {}", target_lwr); | |||
if graph.edges.len() % 10000 == 0 /*|| graph.edges.len() > 58730000*/ { | |||
println!("{} nodes, {} edges.", graph.nodes.len(), graph.edges.len()); | |||
println!("{}", page.title); | |||
} | |||
}); | |||
} | |||
pub fn save_graph(graph_og: GraphIntermediate, filename: &str) { | |||
//Create a graph from our graph intermediate | |||
let graph = Graph::new(graph_og.nodes.iter().map(|(k, v)|{ | |||
(*v, k.clone()) | |||
}).collect::<HashMap<usize, String>>(), | |||
graph_og.edges.clone()); | |||
std::mem::drop(graph_og); // Save some RAM | |||
let mut buffer = BufWriter::new(File::create(filename).unwrap()); | |||
/* BINCODE */ | |||
buffer.write_all(&bincode::serialize(&graph).unwrap()).unwrap(); | |||
/* JSON */ | |||
//buffer.write_all(serde_json::to_string_pretty(&graph).unwrap().as_bytes()).unwrap(); | |||
buffer.flush().unwrap(); | |||
} |
@ -0,0 +1,21 @@ | |||
/// A single article accumulated from the XML dump: its title, body text,
/// and a `frozen` flag that stops further body appends.
pub struct WikiPage {
    pub title: String,
    pub body: String,
    pub frozen: bool
}
impl WikiPage {
    /// Creates an empty, unfrozen page.
    pub fn new() -> Self {
        WikiPage {
            title: String::new(),
            body: String::new(),
            frozen: false
        }
    }
    /// Appends `data` to the body, unless the page has been frozen.
    pub fn add_content(&mut self, data: &str) {
        if self.frozen {
            return;
        }
        self.body.push_str(data);
    }
}