
Use smaller data types where possible to save on RAM
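The change swaps `usize` node ids for `u32` and `usize` distances/weights for `u8` throughout the crate. A minimal sketch of the effect on a 64-bit target, mirroring the `MyEdge` shape from `src/process.rs` (savings inside the hash maps additionally depend on bucket layout and padding):

use std::mem::size_of;

// Edge layout before the commit: three usize fields.
struct OldEdge {
    from: usize,
    to: usize,
    weight: usize,
}

// Edge layout after the commit: u32 ids, u8 weight.
struct NewEdge {
    from: u32,
    to: u32,
    weight: u8,
}

fn main() {
    // Typically 24 vs 12 bytes per edge on x86_64.
    println!("old edge: {} bytes", size_of::<OldEdge>());
    println!("new edge: {} bytes", size_of::<NewEdge>());
    // The Dijkstra distance entries shrink as well.
    println!("old distance entry: {} bytes", size_of::<(usize, usize)>());
    println!("new distance entry: {} bytes", size_of::<(u32, u8)>());
}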

master
Stephen 2 months ago
parent commit d1779224fa
9 changed files with 261 additions and 188 deletions

  1. rustfmt.toml (+2 -0)
  2. src/bin/explorer.rs (+10 -6)
  3. src/bin/longestpath.rs (+33 -21)
  4. src/bin/wiki2graph.rs (+32 -37)
  5. src/extractor.rs (+89 -41)
  6. src/lib.rs (+2 -2)
  7. src/minivec.rs (+50 -44)
  8. src/process.rs (+41 -35)
  9. src/wiki.rs (+2 -2)

rustfmt.toml (+2 -0)

@@ -0,0 +1,2 @@
tab_spaces = 4
hard_tabs = true

src/bin/explorer.rs (+10 -6)

@@ -1,8 +1,8 @@
extern crate wikigraph;
use wikigraph::process::Graph;
use std::env;
use std::io;
use std::io::prelude::*;
use wikigraph::process::Graph;
fn main() {
let args: Vec<String> = env::args().collect();
@@ -26,12 +26,14 @@ fn main() {
Ok(_) => {
let buffer = buffer.trim();
// Lookup id from article name. O(n) time
match graph.nodes.iter().find(|(_, v)| { v == &&buffer }) {
match graph.nodes.iter().find(|(_, v)| v == &&buffer) {
Some((&id, _)) => {
// Lookup edges
graph.edges.iter()
.filter(|edge| { edge.from == id })
.map(|edge| { edge.to })
graph
.edges
.iter()
.filter(|edge| edge.from == id)
.map(|edge| edge.to)
.for_each(|target_id| {
println!("-> {}", graph.nodes[&target_id]);
});
@@ -42,7 +44,9 @@ fn main() {
}
}
}
Err(_) => { break; }
Err(_) => {
break;
}
}
}
}
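The name lookup above is a linear scan over all nodes per query, as the O(n) comment notes. Not part of this commit, but for illustration: a one-time inverse index makes repeated lookups O(1), assuming the same `nodes: HashMap<u32, String>` shape (`build_name_index` is a hypothetical helper, not in the repository):

use std::collections::HashMap;

// Build once, then answer name -> id queries in constant time.
fn build_name_index(nodes: &HashMap<u32, String>) -> HashMap<&str, u32> {
    nodes.iter().map(|(&id, name)| (name.as_str(), id)).collect()
}

fn main() {
    let mut nodes: HashMap<u32, String> = HashMap::new();
    nodes.insert(0, "vacuum tube".to_string());
    let index = build_name_index(&nodes);
    assert_eq!(Some(&0), index.get("vacuum tube"));
}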

src/bin/longestpath.rs (+33 -21)

@@ -1,9 +1,9 @@
extern crate wikigraph;
use wikigraph::{ process::Graph, minivec::MiniVec };
use wikigraph::{minivec::MiniVec, process::Graph};
use rayon::prelude::*;
use fnv::FnvHashMap; // TODO use this more! HashMaps should all be removed
use fnv::{FnvHashMap, FnvHashSet}; // TODO use this more! HashMaps should all be removed
use std::collections::HashMap;
use std::env;
@@ -17,7 +17,7 @@ fn main() {
println!("Usage: ./{} bincode_file", args[0]);
return;
}
println!("Loading graph...");
let mut graph = Graph::load_bin(&args[1]);
println!("Done.\r\nReversing direction of edges...");
@@ -28,23 +28,28 @@ fn main() {
});
println!("Done.\r\nPreparing for Dijkstra's algorithm...");
//Since the graph is reversed, this is really the end point
let start = *graph.nodes.par_iter()
.find_any(|(_, v)| { v == &&"vacuum tube".to_string() })
.unwrap().0;
let start = *graph
.nodes
.par_iter()
.find_any(|(_, v)| v == &&"vacuum tube".to_string())
.unwrap()
.0;
// Distances
let mut distances: FnvHashMap<usize, usize> = graph.nodes.par_iter().map(|(k, _)| {
(*k, std::usize::MAX)
}).collect();
let mut distances: FnvHashMap<u32, u8> = graph
.nodes
.par_iter()
.map(|(k, _)| (*k, std::u8::MAX))
.collect();
distances.insert(start, 0);
// Neighbours
let mut all_neighbours: FnvHashMap<usize, Vec<(usize, usize)>> = FnvHashMap::default();
// FnvHashMap<from, Vec<to, weight>>
let mut all_neighbours: FnvHashMap<u32, Vec<(u32, u8)>> = FnvHashMap::default();
for e in &graph.edges {
if !all_neighbours.contains_key(&e.from) {
all_neighbours.insert(e.from, vec![(e.to, e.weight)]);
}
else {
} else {
// TODO we can speed this up by getting the mutable directly
let mut v = all_neighbours.remove(&e.from).unwrap();
v.push((e.to, e.weight));
@@ -52,7 +57,7 @@ fn main() {
}
}
graph.edges = vec![]; // We no longer need the edges, so let's save some RAM
// Make sure every node is in all_neighbours.
// Make sure every node is in all_neighbours.
for (&n, _) in &graph.nodes {
if !all_neighbours.contains_key(&n) {
all_neighbours.insert(n, vec![]);
@@ -60,11 +65,11 @@ fn main() {
}
//Unvisited nodes
let mut unvisited_nodes: FnvHashMap<usize, ()> = graph.nodes.par_iter().map(|(k, _)| { (*k, ()) }).collect();
let mut unvisited_nodes: FnvHashSet<u32> = graph.nodes.par_iter().map(|(k, _)| *k).collect();
let mut slightly_visited_nodes: MiniVec = MiniVec::new(HashMap::new());
println!("Done.\r\nPerforming Dijkstra's algorithm...");
let mut last_print = std::usize::MAX;
let mut cur_node = start;
while unvisited_nodes.len() > 0 {
@@ -72,7 +77,9 @@ fn main() {
let neighbours = &all_neighbours[&cur_node];
//println!("Executing neighbours...");
neighbours.iter().for_each(|(n, weight)| {
if !unvisited_nodes.contains_key(n) { return; }
if !unvisited_nodes.contains(n) {
return;
}
let new_dist = distances[&cur_node] + weight; // Graph is unweighted, so distance + 1
let old_dist = distances[n];
if new_dist < old_dist {
@@ -85,7 +92,7 @@ fn main() {
}
});
//println!("removing some stuff");
unvisited_nodes.remove(&cur_node).unwrap();
unvisited_nodes.remove(&cur_node);
// Node is now *too* visited, so we should remove it
slightly_visited_nodes.remove_element(cur_node);
@@ -94,17 +101,22 @@ fn main() {
println!("{} nodes remaining.", last_print);
}
if slightly_visited_nodes.len() == 0 { break; }
if slightly_visited_nodes.len() == 0 {
break;
}
cur_node = slightly_visited_nodes.get_min_distance();
//println!("Done");
}
println!("Done! Grabbing distances...");
for (id, dist) in distances {
if dist > 5 && dist < std::usize::MAX {
if dist > 5 && dist < std::u8::MAX {
let s = &graph.nodes[&id];
if !s.starts_with("category:") && !s.starts_with("wikipedia:") && !s.starts_with("template:") {
if !s.starts_with("category:")
&& !s.starts_with("wikipedia:")
&& !s.starts_with("template:")
{
println!("{} away: {}", dist, s);
}
}
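One caveat of the narrower type: distances now cap at 255, and `u8::MAX` doubles as the "unvisited" sentinel, so `distances[&cur_node] + weight` would overflow (a panic in debug builds) if a real distance ever approached the cap. Wikipedia's link graph is far shallower than that, but a defensive variant is a saturating add; a sketch, not what the commit does:

fn main() {
    let dist: u8 = u8::MAX; // sentinel distance of an unreached node
    let weight: u8 = 1;
    // dist + weight would panic in debug builds; saturating_add pins the
    // result at u8::MAX, which the `new_dist < old_dist` check then ignores.
    let new_dist = dist.saturating_add(weight);
    assert_eq!(u8::MAX, new_dist);
}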


src/bin/wiki2graph.rs (+32 -37)

@@ -1,16 +1,16 @@
extern crate wikigraph;
use wikigraph::process::{ GraphIntermediate, process_graph, save_graph };
use wikigraph::process::{process_graph, save_graph, GraphIntermediate};
use wikigraph::wiki::WikiPage;
use rayon::ThreadPoolBuilder;
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::sync::{ Arc, Mutex };
use std::env;
use std::sync::{Arc, Mutex};
use xml::reader::{ EventReader, XmlEvent };
use xml::reader::{EventReader, XmlEvent};
//Where are we in the file?
enum FileState {
@@ -18,14 +18,17 @@ enum FileState {
InTitle,
InPage,
InPageTitle,
InPageBody
InPageBody,
}
fn main() {
//Process command line args
let args: Vec<String> = env::args().collect();
if args.len() != 3 {
println!("Usage: ./{} wikipedia_xml_file output_bincode_file", args[0]);
println!(
"Usage: ./{} wikipedia_xml_file output_bincode_file",
args[0]
);
return;
}
@@ -33,7 +36,7 @@ fn main() {
let output_filename = &args[2];
println!("Converting {} to {}", input_filename, output_filename);
let file = File::open(input_filename).unwrap();
let file = BufReader::new(file);
@@ -49,25 +52,17 @@ fn main() {
// Keeps the job queue from overflowing
// Amazingly, Rayon has absolutely no way of dealing with this
// There isn't even a method to get the current number of jobs in the queue ):
while pool.current_thread_has_pending_tasks().unwrap() { }
while pool.current_thread_has_pending_tasks().unwrap() {}
match e {
Ok(XmlEvent::StartElement { name, .. }) => {
let name = name.local_name;
// println!("{}", name);
state = match (&state, name.as_str()) {
(FileState::InRoot, "sitename") => {
FileState::InTitle
}
(FileState::InRoot, "page") => {
FileState::InPage
}
(FileState::InPage, "title") => {
FileState::InPageTitle
}
(FileState::InPage, "text") => {
FileState::InPageBody
}
_ => { state }
(FileState::InRoot, "sitename") => FileState::InTitle,
(FileState::InRoot, "page") => FileState::InPage,
(FileState::InPage, "title") => FileState::InPageTitle,
(FileState::InPage, "text") => FileState::InPageBody,
_ => state,
}
}
Ok(XmlEvent::EndElement { name, .. }) => {
@@ -79,8 +74,10 @@ fn main() {
(FileState::InPage, "page") => {
state = FileState::InRoot;
// Process page
scope.spawn(|_| { process_graph(graph.clone(), cur_page); });
scope.spawn(|_| {
process_graph(graph.clone(), cur_page);
});
// Reset page
cur_page = WikiPage::new();
}
@@ -95,25 +92,23 @@ fn main() {
_ => {}
}
}
Ok(XmlEvent::Characters(data)) => {
match state {
FileState::InTitle => {
println!("Title: {}", data);
}
FileState::InPageTitle => {
cur_page.title += &data;
}
FileState::InPageBody => {
cur_page.add_content(&data);
}
_ => { }
Ok(XmlEvent::Characters(data)) => match state {
FileState::InTitle => {
println!("Title: {}", data);
}
}
FileState::InPageTitle => {
cur_page.title += &data;
}
FileState::InPageBody => {
cur_page.add_content(&data);
}
_ => {}
},
Err(e) => {
println!("Error: {}", e);
break;
}
_ => { }
_ => {}
}
}
});


src/extractor.rs (+89 -41)

@@ -2,11 +2,13 @@
use regex::Regex;
pub fn extract_links(input: &str) -> Vec<(String, usize)> {
pub fn extract_links(input: &str) -> Vec<(String, u8)> {
lazy_static! {
static ref NOWIKI_RE: Regex = Regex::new("<nowiki>.*</nowiki>").unwrap();
static ref LINK_RE: Regex = Regex::new(r#"\[\[(()|([^#].*?))(#.*?)?(\|(.*?))?\]\]"#).unwrap();
static ref REDIRECT_RE: Regex = Regex::new(r"(\s*)#redirect(\s*)\[\[(.*)\]\](\s*)").unwrap();
static ref LINK_RE: Regex =
Regex::new(r#"\[\[(()|([^#].*?))(#.*?)?(\|(.*?))?\]\]"#).unwrap();
static ref REDIRECT_RE: Regex =
Regex::new(r"(\s*)#redirect(\s*)\[\[(.*)\]\](\s*)").unwrap();
static ref TRANSCLUDE_RE: Regex = Regex::new(r"\{\{(.*?)(\|(.*?))?\}\}").unwrap();
static ref WP_RE: Regex = Regex::new(r"(?i)WP:(\s*)(?P<b>.*)").unwrap();
}
@@ -17,74 +19,115 @@ pub fn extract_links(input: &str) -> Vec<(String, usize)> {
// Are we a redirect?
if REDIRECT_RE.is_match(&new_text.to_lowercase()) {
match LINK_RE.captures_iter(&new_text).next() {
Some(x) => { return vec![(x[1].trim().to_string(), 0)]; }
None => { println!("ERROR: Something went wrong with {}", input) }
Some(x) => {
return vec![(x[1].trim().to_string(), 0)];
}
None => println!("ERROR: Something went wrong with {}", input),
}
}
let mut ret: Vec<(String, usize)> = LINK_RE.captures_iter(&new_text).map(|cap| {
let mut s = cap[1].trim();
if s.starts_with(":") { s = remove_first(s).unwrap(); }
(s.to_string(), 1)
}).filter(|x| {
let s = x.0.to_lowercase();
s != "" &&
!s.starts_with("special:") &&
!s.starts_with("media:") &&
!s.starts_with("file:")
}).collect();
ret.extend(TRANSCLUDE_RE.captures_iter(&new_text).map(|cap| {
let mut s = cap[1].trim().to_string();
if s.to_lowercase().starts_with("wp:") { s = WP_RE.replace_all(&s, "Wikipedia:$b").to_string(); }
else if s.starts_with(":") { s = remove_first(&s).unwrap().to_string() }
else { s = format!("Template:{}", s); }
(s, 0)
}).collect::<Vec<(String, usize)>>());
let mut ret: Vec<(String, u8)> = LINK_RE
.captures_iter(&new_text)
.map(|cap| {
let mut s = cap[1].trim();
if s.starts_with(":") {
s = remove_first(s).unwrap();
}
(s.to_string(), 1)
})
.filter(|x| {
let s = x.0.to_lowercase();
s != ""
&& !s.starts_with("special:")
&& !s.starts_with("media:")
&& !s.starts_with("file:")
})
.collect();
ret.extend(
TRANSCLUDE_RE
.captures_iter(&new_text)
.map(|cap| {
let mut s = cap[1].trim().to_string();
if s.to_lowercase().starts_with("wp:") {
s = WP_RE.replace_all(&s, "Wikipedia:$b").to_string();
} else if s.starts_with(":") {
s = remove_first(&s).unwrap().to_string()
} else {
s = format!("Template:{}", s);
}
(s, 0)
})
.collect::<Vec<(String, u8)>>(),
);
ret
}
fn remove_first(s: &str) -> Option<&str> {
s.chars().next().map(|c| &s[c.len_utf8()..])
s.chars().next().map(|c| &s[c.len_utf8()..])
}
#[cfg(test)]
mod tests {
use crate::extractor::extract_links;
#[test]
fn no_links() {
assert_eq!(Vec::<(String, usize)>::new(), extract_links("Here is some text that contains no links"));
assert_eq!(
Vec::<(String, u8)>::new(),
extract_links("Here is some text that contains no links")
);
}
#[test]
fn three_links() {
assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
extract_links("The quick brown [[alpha]] fox jumped over the [[beta ]] [[ gamma ]]"));
assert_eq!(
vec![
("alpha".to_string(), 1),
("beta".to_string(), 1),
("gamma".to_string(), 1)
],
extract_links("The quick brown [[alpha]] fox jumped over the [[beta ]] [[ gamma ]]")
);
}
#[test]
fn links_with_captions() {
assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1), ("gamma".to_string(), 1)],
extract_links("The quick brown [[alpha]] fox jumped over the [[beta | lazy]] [[gamma|dog]]"));
assert_eq!(
vec![
("alpha".to_string(), 1),
("beta".to_string(), 1),
("gamma".to_string(), 1)
],
extract_links(
"The quick brown [[alpha]] fox jumped over the [[beta | lazy]] [[gamma|dog]]"
)
);
}
#[test]
fn escaped_links() {
assert_eq!(Vec::<(String, usize)>::new(), extract_links("<nowiki> this is a [[test]]</nowiki>"));
assert_eq!(
Vec::<(String, u8)>::new(),
extract_links("<nowiki> this is a [[test]]</nowiki>")
);
}
#[test]
fn empty_link() {
assert_eq!(Vec::<(String, usize)>::new(),
extract_links("I'm not even sure if this is legal [[]] [[ ]]"));
assert_eq!(
Vec::<(String, u8)>::new(),
extract_links("I'm not even sure if this is legal [[]] [[ ]]")
);
}
#[test]
fn no_caption() {
assert_eq!(vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
extract_links("The quick brown [[alpha|]] fox jumped over the lazy dog [[beta | ]]"));
assert_eq!(
vec![("alpha".to_string(), 1), ("beta".to_string(), 1)],
extract_links("The quick brown [[alpha|]] fox jumped over the lazy dog [[beta | ]]")
);
}
#[test]
@@ -107,7 +150,10 @@ mod tests {
#[test]
fn test_redirects() {
assert_eq!(vec![("Dentistry".to_string(), 0)], extract_links("\r\n #rEdIrEcT [[Dentistry]]"));
assert_eq!(
vec![("Dentistry".to_string(), 0)],
extract_links("\r\n #rEdIrEcT [[Dentistry]]")
);
}
#[test]
@@ -122,7 +168,9 @@ mod tests {
#[test]
fn test_no_files() {
assert_eq!(vec![("alpha".to_string(), 1)],
extract_links("test [[alpha]] whatever [[FiLe: beta]] abc [[ file:Gamma]]"));
assert_eq!(
vec![("alpha".to_string(), 1)],
extract_links("test [[alpha]] whatever [[FiLe: beta]] abc [[ file:Gamma]]")
);
}
}
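For orientation, the second element of each returned pair encodes the link kind: ordinary `[[...]]` links carry weight 1, while redirects and `{{...}}` transclusions carry weight 0. A usage sketch assembled from the test cases above:

use wikigraph::extractor::extract_links;

fn main() {
    // Ordinary link: weight 1, captions stripped.
    assert_eq!(
        vec![("alpha".to_string(), 1)],
        extract_links("The quick brown [[alpha|fox]]")
    );
    // Redirect: the whole page collapses to a single weight-0 edge.
    assert_eq!(
        vec![("Dentistry".to_string(), 0)],
        extract_links("\r\n #rEdIrEcT [[Dentistry]]")
    );
    // Transclusion: rewritten to a Template: link with weight 0.
    assert_eq!(
        vec![("Template:Citation needed".to_string(), 0)],
        extract_links("{{Citation needed}}")
    );
}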

src/lib.rs (+2 -2)

@@ -3,7 +3,7 @@ extern crate xml;
#[macro_use]
extern crate lazy_static;
pub mod wiki;
pub mod process;
pub mod extractor;
pub mod minivec;
pub mod process;
pub mod wiki;

src/minivec.rs (+50 -44)

@@ -1,21 +1,24 @@
use std::collections::{BTreeMap, HashMap};
use std::collections::{BTreeMap, HashMap, HashSet};
#[derive(Debug)]
pub struct MiniVec {
next_idx: usize, // Keeps track of what idx to use next
sorted_by_id: HashMap<usize, usize>, // id, idx
sorted_by_distance: BTreeMap<usize, HashMap<usize, ()>>, // distance, [idx]
idx_to_id: HashMap<usize, usize>, // idx, id
id_to_distance: HashMap<usize, usize> // id, distance
next_idx: usize, // Keeps track of what idx to use next
sorted_by_id: HashMap<u32, usize>, // id, idx
sorted_by_distance: BTreeMap<u8, HashSet<usize>>, // distance, [idx]
idx_to_id: HashMap<usize, u32>, // idx, id
id_to_distance: HashMap<u32, u8>, // id, distance
}
impl MiniVec {
pub fn new(hsh: HashMap<usize, usize>) -> Self {
let mut ret = Self { next_idx: 0,
sorted_by_id: HashMap::new(),
sorted_by_distance: BTreeMap::new(),
idx_to_id: HashMap::new(),
id_to_distance: HashMap::new() };
// Takes in hashmap of <node_id, distance>
pub fn new(hsh: HashMap<u32, u8>) -> Self {
let mut ret = Self {
next_idx: 0,
sorted_by_id: HashMap::new(),
sorted_by_distance: BTreeMap::new(),
idx_to_id: HashMap::new(),
id_to_distance: HashMap::new(),
};
for (k, v) in hsh {
let idx = ret.next_idx;
ret.next_idx += 1;
@@ -28,11 +31,11 @@ impl MiniVec {
ret
}
pub fn insert_element(&mut self, id: usize, dist: usize) {
pub fn insert_element(&mut self, id: u32, dist: u8) {
if self.sorted_by_id.contains_key(&id) {
self.remove_element(id);
}
let idx = self.next_idx;
self.next_idx += 1;
self.sorted_by_id.insert(id, idx);
@@ -41,24 +44,31 @@ impl MiniVec {
self.id_to_distance.insert(id, dist);
}
pub fn remove_element(&mut self, id: usize) {
pub fn remove_element(&mut self, id: u32) {
if let Some(idx) = self.sorted_by_id.remove(&id) {
let dist = self.id_to_distance.remove(&id).unwrap();
self.idx_to_id.remove(&idx);
let a = self.sorted_by_distance.get_mut(&dist).unwrap();
if a.len() > 1 {
a.remove(&idx);
}
else {
} else {
self.sorted_by_distance.remove(&dist);
}
}
}
pub fn get_min_distance(&self) -> usize {
match &self.sorted_by_distance.first_key_value().unwrap().1.keys().next() {
// Returns id of node with min distance
pub fn get_min_distance(&self) -> u32 {
match &self
.sorted_by_distance
.first_key_value()
.unwrap()
.1
.iter()
.next()
{
Some(x) => self.idx_to_id[x],
None => { panic!("ERR: Something went wrong! {:?}", self)}
None => panic!("ERR: Something went wrong! {:?}", self),
}
//self.idx_to_id[&self.sorted_by_distance.first_key_value().unwrap().1.keys().next().unwrap()]
}
@@ -67,16 +77,15 @@ impl MiniVec {
self.idx_to_id.len()
}
fn insert_idx_into_dist(&mut self, idx: usize, dist: usize) {
fn insert_idx_into_dist(&mut self, idx: usize, dist: u8) {
//println!("idx->dist");
if self.sorted_by_distance.contains_key(&dist) {
//println!("a");
self.sorted_by_distance.get_mut(&dist).unwrap().insert(idx, ());
//println!("d");
}
else {
let mut bt: HashMap<usize, ()> = HashMap::new();
bt.insert(idx, ());
self.sorted_by_distance.get_mut(&dist).unwrap().insert(idx);
//println!("d");
} else {
let mut bt: HashSet<usize> = HashSet::new();
bt.insert(idx);
self.sorted_by_distance.insert(dist, bt);
}
//println!("ok");
@@ -86,11 +95,11 @@
#[cfg(test)]
mod tests {
use crate::minivec::MiniVec;
use std::collections::{ BTreeMap, HashMap };
use std::collections::{BTreeMap, HashMap};
#[test]
fn basic_test() {
let mut hm: HashMap<usize, usize> = HashMap::new();
let mut hm: HashMap<u32, u8> = HashMap::new();
hm.insert(0, 1);
hm.insert(2, 3);
hm.insert(5, 3);
@@ -99,44 +108,41 @@ mod tests {
let mut mv = MiniVec::new(hm);
assert_eq!(0, mv.get_min_distance());
mv.remove_element(0);
mv.remove_element(5);
mv.insert_element(0, 3); // Updating an existing node
let mut ids: Vec<usize> = mv.sorted_by_distance[&3]
.keys()
.map(|idx| {
mv.idx_to_id[idx]
})
let mut ids: Vec<u32> = mv.sorted_by_distance[&3]
.iter()
.map(|idx| mv.idx_to_id[idx])
.collect();
ids.sort();
assert_eq!(vec![0, 2], ids);
let min_dist = mv.get_min_distance();
assert!(min_dist == 0 || min_dist == 2); // Could be either, since they're the same dist
mv.remove_element(0);
mv.insert_element(10, 2);
mv.remove_element(3);
mv.remove_element(2);
mv.insert_element(12, 0);
assert_eq!(12, mv.get_min_distance());
mv.remove_element(10);
mv.remove_element(12);
mv.remove_element(12);
let empty_hm: HashMap<usize, usize> = HashMap::new();
let empty_bt: BTreeMap<usize, HashMap<usize, ()>> = BTreeMap::new();
let empty_hm = HashMap::new();
let empty_bt = BTreeMap::new();
assert_eq!(8, mv.next_idx);
assert_eq!(empty_hm, mv.sorted_by_id);
assert_eq!(empty_bt, mv.sorted_by_distance);
assert_eq!(empty_hm, mv.idx_to_id);
assert_eq!(empty_hm, mv.id_to_distance);
assert_eq!(HashMap::new(), mv.idx_to_id);
assert_eq!(HashMap::new(), mv.id_to_distance);
}
}
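MiniVec is essentially a priority map: the `BTreeMap` keyed by distance gives an O(log n) minimum lookup, and the side maps support removal by node id (Dijkstra's decrease-key). For comparison only, not what this repository uses: the same frontier is often built on `std::collections::BinaryHeap` with `Reverse` ordering and lazy deletion of stale entries. A sketch with the same u32 id / u8 distance types:

use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};

fn main() {
    // Min-heap of (distance, node id); stale entries are skipped on pop.
    let mut heap: BinaryHeap<Reverse<(u8, u32)>> = BinaryHeap::new();
    let mut best: HashMap<u32, u8> = HashMap::new();

    for (id, dist) in [(0u32, 1u8), (2, 3), (5, 3)] {
        heap.push(Reverse((dist, id)));
        best.insert(id, dist);
    }
    // "Decrease key" = push a fresh entry; the old one goes stale.
    heap.push(Reverse((0, 5)));
    best.insert(5, 0);

    while let Some(Reverse((dist, id))) = heap.pop() {
        if best.get(&id) != Some(&dist) {
            continue; // stale entry, superseded by a later push
        }
        println!("min: node {} at distance {}", id, dist);
        break;
    }
}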

src/process.rs (+41 -35)

@@ -1,12 +1,13 @@
use crate::extractor::extract_links;
use crate::wiki::WikiPage;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::collections::HashMap;
use std::io::BufWriter;
use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;
use std::io::BufWriter;
use std::sync::{Arc, Mutex};
use rayon::prelude::*;
@@ -15,46 +16,43 @@ use rayon::prelude::*;
pub struct MyEdge<T> {
pub from: T,
pub to: T,
pub weight: usize
pub weight: u8,
}
impl<T> MyEdge<T> {
fn new(from: T, to: T, weight: usize) -> Self {
fn new(from: T, to: T, weight: u8) -> Self {
Self { from, to, weight }
}
}
#[derive(Serialize, Deserialize)]
pub struct Graph {
pub edges: Vec<MyEdge<usize>>,
pub nodes: HashMap<usize, String>
pub edges: Vec<MyEdge<u32>>,
pub nodes: HashMap<u32, String>,
}
impl Graph {
fn new(nodes: HashMap<usize, String>, edges: Vec<MyEdge<usize>>) -> Self {
Self {
edges,
nodes
}
fn new(nodes: HashMap<u32, String>, edges: Vec<MyEdge<u32>>) -> Self {
Self { edges, nodes }
}
pub fn load_bin(filename: &str) -> Self {
let mut buffer = Vec::new();
File::open(filename).unwrap().read_to_end(&mut buffer).unwrap();
bincode::deserialize(&mut buffer).unwrap()
let file = File::open(filename).unwrap();
bincode::deserialize_from(BufReader::new(file)).unwrap()
}
pub fn neighbours(&self, node: usize) -> Vec<usize> {
self.edges.par_iter()
.filter(|e| { e.from == node })
.map(|e| { e.to })
pub fn neighbours(&self, node: u32) -> Vec<u32> {
self.edges
.par_iter()
.filter(|e| e.from == node)
.map(|e| e.to)
.collect()
}
}
pub struct GraphIntermediate {
pub edges: Vec<MyEdge<usize>>,
pub nodes: HashMap<String, usize>
pub edges: Vec<MyEdge<u32>>,
pub nodes: HashMap<String, u32>,
}
impl GraphIntermediate {
@@ -72,25 +70,27 @@ pub fn process_graph(graph: Arc<Mutex<GraphIntermediate>>, page: WikiPage) {
links.iter().for_each(|(target, weight)| {
// Check if we have an existing node. If not, add one
let target_lwr = target.to_lowercase();
if !graph.nodes.contains_key(&page.title.to_lowercase()) {
let len = graph.nodes.len();
let len = graph.nodes.len() as u32;
graph.nodes.insert(page.title.to_lowercase(), len);
}
if !graph.nodes.contains_key(&target_lwr) {
let len = graph.nodes.len();
let len = graph.nodes.len() as u32;
graph.nodes.insert(target_lwr.clone(), len);
}
let start_key = *graph.nodes.get(&page.title.to_lowercase()).unwrap();
let end_key = *graph.nodes.get(&target_lwr).unwrap();
graph.edges.push(MyEdge::new(start_key, end_key, *weight));
// println!("Target: {}", target_lwr);
if graph.edges.len() % 10000 == 0 /*|| graph.edges.len() > 58730000*/ {
if graph.edges.len() % 10000 == 0
/*|| graph.edges.len() > 58730000*/
{
println!("{} nodes, {} edges.", graph.nodes.len(), graph.edges.len());
println!("{}", page.title);
}
@@ -99,16 +99,22 @@ pub fn process_graph(graph: Arc<Mutex<GraphIntermediate>>, page: WikiPage) {
pub fn save_graph(graph_og: GraphIntermediate, filename: &str) {
//Create a graph from our graph intermediate
let graph = Graph::new(graph_og.nodes.iter().map(|(k, v)|{
(*v, k.clone())
}).collect::<HashMap<usize, String>>(),
graph_og.edges.clone());
let graph = Graph::new(
graph_og
.nodes
.iter()
.map(|(k, v)| (*v, k.clone()))
.collect::<HashMap<u32, String>>(),
graph_og.edges.clone(),
);
std::mem::drop(graph_og); // Save some RAM
let mut buffer = BufWriter::new(File::create(filename).unwrap());
/* BINCODE */
buffer.write_all(&bincode::serialize(&graph).unwrap()).unwrap();
buffer
.write_all(&bincode::serialize(&graph).unwrap())
.unwrap();
/* JSON */
//buffer.write_all(serde_json::to_string_pretty(&graph).unwrap().as_bytes()).unwrap();
buffer.flush().unwrap();
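`load_bin` now streams through `bincode::deserialize_from` on a `BufReader` instead of reading the whole file into a `Vec<u8>` first, so the graph is never held in memory twice during loading. A round-trip sketch against the same bincode 1.x API (the file name and `Tiny` type are made up for illustration):

use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufReader, BufWriter};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Tiny {
    ids: Vec<u32>,
}

fn main() {
    let value = Tiny { ids: vec![1, 2, 3] };

    // Write: serialize straight into a buffered writer (flushed on drop).
    let out = BufWriter::new(File::create("tiny.bin").unwrap());
    bincode::serialize_into(out, &value).unwrap();

    // Read: deserialize straight from a buffered reader, no intermediate Vec.
    let inp = BufReader::new(File::open("tiny.bin").unwrap());
    let back: Tiny = bincode::deserialize_from(inp).unwrap();
    assert_eq!(value, back);
}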


src/wiki.rs (+2 -2)

@@ -1,7 +1,7 @@
pub struct WikiPage {
pub title: String,
pub body: String,
pub frozen: bool
pub frozen: bool,
}
impl WikiPage {
@@ -9,7 +9,7 @@ impl WikiPage {
Self {
title: "".to_string(),
body: "".to_string(),
frozen: false
frozen: false,
}
}

