1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
//! When we are combining two NFAs, we will grab all the outgoing
//! edges from a set of nodes and wind up with a bunch of potentially
//! overlapping character ranges like:
//!
//! a-z
//! c-l
//! 0-9
//!
//! This module contains code to turn those into non-overlapping ranges like:
//!
//! a-b
//! c-l
//! m-z
//! 0-9
//!
//! Specifically, we want to ensure that the same set of characters is
//! covered when we started, and that each of the input ranges is
//! covered precisely by some set of ranges in the output.
use crate::collections::Set;
use crate::lexer::nfa::Test;
use std::cmp;
use std::ops::RangeInclusive;
pub fn remove_overlap(ranges: &Set<Test>) -> Vec<Test> {
// We will do this in the dumbest possible way to start. :)
// Maintain a result vector that contains disjoint ranges. To
// insert a new range, we walk over this vector and split things
// up as we go. This algorithm is so naive as to be exponential, I
// think. Sue me.
let mut disjoint_ranges = vec![];
for range in ranges {
add_range(range.to_owned(), 0, &mut disjoint_ranges);
}
// the algorithm above leaves some empty ranges in for simplicity;
// prune them out.
disjoint_ranges.retain(|r| !r.is_empty());
disjoint_ranges.sort();
disjoint_ranges
}
fn add_range(range: Test, start_index: usize, disjoint_ranges: &mut Vec<Test>) {
if range.is_empty() {
return;
}
// Find first overlapping range in `disjoint_ranges`, if any.
match disjoint_ranges[start_index..]
.iter()
.position(|r| r.intersects(&range))
{
Some(index) => {
let index = index + start_index;
let overlapping_range = &disjoint_ranges[index];
// If the range we are trying to add already exists, we're all done.
if overlapping_range == &range {
return;
}
// Otherwise, we want to create three ranges (some of which may
// be empty). e.g. imagine one range is `a-z` and the other
// is `c-l`, we want `a-b`, `c-l`, and `m-z`.
let min_min = cmp::min(range.start(), overlapping_range.start());
let mid_min = cmp::max(range.start(), overlapping_range.start());
let mid_max = cmp::min(range.end(), overlapping_range.end());
let max_max = cmp::max(range.end(), overlapping_range.end());
// When working with inclusive ranges, we need to be sure to not double count
// the meeting points of low-mid_range and mid-max_range.
// So we adjust the end of the low_range and start of max_range as these elements are already included in the start of their corresponding next ranges.
let low_range = if mid_min == 0 {
// This is an edgecase where both ranges start at the null character
// In this case we don't want to create a range from 0 to -1
// Thus we create an empty range
Test::new(RangeInclusive::new(1, 0))
} else {
Test::new(min_min..=mid_min - 1)
};
let mid_range = Test::new(mid_min..=mid_max);
let max_range = Test::new(mid_max + 1..=max_max);
assert!(low_range.is_disjoint(&mid_range));
assert!(low_range.is_disjoint(&max_range));
assert!(mid_range.is_disjoint(&max_range));
// Replace the existing range with the low range, and then
// add the mid and max ranges in. (The low range may be
// empty, but we'll prune that out later.)
disjoint_ranges[index] = low_range;
add_range(mid_range, index + 1, disjoint_ranges);
add_range(max_range, index + 1, disjoint_ranges);
}
None => {
// no overlap -- easy case.
disjoint_ranges.push(range);
}
}
}
#[cfg(test)]
macro_rules! test {
($($range:expr,)*) => {
{
use crate::collections::set;
use crate::lexer::nfa::Test;
use std::char;
let mut s = set();
$({ let r = $range; s.insert(Test::inclusive_range(*r.start(), *r.end())); })*
remove_overlap(&s).into_iter()
.map(|r|
char::from_u32(r.start()).unwrap() ..=
char::from_u32(r.end()).unwrap())
.collect::<Vec<_>>()
}
}
}
#[test]
fn alphabet() {
let result = test! {
'a' ..= 'z',
'c' ..= 'l',
'0' ..= '9',
};
assert_eq!(result, vec!['0'..='9', 'a'..='b', 'c'..='l', 'm'..='z']);
}
#[test]
fn repeat() {
let result = test! {
'a' ..= 'z',
'c' ..= 'l',
'l' ..= 'z',
'0' ..= '9',
};
assert_eq!(
result,
vec!['0'..='9', 'a'..='b', 'c'..='k', 'l'..='l', 'm'..='z']
);
}
#[test]
fn stagger() {
let result = test! {
'0' ..= '3',
'2' ..= '4',
'3' ..= '5',
};
assert_eq!(
result,
vec!['0'..='1', '2'..='2', '3'..='3', '4'..='4', '5'..='5']
);
}
#[test]
fn empty_range() {
let result = test! {
'b' ..= 'b',
'a' ..= 'z',
};
assert_eq!(result, vec!['a'..='a', 'b'..='b', 'c'..='z']);
}
#[test]
fn null() {
let result = test! {
'\0' ..= '\0',
'\0' ..= 'a',
};
assert_eq!(result, vec!['\0'..='\0', 1 as char..='a']);
}