diff --git a/mk/crates.mk b/mk/crates.mk index 5957405f0f9..be1965b7eda 100644 --- a/mk/crates.mk +++ b/mk/crates.mk @@ -51,7 +51,7 @@ TARGET_CRATES := libc std flate arena term \ serialize getopts collections test rand \ - log regex graphviz core rbml alloc \ + log graphviz core rbml alloc \ unicode rustc_bitflags RUSTC_CRATES := rustc rustc_typeck rustc_borrowck rustc_resolve rustc_driver \ rustc_trans rustc_back rustc_llvm rustc_privacy @@ -95,16 +95,15 @@ DEPS_term := std log DEPS_getopts := std DEPS_collections := core alloc unicode DEPS_num := std -DEPS_test := std getopts serialize rbml term regex native:rust_test_helpers +DEPS_test := std getopts serialize rbml term native:rust_test_helpers DEPS_rand := core -DEPS_log := std regex -DEPS_regex := std +DEPS_log := std DEPS_fmt_macros = std TOOL_DEPS_compiletest := test getopts TOOL_DEPS_rustdoc := rustdoc TOOL_DEPS_rustc := rustc_driver -TOOL_DEPS_rustbook := std regex rustdoc +TOOL_DEPS_rustbook := std rustdoc TOOL_SOURCE_compiletest := $(S)src/compiletest/compiletest.rs TOOL_SOURCE_rustdoc := $(S)src/driver/driver.rs TOOL_SOURCE_rustc := $(S)src/driver/driver.rs @@ -130,9 +129,8 @@ DOC_CRATES := $(filter-out rustc, \ $(filter-out rustc_driver, \ $(filter-out rustc_privacy, \ $(filter-out log, \ - $(filter-out regex, \ $(filter-out getopts, \ - $(filter-out syntax, $(CRATES)))))))))))) + $(filter-out syntax, $(CRATES))))))))))) COMPILER_DOC_CRATES := rustc rustc_trans rustc_borrowck rustc_resolve \ rustc_typeck rustc_driver syntax rustc_privacy diff --git a/src/compiletest/common.rs b/src/compiletest/common.rs index c21785c45a3..1f4f444634d 100644 --- a/src/compiletest/common.rs +++ b/src/compiletest/common.rs @@ -11,7 +11,6 @@ pub use self::Mode::*; use std::fmt; use std::str::FromStr; -use regex::Regex; #[derive(Clone, PartialEq, Debug)] pub enum Mode { @@ -101,10 +100,7 @@ pub struct Config { pub run_ignored: bool, // Only run tests that match this filter - pub filter: Option, - - // Precompiled regex for finding expected errors in cfail - pub cfail_regex: Regex, + pub filter: Option, // Write out a parseable log of tests that were run pub logfile: Option, diff --git a/src/compiletest/compiletest.rs b/src/compiletest/compiletest.rs index b3f0034ca89..4659af4416b 100644 --- a/src/compiletest/compiletest.rs +++ b/src/compiletest/compiletest.rs @@ -22,7 +22,6 @@ extern crate getopts; #[macro_use] extern crate log; -extern crate regex; use std::os; use std::io; @@ -33,7 +32,6 @@ use getopts::{optopt, optflag, reqopt}; use common::Config; use common::{Pretty, DebugInfoGdb, DebugInfoLldb, Codegen}; use util::logv; -use regex::Regex; pub mod procsrv; pub mod util; @@ -116,14 +114,7 @@ pub fn parse_config(args: Vec ) -> Config { } let filter = if !matches.free.is_empty() { - let s = matches.free[0].as_slice(); - match regex::Regex::new(s) { - Ok(re) => Some(re), - Err(e) => { - println!("failed to parse filter /{}/: {:?}", s, e); - panic!() - } - } + Some(matches.free[0].clone()) } else { None }; @@ -145,7 +136,6 @@ pub fn parse_config(args: Vec ) -> Config { .as_slice()).expect("invalid mode"), run_ignored: matches.opt_present("ignored"), filter: filter, - cfail_regex: Regex::new(errors::EXPECTED_PATTERN).unwrap(), logfile: matches.opt_str("logfile").map(|s| Path::new(s)), runtool: matches.opt_str("runtool"), host_rustcflags: matches.opt_str("host-rustcflags"), @@ -374,18 +364,24 @@ fn extract_gdb_version(full_version_line: Option) -> Option { if full_version_line.as_slice().trim().len() > 0 => { let full_version_line = full_version_line.as_slice().trim(); - let re = Regex::new(r"(^|[^0-9])([0-9]\.[0-9])([^0-9]|$)").unwrap(); - - match re.captures(full_version_line) { - Some(captures) => { - Some(captures.at(2).unwrap_or("").to_string()) + // used to be a regex "(^|[^0-9])([0-9]\.[0-9])([^0-9]|$)" + for (pos, c) in full_version_line.char_indices() { + if !c.is_digit(10) { continue } + if pos + 2 >= full_version_line.len() { continue } + if full_version_line.char_at(pos + 1) != '.' { continue } + if !full_version_line.char_at(pos + 2).is_digit(10) { continue } + if pos > 0 && full_version_line.char_at_reverse(pos).is_digit(10) { + continue } - None => { - println!("Could not extract GDB version from line '{}'", - full_version_line); - None + if pos + 3 < full_version_line.len() && + full_version_line.char_at(pos + 3).is_digit(10) { + continue } + return Some(full_version_line[pos..pos+3].to_string()); } + println!("Could not extract GDB version from line '{}'", + full_version_line); + None }, _ => None } @@ -408,18 +404,26 @@ fn extract_lldb_version(full_version_line: Option) -> Option { if full_version_line.as_slice().trim().len() > 0 => { let full_version_line = full_version_line.as_slice().trim(); - let re = Regex::new(r"[Ll][Ll][Dd][Bb]-([0-9]+)").unwrap(); + for (pos, l) in full_version_line.char_indices() { + if l != 'l' && l != 'L' { continue } + if pos + 5 >= full_version_line.len() { continue } + let l = full_version_line.char_at(pos + 1); + if l != 'l' && l != 'L' { continue } + let d = full_version_line.char_at(pos + 2); + if d != 'd' && d != 'D' { continue } + let b = full_version_line.char_at(pos + 3); + if b != 'b' && b != 'B' { continue } + let dash = full_version_line.char_at(pos + 4); + if dash != '-' { continue } - match re.captures(full_version_line) { - Some(captures) => { - Some(captures.at(1).unwrap_or("").to_string()) - } - None => { - println!("Could not extract LLDB version from line '{}'", - full_version_line); - None - } + let vers = full_version_line[pos + 5..].chars().take_while(|c| { + c.is_digit(10) + }).collect::(); + if vers.len() > 0 { return Some(vers) } } + println!("Could not extract LLDB version from line '{}'", + full_version_line); + None }, _ => None } diff --git a/src/compiletest/errors.rs b/src/compiletest/errors.rs index dcfac688c7f..fc815d66a4d 100644 --- a/src/compiletest/errors.rs +++ b/src/compiletest/errors.rs @@ -9,9 +9,7 @@ // except according to those terms. use self::WhichLine::*; -use std::ascii::AsciiExt; use std::io::{BufferedReader, File}; -use regex::Regex; pub struct ExpectedError { pub line: uint, @@ -19,6 +17,9 @@ pub struct ExpectedError { pub msg: String, } +#[derive(PartialEq, Show)] +enum WhichLine { ThisLine, FollowPrevious(uint), AdjustBackward(uint) } + /// Looks for either "//~| KIND MESSAGE" or "//~^^... KIND MESSAGE" /// The former is a "follow" that inherits its target from the preceding line; /// the latter is an "adjusts" that goes that many lines up. @@ -26,15 +27,8 @@ pub struct ExpectedError { /// Goal is to enable tests both like: //~^^^ ERROR go up three /// and also //~^ ERROR message one for the preceding line, and /// //~| ERROR message two for that same line. - -pub static EXPECTED_PATTERN : &'static str = - r"//~(?P\|)?(?P\^*)\s*(?P\S*)\s*(?P.*)"; - -#[derive(PartialEq, Show)] -enum WhichLine { ThisLine, FollowPrevious(uint), AdjustBackward(uint) } - // Load any test directives embedded in the file -pub fn load_errors(re: &Regex, testfile: &Path) -> Vec { +pub fn load_errors(testfile: &Path) -> Vec { let mut rdr = BufferedReader::new(File::open(testfile).unwrap()); // `last_nonfollow_error` tracks the most recently seen @@ -50,7 +44,7 @@ pub fn load_errors(re: &Regex, testfile: &Path) -> Vec { rdr.lines().enumerate().filter_map(|(line_no, ln)| { parse_expected(last_nonfollow_error, line_no + 1, - ln.unwrap().as_slice(), re) + ln.unwrap().as_slice()) .map(|(which, error)| { match which { FollowPrevious(_) => {} @@ -63,30 +57,39 @@ pub fn load_errors(re: &Regex, testfile: &Path) -> Vec { fn parse_expected(last_nonfollow_error: Option, line_num: uint, - line: &str, - re: &Regex) -> Option<(WhichLine, ExpectedError)> { - re.captures(line).and_then(|caps| { - let adjusts = caps.name("adjusts").unwrap_or("").len(); - let kind = caps.name("kind").unwrap_or("").to_ascii_lowercase(); - let msg = caps.name("msg").unwrap_or("").trim().to_string(); - let follow = caps.name("follow").unwrap_or("").len() > 0; + line: &str) -> Option<(WhichLine, ExpectedError)> { + let start = match line.find_str("//~") { Some(i) => i, None => return None }; + let (follow, adjusts) = if line.char_at(start + 3) == '|' { + (true, 0) + } else { + (false, line[start + 3..].chars().take_while(|c| *c == '^').count()) + }; + let kind_start = start + 3 + adjusts + (follow as usize); + let letters = line[kind_start..].chars(); + let kind = letters.skip_while(|c| c.is_whitespace()) + .take_while(|c| !c.is_whitespace()) + .map(|c| c.to_lowercase()) + .collect::(); + let letters = line[kind_start..].chars(); + let msg = letters.skip_while(|c| c.is_whitespace()) + .skip_while(|c| !c.is_whitespace()) + .collect::().trim().to_string(); - let (which, line) = if follow { - assert!(adjusts == 0, "use either //~| or //~^, not both."); - let line = last_nonfollow_error.unwrap_or_else(|| { - panic!("encountered //~| without preceding //~^ line.") - }); - (FollowPrevious(line), line) - } else { - let which = - if adjusts > 0 { AdjustBackward(adjusts) } else { ThisLine }; - let line = line_num - adjusts; - (which, line) - }; + let (which, line) = if follow { + assert!(adjusts == 0, "use either //~| or //~^, not both."); + let line = last_nonfollow_error.unwrap_or_else(|| { + panic!("encountered //~| without preceding //~^ line.") + }); + (FollowPrevious(line), line) + } else { + let which = + if adjusts > 0 { AdjustBackward(adjusts) } else { ThisLine }; + let line = line_num - adjusts; + (which, line) + }; - debug!("line={} which={:?} kind={:?} msg={:?}", line_num, which, kind, msg); - Some((which, ExpectedError { line: line, - kind: kind, - msg: msg, })) - }) + debug!("line={} which={:?} kind={:?} msg={:?}", line_num, which, kind, msg); + Some((which, ExpectedError { line: line, + kind: kind, + msg: msg, })) } diff --git a/src/compiletest/runtest.rs b/src/compiletest/runtest.rs index f075cff769f..e5a973e7501 100644 --- a/src/compiletest/runtest.rs +++ b/src/compiletest/runtest.rs @@ -99,7 +99,7 @@ fn run_cfail_test(config: &Config, props: &TestProps, testfile: &Path) { } let output_to_check = get_output(props, &proc_res); - let expected_errors = errors::load_errors(&config.cfail_regex, testfile); + let expected_errors = errors::load_errors(testfile); if !expected_errors.is_empty() { if !props.error_patterns.is_empty() { fatal("both error pattern and expected errors specified"); diff --git a/src/grammar/verify.rs b/src/grammar/verify.rs index e9409a61061..1288110df33 100644 --- a/src/grammar/verify.rs +++ b/src/grammar/verify.rs @@ -13,14 +13,11 @@ extern crate syntax; extern crate rustc; -extern crate regex; - #[macro_use] extern crate log; use std::collections::HashMap; use std::io::File; -use regex::Regex; use syntax::parse; use syntax::parse::lexer; @@ -167,15 +164,19 @@ fn count(lit: &str) -> usize { } fn parse_antlr_token(s: &str, tokens: &HashMap) -> TokenAndSpan { - let re = Regex::new( - r"\[@(?P\d+),(?P\d+):(?P\d+)='(?P.+?)',<(?P-?\d+)>,\d+:\d+]" - ).unwrap(); + // old regex: + // \[@(?P\d+),(?P\d+):(?P\d+)='(?P.+?)',<(?P-?\d+)>,\d+:\d+] + let start = s.find_str("[@").unwrap(); + let comma = start + s[start..].find_str(",").unwrap(); + let colon = comma + s[comma..].find_str(":").unwrap(); + let content_start = colon + s[colon..].find_str("='").unwrap(); + let content_end = content_start + s[content_start..].find_str("',<").unwrap(); + let toknum_end = content_end + s[content_end..].find_str(">,").unwrap(); - let m = re.captures(s).expect(format!("The regex didn't match {}", s).as_slice()); - let start = m.name("start").unwrap_or(""); - let end = m.name("end").unwrap_or(""); - let toknum = m.name("toknum").unwrap_or(""); - let content = m.name("content").unwrap_or(""); + let start = &s[comma + 1 .. colon]; + let end = &s[colon + 1 .. content_start]; + let content = &s[content_start + 2 .. content_end]; + let toknum = &s[content_end + 3 .. toknum_end]; let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map", toknum).as_slice()); diff --git a/src/liblog/directive.rs b/src/liblog/directive.rs index d741019aa7b..5efa799f562 100644 --- a/src/liblog/directive.rs +++ b/src/liblog/directive.rs @@ -8,7 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use regex::Regex; use std::ascii::AsciiExt; use std::cmp; @@ -34,7 +33,7 @@ fn parse_log_level(level: &str) -> Option { /// /// Valid log levels are 0-255, with the most likely ones being 1-4 (defined in /// std::). Also supports string log levels of error, warn, info, and debug -pub fn parse_logging_spec(spec: &str) -> (Vec, Option) { +pub fn parse_logging_spec(spec: &str) -> (Vec, Option) { let mut dirs = Vec::new(); let mut parts = spec.split('/'); @@ -80,17 +79,7 @@ pub fn parse_logging_spec(spec: &str) -> (Vec, Option) { }); }}); - let filter = filter.map_or(None, |filter| { - match Regex::new(filter) { - Ok(re) => Some(re), - Err(e) => { - println!("warning: invalid regex filter - {:?}", e); - None - } - } - }); - - return (dirs, filter); + (dirs, filter.map(|s| s.to_string())) } #[cfg(test)] diff --git a/src/liblog/lib.rs b/src/liblog/lib.rs index 4da07c50c59..e7c5bc35f76 100644 --- a/src/liblog/lib.rs +++ b/src/liblog/lib.rs @@ -123,11 +123,11 @@ //! //! # Filtering results //! -//! A RUST_LOG directive may include a regex filter. The syntax is to append `/` -//! followed by a regex. Each message is checked against the regex, and is only -//! logged if it matches. Note that the matching is done after formatting the log -//! string but before adding any logging meta-data. There is a single filter for all -//! modules. +//! A RUST_LOG directive may include a string filter. The syntax is to append +//! `/` followed by a string. Each message is checked against the string and is +//! only logged if it contains the string. Note that the matching is done after +//! formatting the log string but before adding any logging meta-data. There is +//! a single filter for all modules. //! //! Some examples: //! @@ -172,8 +172,6 @@ #![allow(unstable)] #![deny(missing_docs)] -extern crate regex; - use std::cell::RefCell; use std::fmt; use std::io::LineBufferedWriter; @@ -185,8 +183,6 @@ use std::rt; use std::slice; use std::sync::{Once, ONCE_INIT}; -use regex::Regex; - use directive::LOG_LEVEL_NAMES; #[macro_use] @@ -209,8 +205,8 @@ static mut LOG_LEVEL: u32 = MAX_LOG_LEVEL; static mut DIRECTIVES: *const Vec = 0 as *const Vec; -/// Optional regex filter. -static mut FILTER: *const Regex = 0 as *const _; +/// Optional filter. +static mut FILTER: *const String = 0 as *const _; /// Debug log level pub const DEBUG: u32 = 4; @@ -288,7 +284,7 @@ pub fn log(level: u32, loc: &'static LogLocation, args: fmt::Arguments) { // Test the literal string from args against the current filter, if there // is one. match unsafe { FILTER.as_ref() } { - Some(filter) if !filter.is_match(&args.to_string()[]) => return, + Some(filter) if !args.to_string().contains(&filter[]) => return, _ => {} } @@ -435,8 +431,8 @@ fn init() { DIRECTIVES = ptr::null(); if !FILTER.is_null() { - let _filter: Box = mem::transmute(FILTER); - FILTER = ptr::null(); + let _filter: Box = mem::transmute(FILTER); + FILTER = 0 as *const _; } }); } diff --git a/src/libregex/compile.rs b/src/libregex/compile.rs deleted file mode 100644 index d29a7a425c1..00000000000 --- a/src/libregex/compile.rs +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// Enable this to squash warnings due to exporting pieces of the representation -// for use with the regex! macro. See lib.rs for explanation. - -pub use self::Inst::*; - -use std::cmp; -use std::iter::repeat; -use parse; -use parse::{ - Flags, FLAG_EMPTY, - Nothing, Literal, Dot, AstClass, Begin, End, WordBoundary, Capture, Cat, Alt, - Rep, - ZeroOne, ZeroMore, OneMore, -}; - -type InstIdx = uint; - -#[derive(Show, Clone)] -pub enum Inst { - // When a Match instruction is executed, the current thread is successful. - Match, - - // The OneChar instruction matches a literal character. - // The flags indicate whether to do a case insensitive match. - OneChar(char, Flags), - - // The CharClass instruction tries to match one input character against - // the range of characters given. - // The flags indicate whether to do a case insensitive match and whether - // the character class is negated or not. - CharClass(Vec<(char, char)>, Flags), - - // Matches any character except new lines. - // The flags indicate whether to include the '\n' character. - Any(Flags), - - // Matches the beginning of the string, consumes no characters. - // The flags indicate whether it matches if the preceding character - // is a new line. - EmptyBegin(Flags), - - // Matches the end of the string, consumes no characters. - // The flags indicate whether it matches if the proceeding character - // is a new line. - EmptyEnd(Flags), - - // Matches a word boundary (\w on one side and \W \A or \z on the other), - // and consumes no character. - // The flags indicate whether this matches a word boundary or something - // that isn't a word boundary. - EmptyWordBoundary(Flags), - - // Saves the current position in the input string to the Nth save slot. - Save(uint), - - // Jumps to the instruction at the index given. - Jump(InstIdx), - - // Jumps to the instruction at the first index given. If that leads to - // a panic state, then the instruction at the second index given is - // tried. - Split(InstIdx, InstIdx), -} - -/// Program represents a compiled regular expression. Once an expression is -/// compiled, its representation is immutable and will never change. -/// -/// All of the data in a compiled expression is wrapped in "MaybeStatic" or -/// "MaybeOwned" types so that a `Program` can be represented as static data. -/// (This makes it convenient and efficient for use with the `regex!` macro.) -#[derive(Clone)] -pub struct Program { - /// A sequence of instructions. - pub insts: Vec, - /// If the regular expression requires a literal prefix in order to have a - /// match, that prefix is stored here. (It's used in the VM to implement - /// an optimization.) - pub prefix: String, -} - -impl Program { - /// Compiles a Regex given its AST. - pub fn new(ast: parse::Ast) -> (Program, Vec>) { - let mut c = Compiler { - insts: Vec::with_capacity(100), - names: Vec::with_capacity(10), - }; - - c.insts.push(Save(0)); - c.compile(ast); - c.insts.push(Save(1)); - c.insts.push(Match); - - // Try to discover a literal string prefix. - // This is a bit hacky since we have to skip over the initial - // 'Save' instruction. - let mut pre = String::with_capacity(5); - for inst in c.insts[1..].iter() { - match *inst { - OneChar(c, FLAG_EMPTY) => pre.push(c), - _ => break - } - } - - let Compiler { insts, names } = c; - let prog = Program { - insts: insts, - prefix: pre, - }; - (prog, names) - } - - /// Returns the total number of capture groups in the regular expression. - /// This includes the zeroth capture. - pub fn num_captures(&self) -> uint { - let mut n = 0; - for inst in self.insts.iter() { - match *inst { - Save(c) => n = cmp::max(n, c+1), - _ => {} - } - } - // There's exactly 2 Save slots for every capture. - n / 2 - } -} - -struct Compiler<'r> { - insts: Vec, - names: Vec>, -} - -// The compiler implemented here is extremely simple. Most of the complexity -// in this crate is in the parser or the VM. -// The only tricky thing here is patching jump/split instructions to point to -// the right instruction. -impl<'r> Compiler<'r> { - fn compile(&mut self, ast: parse::Ast) { - match ast { - Nothing => {}, - Literal(c, flags) => self.push(OneChar(c, flags)), - Dot(nl) => self.push(Any(nl)), - AstClass(ranges, flags) => - self.push(CharClass(ranges, flags)), - Begin(flags) => self.push(EmptyBegin(flags)), - End(flags) => self.push(EmptyEnd(flags)), - WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), - Capture(cap, name, x) => { - let len = self.names.len(); - if cap >= len { - self.names.extend(repeat(None).take(10 + cap - len)) - } - self.names[cap] = name; - - self.push(Save(2 * cap)); - self.compile(*x); - self.push(Save(2 * cap + 1)); - } - Cat(xs) => { - for x in xs.into_iter() { - self.compile(x) - } - } - Alt(x, y) => { - let split = self.empty_split(); // push: split 0, 0 - let j1 = self.insts.len(); - self.compile(*x); // push: insts for x - let jmp = self.empty_jump(); // push: jmp 0 - let j2 = self.insts.len(); - self.compile(*y); // push: insts for y - let j3 = self.insts.len(); - - self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 - self.set_jump(jmp, j3); // jmp 0 -> jmp j3 - } - Rep(x, ZeroOne, g) => { - let split = self.empty_split(); - let j1 = self.insts.len(); - self.compile(*x); - let j2 = self.insts.len(); - - if g.is_greedy() { - self.set_split(split, j1, j2); - } else { - self.set_split(split, j2, j1); - } - } - Rep(x, ZeroMore, g) => { - let j1 = self.insts.len(); - let split = self.empty_split(); - let j2 = self.insts.len(); - self.compile(*x); - let jmp = self.empty_jump(); - let j3 = self.insts.len(); - - self.set_jump(jmp, j1); - if g.is_greedy() { - self.set_split(split, j2, j3); - } else { - self.set_split(split, j3, j2); - } - } - Rep(x, OneMore, g) => { - let j1 = self.insts.len(); - self.compile(*x); - let split = self.empty_split(); - let j2 = self.insts.len(); - - if g.is_greedy() { - self.set_split(split, j1, j2); - } else { - self.set_split(split, j2, j1); - } - } - } - } - - /// Appends the given instruction to the program. - #[inline] - fn push(&mut self, x: Inst) { - self.insts.push(x) - } - - /// Appends an *empty* `Split` instruction to the program and returns - /// the index of that instruction. (The index can then be used to "patch" - /// the actual locations of the split in later.) - #[inline] - fn empty_split(&mut self) -> InstIdx { - self.insts.push(Split(0, 0)); - self.insts.len() - 1 - } - - /// Sets the left and right locations of a `Split` instruction at index - /// `i` to `pc1` and `pc2`, respectively. - /// If the instruction at index `i` isn't a `Split` instruction, then - /// `panic!` is called. - #[inline] - fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) { - let split = &mut self.insts[i]; - match *split { - Split(_, _) => *split = Split(pc1, pc2), - _ => panic!("BUG: Invalid split index."), - } - } - - /// Appends an *empty* `Jump` instruction to the program and returns the - /// index of that instruction. - #[inline] - fn empty_jump(&mut self) -> InstIdx { - self.insts.push(Jump(0)); - self.insts.len() - 1 - } - - /// Sets the location of a `Jump` instruction at index `i` to `pc`. - /// If the instruction at index `i` isn't a `Jump` instruction, then - /// `panic!` is called. - #[inline] - fn set_jump(&mut self, i: InstIdx, pc: InstIdx) { - let jmp = &mut self.insts[i]; - match *jmp { - Jump(_) => *jmp = Jump(pc), - _ => panic!("BUG: Invalid jump index."), - } - } -} diff --git a/src/libregex/lib.rs b/src/libregex/lib.rs deleted file mode 100644 index 002b74cf1ef..00000000000 --- a/src/libregex/lib.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. -// -// ignore-lexer-test FIXME #15679 - -//! Regular expressions implemented in Rust -//! -//! For official documentation, see the rust-lang/regex crate -#![crate_name = "regex"] -#![crate_type = "rlib"] -#![crate_type = "dylib"] -#![unstable = "use the crates.io `regex` library instead"] -#![staged_api] -#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", - html_favicon_url = "http://www.rust-lang.org/favicon.ico", - html_root_url = "http://doc.rust-lang.org/nightly/", - html_playground_url = "http://play.rust-lang.org/")] - -#![allow(unknown_features)] -#![allow(unstable)] -#![feature(slicing_syntax)] -#![feature(box_syntax)] -#![allow(unknown_features)] #![feature(int_uint)] -#![deny(missing_docs)] - -#[cfg(test)] -extern crate "test" as stdtest; -#[cfg(test)] -extern crate rand; - -// During tests, this links with the `regex` crate so that the `regex!` macro -// can be tested. -#[cfg(test)] -extern crate regex; - -// Unicode tables for character classes are defined in libunicode -extern crate unicode; - -pub use parse::Error; -pub use re::{Regex, Captures, SubCaptures, SubCapturesPos}; -pub use re::{FindCaptures, FindMatches}; -pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; -pub use re::{quote, is_match}; - -mod compile; -mod parse; -mod re; -mod vm; - -#[cfg(test)] -mod test; - -/// The `native` module exists to support the `regex!` macro. Do not use. -#[doc(hidden)] -pub mod native { - // Exporting this stuff is bad form, but it's necessary for two reasons. - // Firstly, the `regex!` syntax extension is in a different crate and - // requires access to the representation of a regex (particularly the - // instruction set) in order to compile to native Rust. This could be - // mitigated if `regex!` was defined in the same crate, but this has - // undesirable consequences (such as requiring a dependency on - // `libsyntax`). - // - // Secondly, the code generated by `regex!` must *also* be able - // to access various functions in this crate to reduce code duplication - // and to provide a value with precisely the same `Regex` type in this - // crate. This, AFAIK, is impossible to mitigate. - // - // On the bright side, `rustdoc` lets us hide this from the public API - // documentation. - pub use compile::{ - Program, - OneChar, CharClass, Any, Save, Jump, Split, - Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, - }; - pub use parse::{ - FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, - FLAG_SWAP_GREED, FLAG_NEGATED, - }; - pub use re::{Dynamic, ExDynamic, Native, ExNative}; - pub use vm::{ - MatchKind, Exists, Location, Submatches, - StepState, StepMatchEarlyReturn, StepMatch, StepContinue, - CharReader, find_prefix, - }; -} diff --git a/src/libregex/parse.rs b/src/libregex/parse.rs deleted file mode 100644 index c2186a0ec24..00000000000 --- a/src/libregex/parse.rs +++ /dev/null @@ -1,1087 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -pub use self::Ast::*; -pub use self::Repeater::*; -pub use self::Greed::*; -use self::BuildAst::*; - -use std::char; -use std::cmp; -use std::fmt; -use std::iter; -use std::num; - -/// Static data containing Unicode ranges for general categories and scripts. -use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; - -/// The maximum number of repetitions allowed with the `{n,m}` syntax. -static MAX_REPEAT: uint = 1000; - -/// Error corresponds to something that can go wrong while parsing -/// a regular expression. -/// -/// (Once an expression is compiled, it is not possible to produce an error -/// via searching, splitting or replacing.) -#[derive(Show)] -pub struct Error { - /// The *approximate* character index of where the error occurred. - pub pos: uint, - /// A message describing the error. - pub msg: String, -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Regex syntax error near position {}: {:?}", - self.pos, self.msg) - } -} - -/// Represents the abstract syntax of a regular expression. -/// It is showable so that error messages resulting from a bug can provide -/// useful information. -/// It is cloneable so that expressions can be repeated for the counted -/// repetition feature. (No other copying is done.) -/// -/// Note that this representation prevents one from reproducing the regex as -/// it was typed. (But it could be used to reproduce an equivalent regex.) -#[derive(Show, Clone)] -pub enum Ast { - Nothing, - Literal(char, Flags), - Dot(Flags), - AstClass(Vec<(char, char)>, Flags), - Begin(Flags), - End(Flags), - WordBoundary(Flags), - Capture(uint, Option, Box), - // Represent concatenation as a flat vector to avoid blowing the - // stack in the compiler. - Cat(Vec), - Alt(Box, Box), - Rep(Box, Repeater, Greed), -} - -#[derive(Show, PartialEq, Clone)] -pub enum Repeater { - ZeroOne, - ZeroMore, - OneMore, -} - -#[derive(Copy, Show, Clone)] -pub enum Greed { - Greedy, - Ungreedy, -} - -impl Greed { - pub fn is_greedy(&self) -> bool { - match *self { - Greedy => true, - _ => false, - } - } - - fn swap(self, swapped: bool) -> Greed { - if !swapped { return self } - match self { - Greedy => Ungreedy, - Ungreedy => Greedy, - } - } -} - -/// BuildAst is a regrettable type that represents intermediate state for -/// constructing an abstract syntax tree. Its central purpose is to facilitate -/// parsing groups and alternations while also maintaining a stack of flag -/// state. -#[derive(Show)] -enum BuildAst { - Expr(Ast), - Paren(Flags, uint, String), // '(' - Bar, // '|' -} - -impl BuildAst { - fn paren(&self) -> bool { - match *self { - Paren(_, _, _) => true, - _ => false, - } - } - - fn flags(&self) -> Flags { - match *self { - Paren(flags, _, _) => flags, - _ => panic!("Cannot get flags from {:?}", self), - } - } - - fn capture(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, c, _) => Some(c), - _ => panic!("Cannot get capture group from {:?}", self), - } - } - - fn capture_name(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, _, ref name) => { - if name.len() == 0 { - None - } else { - Some(name.clone()) - } - } - _ => panic!("Cannot get capture name from {:?}", self), - } - } - - fn bar(&self) -> bool { - match *self { - Bar => true, - _ => false, - } - } - - fn unwrap(self) -> Result { - match self { - Expr(x) => Ok(x), - _ => panic!("Tried to unwrap non-AST item: {:?}", self), - } - } -} - -/// Flags represents all options that can be twiddled by a user in an -/// expression. -pub type Flags = u8; - -pub const FLAG_EMPTY: u8 = 0; -pub const FLAG_NOCASE: u8 = 1 << 0; // i -pub const FLAG_MULTI: u8 = 1 << 1; // m -pub const FLAG_DOTNL: u8 = 1 << 2; // s -pub const FLAG_SWAP_GREED: u8 = 1 << 3; // U -pub const FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary - -struct Parser<'a> { - // The input, parsed only as a sequence of UTF8 code points. - chars: Vec, - // The index of the current character in the input. - chari: uint, - // The intermediate state representing the AST. - stack: Vec, - // The current set of flags. - flags: Flags, - // The total number of capture groups. - // Incremented each time an opening left paren is seen (assuming it is - // opening a capture group). - caps: uint, - // A set of all capture group names used only to detect duplicates. - names: Vec, -} - -pub fn parse(s: &str) -> Result { - Parser { - chars: s.chars().collect(), - chari: 0, - stack: vec!(), - flags: FLAG_EMPTY, - caps: 0, - names: vec!(), - }.parse() -} - -impl<'a> Parser<'a> { - fn parse(&mut self) -> Result { - if self.chars.len() == 0 { - return Ok(Nothing); - } - loop { - let c = self.cur(); - match c { - '?' | '*' | '+' => try!(self.push_repeater(c)), - '\\' => { - let ast = try!(self.parse_escape()); - self.push(ast) - } - '{' => try!(self.parse_counted()), - '[' => match self.try_parse_ascii() { - None => try!(self.parse_class()), - Some(class) => self.push(class), - }, - '(' => { - if self.peek_is(1, '?') { - try!(self.expect('?')); - try!(self.parse_group_opts()) - } else { - self.caps += 1; - self.stack.push(Paren(self.flags, - self.caps, - "".to_string())) - } - } - ')' => { - let catfrom = try!( - self.pos_last(false, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - let altfrom = try!(self.pos_last(false, |x| x.paren())); - // Before we smush the alternates together and pop off the - // left paren, let's grab the old flags and see if we - // need a capture. - let (cap, cap_name, oldflags) = { - let paren = &self.stack[altfrom-1]; - (paren.capture(), paren.capture_name(), paren.flags()) - }; - try!(self.alternate(altfrom)); - self.flags = oldflags; - - // If this was a capture, pop what we just pushed in - // alternate and make it a capture. - if cap.is_some() { - let ast = try!(self.pop_ast()); - self.push(Capture(cap.unwrap(), cap_name, box ast)); - } - } - '|' => { - let catfrom = try!( - self.pos_last(true, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - self.stack.push(Bar); - } - _ => try!(self.push_literal(c)), - } - if !self.next_char() { - break - } - } - - // Try to improve error handling. At this point, there should be - // no remaining open parens. - if self.stack.iter().any(|x| x.paren()) { - return self.err("Unclosed parenthesis.") - } - let catfrom = try!(self.pos_last(true, |x| x.bar())); - try!(self.concat(catfrom)); - try!(self.alternate(0)); - - assert!(self.stack.len() == 1); - self.pop_ast() - } - - fn noteof(&mut self, expected: &str) -> Result<(), Error> { - match self.next_char() { - true => Ok(()), - false => { - self.err(&format!("Expected {:?} but got EOF.", - expected)[]) - } - } - } - - fn expect(&mut self, expected: char) -> Result<(), Error> { - match self.next_char() { - true if self.cur() == expected => Ok(()), - true => self.err(&format!("Expected '{:?}' but got '{:?}'.", - expected, self.cur())[]), - false => { - self.err(&format!("Expected '{:?}' but got EOF.", - expected)[]) - } - } - } - - fn next_char(&mut self) -> bool { - self.chari += 1; - self.chari < self.chars.len() - } - - fn pop_ast(&mut self) -> Result { - match self.stack.pop().unwrap().unwrap() { - Err(e) => Err(e), - Ok(ast) => Ok(ast), - } - } - - fn push(&mut self, ast: Ast) { - self.stack.push(Expr(ast)) - } - - fn push_repeater(&mut self, c: char) -> Result<(), Error> { - match self.stack.last() { - Some(&Expr(..)) => (), - // self.stack is empty, or the top item is not an Expr - _ => return self.err("A repeat operator must be preceded by a valid expression."), - } - let rep: Repeater = match c { - '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore, - _ => panic!("Not a valid repeater operator."), - }; - - match self.peek(1) { - Some('*') | Some('+') => - return self.err( - "Double repeat operators are not supported."), - _ => {}, - } - let ast = try!(self.pop_ast()); - match ast { - Begin(_) | End(_) | WordBoundary(_) => - return self.err( - "Repeat arguments cannot be empty width assertions."), - _ => {} - } - let greed = try!(self.get_next_greedy()); - self.push(Rep(box ast, rep, greed)); - Ok(()) - } - - fn push_literal(&mut self, c: char) -> Result<(), Error> { - let flags = self.flags; - match c { - '.' => { - self.push(Dot(flags)) - } - '^' => { - self.push(Begin(flags)) - } - '$' => { - self.push(End(flags)) - } - _ => { - self.push(Literal(c, flags)) - } - } - Ok(()) - } - - // Parses all forms of character classes. - // Assumes that '[' is the current character. - fn parse_class(&mut self) -> Result<(), Error> { - let negated = - if self.peek_is(1, '^') { - try!(self.expect('^')); - FLAG_NEGATED - } else { - FLAG_EMPTY - }; - let mut ranges: Vec<(char, char)> = vec!(); - let mut alts: Vec = vec!(); - - while self.peek_is(1, '-') { - try!(self.expect('-')); - ranges.push(('-', '-')) - } - loop { - try!(self.noteof("a closing ']' or a non-empty character class)")); - let mut c = self.cur(); - match c { - '[' => - match self.try_parse_ascii() { - Some(AstClass(asciis, flags)) => { - alts.push(AstClass(asciis, flags ^ negated)); - continue - } - Some(ast) => - panic!("Expected Class AST but got '{:?}'", ast), - // Just drop down and try to add as a regular character. - None => {}, - }, - '\\' => { - match try!(self.parse_escape()) { - AstClass(asciis, flags) => { - alts.push(AstClass(asciis, flags ^ negated)); - continue - } - Literal(c2, _) => c = c2, // process below - Begin(_) | End(_) | WordBoundary(_) => - return self.err( - "\\A, \\z, \\b and \\B are not valid escape \ - sequences inside a character class."), - ast => panic!("Unexpected AST item '{:?}'", ast), - } - } - ']' if ranges.len() > 0 || alts.len() > 0 => { - if ranges.len() > 0 { - let flags = negated | (self.flags & FLAG_NOCASE); - let mut ast = AstClass(combine_ranges(ranges), flags); - for alt in alts.into_iter() { - ast = Alt(box alt, box ast) - } - self.push(ast); - } else if alts.len() > 0 { - let mut ast = alts.pop().unwrap(); - for alt in alts.into_iter() { - ast = Alt(box alt, box ast) - } - self.push(ast); - } - return Ok(()) - } - _ => {} - } - - if self.peek_is(1, '-') && !self.peek_is(2, ']') { - try!(self.expect('-')); - // The regex can't end here. - try!(self.noteof("not a ']'")); - // End the range with a single character or character escape. - let mut c2 = self.cur(); - if c2 == '\\' { - match try!(self.parse_escape()) { - Literal(c3, _) => c2 = c3, // allow literal escapes below - ast => - return self.err(&format!("Expected a literal, but got {:?}.", - ast)[]), - } - } - if c2 < c { - return self.err(&format!("Invalid character class \ - range '{}-{}'", - c, - c2)[]) - } - ranges.push((c, self.cur())) - } else { - ranges.push((c, c)) - } - } - } - - // Tries to parse an ASCII character class of the form [:name:]. - // If successful, returns an AST character class corresponding to name - // and moves the parser to the final ']' character. - // If unsuccessful, no state is changed and None is returned. - // Assumes that '[' is the current character. - fn try_parse_ascii(&mut self) -> Option { - if !self.peek_is(1, ':') { - return None - } - let closer = - match self.pos(']') { - Some(i) => i, - None => return None, - }; - if self.chars[closer-1] != ':' { - return None - } - if closer - self.chari <= 3 { - return None - } - let mut name_start = self.chari + 2; - let negated = - if self.peek_is(2, '^') { - name_start += 1; - FLAG_NEGATED - } else { - FLAG_EMPTY - }; - let name = self.slice(name_start, closer - 1); - match find_class(ASCII_CLASSES, &name[]) { - None => None, - Some(ranges) => { - self.chari = closer; - let flags = negated | (self.flags & FLAG_NOCASE); - Some(AstClass(combine_ranges(ranges), flags)) - } - } - } - - // Parses counted repetition. Supports: - // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}? - // Assumes that '{' is the current character. - // Returns either an error or moves the parser to the final '}' character. - // (Or the '?' character if not greedy.) - fn parse_counted(&mut self) -> Result<(), Error> { - // Scan until the closing '}' and grab the stuff in {}. - let start = self.chari; - let closer = - match self.pos('}') { - Some(i) => i, - None => { - return self.err(&format!("No closing brace for counted \ - repetition starting at position \ - {:?}.", - start)[]) - } - }; - self.chari = closer; - let greed = try!(self.get_next_greedy()); - let inner = self.chars[start+1..closer].iter().cloned() - .collect::(); - - // Parse the min and max values from the regex. - let (mut min, mut max): (uint, Option); - if !inner.contains(",") { - min = try!(self.parse_uint(&inner[])); - max = Some(min); - } else { - let pieces: Vec<&str> = inner.splitn(1, ',').collect(); - let (smin, smax) = (pieces[0], pieces[1]); - if smin.len() == 0 { - return self.err("Max repetitions cannot be specified \ - without min repetitions.") - } - min = try!(self.parse_uint(smin)); - max = - if smax.len() == 0 { - None - } else { - Some(try!(self.parse_uint(smax))) - }; - } - - // Do some bounds checking and make sure max >= min. - if min > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - min, MAX_REPEAT)[]); - } - if max.is_some() { - let m = max.unwrap(); - if m > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - m, MAX_REPEAT)[]); - } - if m < min { - return self.err(&format!( - "Max repetitions ({}) cannot be smaller than min \ - repetitions ({}).", m, min)[]); - } - } - - // Now manipulate the AST be repeating elements. - if max.is_none() { - // Require N copies of what's on the stack and then repeat it. - let ast = try!(self.pop_ast()); - for _ in iter::range(0, min) { - self.push(ast.clone()) - } - self.push(Rep(box ast, ZeroMore, greed)); - } else { - // Require N copies of what's on the stack and then repeat it - // up to M times optionally. - let ast = try!(self.pop_ast()); - for _ in iter::range(0, min) { - self.push(ast.clone()) - } - if max.is_some() { - for _ in iter::range(min, max.unwrap()) { - self.push(Rep(box ast.clone(), ZeroOne, greed)) - } - } - // It's possible that we popped something off the stack but - // never put anything back on it. To keep things simple, add - // a no-op expression. - if min == 0 && (max.is_none() || max == Some(0)) { - self.push(Nothing) - } - } - Ok(()) - } - - // Parses all escape sequences. - // Assumes that '\' is the current character. - fn parse_escape(&mut self) -> Result { - try!(self.noteof("an escape sequence following a '\\'")); - - let c = self.cur(); - if is_punct(c) { - return Ok(Literal(c, FLAG_EMPTY)) - } - match c { - 'a' => Ok(Literal('\x07', FLAG_EMPTY)), - 'f' => Ok(Literal('\x0C', FLAG_EMPTY)), - 't' => Ok(Literal('\t', FLAG_EMPTY)), - 'n' => Ok(Literal('\n', FLAG_EMPTY)), - 'r' => Ok(Literal('\r', FLAG_EMPTY)), - 'v' => Ok(Literal('\x0B', FLAG_EMPTY)), - 'A' => Ok(Begin(FLAG_EMPTY)), - 'z' => Ok(End(FLAG_EMPTY)), - 'b' => Ok(WordBoundary(FLAG_EMPTY)), - 'B' => Ok(WordBoundary(FLAG_NEGATED)), - '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())), - 'x' => Ok(try!(self.parse_hex())), - 'p' | 'P' => Ok(try!(self.parse_unicode_name())), - 'd' | 'D' | 's' | 'S' | 'w' | 'W' => { - let ranges = perl_unicode_class(c); - let mut flags = self.flags & FLAG_NOCASE; - if c.is_uppercase() { flags |= FLAG_NEGATED } - Ok(AstClass(ranges, flags)) - } - _ => { - self.err(&format!("Invalid escape sequence '\\\\{}'", c)[]) - } - } - } - - // Parses a Unicode character class name, either of the form \pF where - // F is a one letter Unicode class name or of the form \p{name} where - // name is the Unicode class name. - // Assumes that \p or \P has been read (and 'p' or 'P' is the current - // character). - fn parse_unicode_name(&mut self) -> Result { - let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY }; - let mut name: String; - if self.peek_is(1, '{') { - try!(self.expect('{')); - let closer = - match self.pos('}') { - Some(i) => i, - None => return self.err(&format!( - "Missing '}}' for unclosed '{{' at position {}", - self.chari)[]), - }; - if closer - self.chari + 1 == 0 { - return self.err("No Unicode class name found.") - } - name = self.slice(self.chari + 1, closer); - self.chari = closer; - } else { - if self.chari + 1 >= self.chars.len() { - return self.err("No single letter Unicode class name found.") - } - name = self.slice(self.chari + 1, self.chari + 2); - self.chari += 1; - } - match find_class(UNICODE_CLASSES, &name[]) { - None => { - return self.err(&format!("Could not find Unicode class '{}'", - name)[]) - } - Some(ranges) => { - Ok(AstClass(ranges, negated | (self.flags & FLAG_NOCASE))) - } - } - } - - // Parses an octal number, up to 3 digits. - // Assumes that \n has been read, where n is the first digit. - fn parse_octal(&mut self) -> Result { - let start = self.chari; - let mut end = start + 1; - let (d2, d3) = (self.peek(1), self.peek(2)); - if d2 >= Some('0') && d2 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - if d3 >= Some('0') && d3 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - } - } - let s = self.slice(start, end); - match num::from_str_radix::(&s[], 8) { - Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - None => { - self.err(&format!("Could not parse '{:?}' as octal number.", - s)[]) - } - } - } - - // Parse a hex number. Either exactly two digits or anything in {}. - // Assumes that \x has been read. - fn parse_hex(&mut self) -> Result { - if !self.peek_is(1, '{') { - try!(self.expect('{')); - return self.parse_hex_two() - } - let start = self.chari + 2; - let closer = - match self.pos('}') { - None => { - return self.err(&format!("Missing '}}' for unclosed \ - '{{' at position {}", - start)[]) - } - Some(i) => i, - }; - self.chari = closer; - self.parse_hex_digits(&self.slice(start, closer)[]) - } - - // Parses a two-digit hex number. - // Assumes that \xn has been read, where n is the first digit and is the - // current character. - // After return, parser will point at the second digit. - fn parse_hex_two(&mut self) -> Result { - let (start, end) = (self.chari, self.chari + 2); - let bad = self.slice(start - 2, self.chars.len()); - try!(self.noteof(format!("Invalid hex escape sequence '{}'", - bad).as_slice())); - self.parse_hex_digits(self.slice(start, end).as_slice()) - } - - // Parses `s` as a hexadecimal number. - fn parse_hex_digits(&self, s: &str) -> Result { - match num::from_str_radix::(s, 16) { - Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - None => { - self.err(&format!("Could not parse '{}' as hex number.", s)[]) - } - } - } - - // Parses a named capture. - // Assumes that '(?P<' has been consumed and that the current character - // is '<'. - // When done, parser will be at the closing '>' character. - fn parse_named_capture(&mut self) -> Result<(), Error> { - try!(self.noteof("a capture name")); - let closer = - match self.pos('>') { - Some(i) => i, - None => return self.err("Capture name must end with '>'."), - }; - if closer - self.chari == 0 { - return self.err("Capture names must have at least 1 character.") - } - let name = self.slice(self.chari, closer); - if !name.chars().all(is_valid_cap) { - return self.err( - "Capture names can only have underscores, letters and digits.") - } - if self.names.contains(&name) { - return self.err(&format!("Duplicate capture group name '{}'.", - name)[]) - } - self.names.push(name.clone()); - self.chari = closer; - self.caps += 1; - self.stack.push(Paren(self.flags, self.caps, name)); - Ok(()) - } - - // Parses non-capture groups and options. - // Assumes that '(?' has already been consumed and '?' is the current - // character. - fn parse_group_opts(&mut self) -> Result<(), Error> { - if self.peek_is(1, 'P') && self.peek_is(2, '<') { - try!(self.expect('P')); - try!(self.expect('<')); - return self.parse_named_capture() - } - let start = self.chari; - let mut flags = self.flags; - let mut sign = 1i; - let mut saw_flag = false; - loop { - try!(self.noteof( - "expected non-empty set of flags or closing ')'")); - match self.cur() { - 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true}, - 'm' => { flags = flags | FLAG_MULTI; saw_flag = true}, - 's' => { flags = flags | FLAG_DOTNL; saw_flag = true}, - 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true}, - '-' => { - if sign < 0 { - return self.err(&format!( - "Cannot negate flags twice in '{}'.", - self.slice(start, self.chari + 1))[]) - } - sign = -1; - saw_flag = false; - flags = flags ^ flags; - } - ':' | ')' => { - if sign < 0 { - if !saw_flag { - return self.err(&format!( - "A valid flag does not follow negation in '{}'", - self.slice(start, self.chari + 1))[]) - } - flags = flags ^ flags; - } - if self.cur() == ':' { - // Save the old flags with the opening paren. - self.stack.push(Paren(self.flags, 0, "".to_string())); - } - self.flags = flags; - return Ok(()) - } - _ => return self.err(&format!( - "Unrecognized flag '{}'.", self.cur())[]), - } - } - } - - // Peeks at the next character and returns whether it's ungreedy or not. - // If it is, then the next character is consumed. - fn get_next_greedy(&mut self) -> Result { - Ok(if self.peek_is(1, '?') { - try!(self.expect('?')); - Ungreedy - } else { - Greedy - }.swap(self.flags & FLAG_SWAP_GREED > 0)) - } - - // Searches the stack (starting at the top) until it finds an expression - // for which `pred` returns true. The index of that expression in the - // stack is returned. - // If there's no match, then one of two things happens depending on the - // values of `allow_start`. When it's true, then `0` will be returned. - // Otherwise, an error will be returned. - // Generally, `allow_start` is only true when you're *not* expecting an - // opening parenthesis. - fn pos_last

(&self, allow_start: bool, pred: P) -> Result where - P: FnMut(&BuildAst) -> bool, - { - let from = match self.stack.iter().rev().position(pred) { - Some(i) => i, - None => { - if allow_start { - self.stack.len() - } else { - return self.err("No matching opening parenthesis.") - } - } - }; - // Adjust index since 'from' is for the reversed stack. - // Also, don't include the '(' or '|'. - Ok(self.stack.len() - from) - } - - // concat starts at `from` in the parser's stack and concatenates all - // expressions up to the top of the stack. The resulting concatenation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis, - // a '|' (alternation) or the start of the entire expression. - fn concat(&mut self, from: uint) -> Result<(), Error> { - let ast = try!(self.build_from(from, concat_flatten)); - self.push(ast); - Ok(()) - } - - // concat starts at `from` in the parser's stack and alternates all - // expressions up to the top of the stack. The resulting alternation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis - // or the start of the entire expression. - // This will also drop any opening parens or alternation bars found in - // the intermediate AST. - fn alternate(&mut self, mut from: uint) -> Result<(), Error> { - // Unlike in the concatenation case, we want 'build_from' to continue - // all the way to the opening left paren (so it will be popped off and - // thrown away). But be careful with overflow---we can't count on the - // open paren to be there. - if from > 0 { from = from - 1} - let ast = try!(self.build_from(from, |l,r| Alt(box l, box r))); - self.push(ast); - Ok(()) - } - - // build_from combines all AST elements starting at 'from' in the - // parser's stack using 'mk' to combine them. If any such element is not an - // AST then it is popped off the stack and ignored. - fn build_from(&mut self, from: uint, mut mk: F) -> Result where - F: FnMut(Ast, Ast) -> Ast, - { - if from >= self.stack.len() { - return self.err("Empty group or alternate not allowed.") - } - - let mut combined = try!(self.pop_ast()); - let mut i = self.stack.len(); - while i > from { - i = i - 1; - match self.stack.pop().unwrap() { - Expr(x) => combined = mk(x, combined), - _ => {}, - } - } - Ok(combined) - } - - fn parse_uint(&self, s: &str) -> Result { - match s.parse::() { - Some(i) => Ok(i), - None => { - self.err(&format!("Expected an unsigned integer but got '{}'.", - s)[]) - } - } - } - - fn char_from_u32(&self, n: u32) -> Result { - match char::from_u32(n) { - Some(c) => Ok(c), - None => { - self.err(&format!("Could not decode '{}' to unicode \ - character.", n)[]) - } - } - } - - fn pos(&self, c: char) -> Option { - self.chars.iter() - .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i) - } - - fn err(&self, msg: &str) -> Result { - Err(Error { - pos: self.chari, - msg: msg.to_string(), - }) - } - - fn peek(&self, offset: uint) -> Option { - if self.chari + offset >= self.chars.len() { - return None - } - Some(self.chars[self.chari + offset]) - } - - fn peek_is(&self, offset: uint, is: char) -> bool { - self.peek(offset) == Some(is) - } - - fn cur(&self) -> char { - self.chars[self.chari] - } - - fn slice(&self, start: uint, end: uint) -> String { - self.chars[start..end].iter().cloned().collect() - } -} - -// Given an unordered collection of character ranges, combine_ranges returns -// an ordered sequence of character ranges where no two ranges overlap. They -// are ordered from least to greatest (using start position). -fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> { - // Returns true iff the two character classes overlap or share a boundary. - // e.g., ('a', 'g') and ('h', 'm') would return true. - fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool { - cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1 - } - - // This is currently O(n^2), but I think with sufficient cleverness, - // it can be reduced to O(n) **if necessary**. - let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len()); - for (us, ue) in unordered.into_iter() { - let (mut us, mut ue) = (us, ue); - assert!(us <= ue); - let mut which: Option = None; - for (i, &(os, oe)) in ordered.iter().enumerate() { - if should_merge((us, ue), (os, oe)) { - us = cmp::min(us, os); - ue = cmp::max(ue, oe); - which = Some(i); - break - } - } - match which { - None => ordered.push((us, ue)), - Some(i) => ordered[i] = (us, ue), - } - } - ordered.sort(); - ordered -} - -// Constructs a Unicode friendly Perl character class from \d, \s or \w -// (or any of their negated forms). Note that this does not handle negation. -fn perl_unicode_class(which: char) -> Vec<(char, char)> { - match which.to_lowercase() { - 'd' => PERLD.to_vec(), - 's' => PERLS.to_vec(), - 'w' => PERLW.to_vec(), - _ => unreachable!(), - } -} - -// Returns a concatenation of two expressions. This also guarantees that a -// `Cat` expression will never be a direct child of another `Cat` expression. -fn concat_flatten(x: Ast, y: Ast) -> Ast { - match (x, y) { - (Cat(mut xs), Cat(ys)) => { xs.extend(ys.into_iter()); Cat(xs) } - (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) } - (ast, Cat(mut xs)) => { xs.insert(0, ast); Cat(xs) } - (ast1, ast2) => Cat(vec!(ast1, ast2)), - } -} - -pub fn is_punct(c: char) -> bool { - match c { - '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | - '[' | ']' | '{' | '}' | '^' | '$' => true, - _ => false, - } -} - -fn is_valid_cap(c: char) -> bool { - c == '_' || (c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') -} - -fn find_class(classes: NamedClasses, name: &str) -> Option> { - match classes.binary_search_by(|&(s, _)| s.cmp(name)) { - Ok(i) => Some(classes[i].1.to_vec()), - Err(_) => None, - } -} - -type Class = &'static [(char, char)]; -type NamedClasses = &'static [(&'static str, &'static Class)]; - -static ASCII_CLASSES: NamedClasses = &[ - // Classes must be in alphabetical order so that bsearch works. - // [:alnum:] alphanumeric (== [0-9A-Za-z]) - // [:alpha:] alphabetic (== [A-Za-z]) - // [:ascii:] ASCII (== [\x00-\x7F]) - // [:blank:] blank (== [\t ]) - // [:cntrl:] control (== [\x00-\x1F\x7F]) - // [:digit:] digits (== [0-9]) - // [:graph:] graphical (== [!-~]) - // [:lower:] lower case (== [a-z]) - // [:print:] printable (== [ -~] == [ [:graph:]]) - // [:punct:] punctuation (== [!-/:-@[-`{-~]) - // [:space:] whitespace (== [\t\n\v\f\r ]) - // [:upper:] upper case (== [A-Z]) - // [:word:] word characters (== [0-9A-Za-z_]) - // [:xdigit:] hex digit (== [0-9A-Fa-f]) - // Taken from: http://golang.org/pkg/regex/syntax/ - ("alnum", &ALNUM), - ("alpha", &ALPHA), - ("ascii", &ASCII), - ("blank", &BLANK), - ("cntrl", &CNTRL), - ("digit", &DIGIT), - ("graph", &GRAPH), - ("lower", &LOWER), - ("print", &PRINT), - ("punct", &PUNCT), - ("space", &SPACE), - ("upper", &UPPER), - ("word", &WORD), - ("xdigit", &XDIGIT), -]; - -static ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; -static ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; -static ASCII: Class = &[('\x00', '\x7F')]; -static BLANK: Class = &[(' ', ' '), ('\t', '\t')]; -static CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; -static DIGIT: Class = &[('0', '9')]; -static GRAPH: Class = &[('!', '~')]; -static LOWER: Class = &[('a', 'z')]; -static PRINT: Class = &[(' ', '~')]; -static PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; -static SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), - ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; -static UPPER: Class = &[('A', 'Z')]; -static WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]; -static XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; diff --git a/src/libregex/re.rs b/src/libregex/re.rs deleted file mode 100644 index 1b68ad500ca..00000000000 --- a/src/libregex/re.rs +++ /dev/null @@ -1,684 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -pub use self::NamesIter::*; -pub use self::Regex::*; - -use std::borrow::IntoCow; -use std::collections::HashMap; -use std::fmt; -use std::string::CowString; - -use compile::Program; -use parse; -use vm; -use vm::{CaptureLocs, MatchKind, Exists, Location, Submatches}; - -/// Escapes all regular expression meta characters in `text`. -/// -/// The string returned may be safely used as a literal in a regular -/// expression. -pub fn quote(text: &str) -> String { - let mut quoted = String::with_capacity(text.len()); - for c in text.chars() { - if parse::is_punct(c) { - quoted.push('\\') - } - quoted.push(c); - } - quoted -} - -/// Tests if the given regular expression matches somewhere in the text given. -/// -/// If there was a problem compiling the regular expression, an error is -/// returned. -/// -/// To find submatches, split or replace text, you'll need to compile an -/// expression first. -/// -/// Note that you should prefer the `regex!` macro when possible. For example, -/// `regex!("...").is_match("...")`. -pub fn is_match(regex: &str, text: &str) -> Result { - Regex::new(regex).map(|r| r.is_match(text)) -} - -/// A compiled regular expression -#[derive(Clone)] -pub enum Regex { - // The representation of `Regex` is exported to support the `regex!` - // syntax extension. Do not rely on it. - // - // See the comments for the `program` module in `lib.rs` for a more - // detailed explanation for what `regex!` requires. - #[doc(hidden)] - Dynamic(ExDynamic), - #[doc(hidden)] - Native(ExNative), -} - -#[derive(Clone)] -#[doc(hidden)] -pub struct ExDynamic { - original: String, - names: Vec>, - #[doc(hidden)] - pub prog: Program -} - -#[doc(hidden)] -#[derive(Copy)] -pub struct ExNative { - #[doc(hidden)] - pub original: &'static str, - #[doc(hidden)] - pub names: &'static &'static [Option<&'static str>], - #[doc(hidden)] - pub prog: fn(MatchKind, &str, uint, uint) -> Vec> -} - -impl Clone for ExNative { - fn clone(&self) -> ExNative { - *self - } -} - -impl fmt::Display for Regex { - /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(self.as_str(), f) - } -} - -impl Regex { - /// Compiles a dynamic regular expression. Once compiled, it can be - /// used repeatedly to search, split or replace text in a string. - /// - /// When possible, you should prefer the `regex!` macro since it is - /// safer and always faster. - /// - /// If an invalid expression is given, then an error is returned. - pub fn new(re: &str) -> Result { - let ast = try!(parse::parse(re)); - let (prog, names) = Program::new(ast); - Ok(Dynamic(ExDynamic { - original: re.to_string(), - names: names, - prog: prog, - })) - } - - /// Returns true if and only if the regex matches the string given. - pub fn is_match(&self, text: &str) -> bool { - has_match(&exec(self, Exists, text)) - } - - /// Returns the start and end byte range of the leftmost-first match in - /// `text`. If no match exists, then `None` is returned. - pub fn find(&self, text: &str) -> Option<(uint, uint)> { - let caps = exec(self, Location, text); - if has_match(&caps) { - Some((caps[0].unwrap(), caps[1].unwrap())) - } else { - None - } - } - - /// Returns an iterator for each successive non-overlapping match in - /// `text`, returning the start and end byte indices with respect to - /// `text`. - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { - FindMatches { - re: self, - search: text, - last_end: 0, - last_match: None, - } - } - - /// Returns the capture groups corresponding to the leftmost-first - /// match in `text`. Capture group `0` always corresponds to the entire - /// match. If no match is found, then `None` is returned. - /// - /// You should only use `captures` if you need access to submatches. - /// Otherwise, `find` is faster for discovering the location of the overall - /// match. - pub fn captures<'t>(&self, text: &'t str) -> Option> { - let caps = exec(self, Submatches, text); - Captures::new(self, text, caps) - } - - /// Returns an iterator over all the non-overlapping capture groups matched - /// in `text`. This is operationally the same as `find_iter` (except it - /// yields information about submatches). - pub fn captures_iter<'r, 't>(&'r self, text: &'t str) - -> FindCaptures<'r, 't> { - FindCaptures { - re: self, - search: text, - last_match: None, - last_end: 0, - } - } - - /// Returns an iterator of substrings of `text` delimited by a match - /// of the regular expression. - /// Namely, each element of the iterator corresponds to text that *isn't* - /// matched by the regular expression. - /// - /// This method will *not* copy the text given. - pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { - RegexSplits { - finder: self.find_iter(text), - last: 0, - } - } - - /// Returns an iterator of at most `limit` substrings of `text` delimited - /// by a match of the regular expression. (A `limit` of `0` will return no - /// substrings.) - /// Namely, each element of the iterator corresponds to text that *isn't* - /// matched by the regular expression. - /// The remainder of the string that is not split will be the last element - /// in the iterator. - /// - /// This method will *not* copy the text given. - pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: uint) - -> RegexSplitsN<'r, 't> { - RegexSplitsN { - splits: self.split(text), - cur: 0, - limit: limit, - } - } - - /// Replaces the leftmost-first match with the replacement provided. - /// The replacement can be a regular string (where `$N` and `$name` are - /// expanded to match capture groups) or a function that takes the matches' - /// `Captures` and returns the replaced string. - /// - /// If no match is found, then a copy of the string is returned unchanged. - pub fn replace(&self, text: &str, rep: R) -> String { - self.replacen(text, 1, rep) - } - - /// Replaces all non-overlapping matches in `text` with the - /// replacement provided. This is the same as calling `replacen` with - /// `limit` set to `0`. - /// - /// See the documentation for `replace` for details on how to access - /// submatches in the replacement string. - pub fn replace_all(&self, text: &str, rep: R) -> String { - self.replacen(text, 0, rep) - } - - /// Replaces at most `limit` non-overlapping matches in `text` with the - /// replacement provided. If `limit` is 0, then all non-overlapping matches - /// are replaced. - /// - /// See the documentation for `replace` for details on how to access - /// submatches in the replacement string. - pub fn replacen - (&self, text: &str, limit: uint, mut rep: R) -> String { - let mut new = String::with_capacity(text.len()); - let mut last_match = 0u; - - for (i, cap) in self.captures_iter(text).enumerate() { - // It'd be nicer to use the 'take' iterator instead, but it seemed - // awkward given that '0' => no limit. - if limit > 0 && i >= limit { - break - } - - let (s, e) = cap.pos(0).unwrap(); // captures only reports matches - new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)[]); - last_match = e; - } - new.push_str(&text[last_match..text.len()]); - return new; - } - - /// Returns the original string of this regex. - pub fn as_str<'a>(&'a self) -> &'a str { - match *self { - Dynamic(ExDynamic { ref original, .. }) => &original[], - Native(ExNative { ref original, .. }) => &original[], - } - } - - #[doc(hidden)] - #[unstable] - pub fn names_iter<'a>(&'a self) -> NamesIter<'a> { - match *self { - Native(ref n) => NamesIterNative(n.names.iter()), - Dynamic(ref d) => NamesIterDynamic(d.names.iter()) - } - } - - fn names_len(&self) -> uint { - match *self { - Native(ref n) => n.names.len(), - Dynamic(ref d) => d.names.len() - } - } - -} - -#[derive(Clone)] -pub enum NamesIter<'a> { - NamesIterNative(::std::slice::Iter<'a, Option<&'static str>>), - NamesIterDynamic(::std::slice::Iter<'a, Option>) -} - -impl<'a> Iterator for NamesIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option> { - match *self { - NamesIterNative(ref mut i) => i.next().map(|x| x.map(|s| s.to_string())), - NamesIterDynamic(ref mut i) => i.next().map(|x| x.as_ref().map(|s| s.to_string())), - } - } -} - -/// NoExpand indicates literal string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal -/// string replacement without expanding `$name` to their corresponding -/// capture groups. -/// -/// `'r` is the lifetime of the literal text. -pub struct NoExpand<'t>(pub &'t str); - -/// Replacer describes types that can be used to replace matches in a string. -pub trait Replacer { - /// Returns a possibly owned string that is used to replace the match - /// corresponding to the `caps` capture group. - /// - /// The `'a` lifetime refers to the lifetime of a borrowed string when - /// a new owned string isn't needed (e.g., for `NoExpand`). - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> CowString<'a>; -} - -impl<'t> Replacer for NoExpand<'t> { - fn reg_replace<'a>(&'a mut self, _: &Captures) -> CowString<'a> { - let NoExpand(s) = *self; - s.into_cow() - } -} - -impl<'t> Replacer for &'t str { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> CowString<'a> { - caps.expand(*self).into_cow() - } -} - -impl Replacer for F where F: FnMut(&Captures) -> String { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> CowString<'a> { - (*self)(caps).into_cow() - } -} - -/// Yields all substrings delimited by a regular expression match. -/// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the string being split. -#[derive(Clone)] -pub struct RegexSplits<'r, 't> { - finder: FindMatches<'r, 't>, - last: uint, -} - -impl<'r, 't> Iterator for RegexSplits<'r, 't> { - type Item = &'t str; - - fn next(&mut self) -> Option<&'t str> { - let text = self.finder.search; - match self.finder.next() { - None => { - if self.last >= text.len() { - None - } else { - let s = &text[self.last..text.len()]; - self.last = text.len(); - Some(s) - } - } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; - Some(matched) - } - } - } -} - -/// Yields at most `N` substrings delimited by a regular expression match. -/// -/// The last substring will be whatever remains after splitting. -/// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the string being split. -#[derive(Clone)] -pub struct RegexSplitsN<'r, 't> { - splits: RegexSplits<'r, 't>, - cur: uint, - limit: uint, -} - -impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { - type Item = &'t str; - - fn next(&mut self) -> Option<&'t str> { - let text = self.splits.finder.search; - if self.cur >= self.limit { - None - } else { - self.cur += 1; - if self.cur >= self.limit { - Some(&text[self.splits.last..text.len()]) - } else { - self.splits.next() - } - } - } -} - -/// Captures represents a group of captured strings for a single match. -/// -/// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. -/// If a capture group is named, then the matched string is *also* available -/// via the `name` method. (Note that the 0th capture is always unnamed and so -/// must be accessed with the `at` method.) -/// -/// Positions returned from a capture group are always byte indices. -/// -/// `'t` is the lifetime of the matched text. -pub struct Captures<'t> { - text: &'t str, - locs: CaptureLocs, - named: Option>, -} - -impl<'t> Captures<'t> { - #[allow(unstable)] - fn new(re: &Regex, search: &'t str, locs: CaptureLocs) - -> Option> { - if !has_match(&locs) { - return None - } - - let named = - if re.names_len() == 0 { - None - } else { - let mut named = HashMap::new(); - for (i, name) in re.names_iter().enumerate() { - match name { - None => {}, - Some(name) => { - named.insert(name, i); - } - } - } - Some(named) - }; - Some(Captures { - text: search, - locs: locs, - named: named, - }) - } - - /// Returns the start and end positions of the Nth capture group. - /// Returns `None` if `i` is not a valid capture group or if the capture - /// group did not match anything. - /// The positions returned are *always* byte indices with respect to the - /// original string matched. - pub fn pos(&self, i: uint) -> Option<(uint, uint)> { - let (s, e) = (i * 2, i * 2 + 1); - if e >= self.locs.len() || self.locs[s].is_none() { - // VM guarantees that each pair of locations are both Some or None. - return None - } - Some((self.locs[s].unwrap(), self.locs[e].unwrap())) - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. - pub fn at(&self, i: uint) -> Option<&'t str> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s.. e]) - } - } - - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t str> { - match self.named { - None => None, - Some(ref h) => { - match h.get(name) { - None => None, - Some(i) => self.at(*i), - } - } - } - } - - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter(&'t self) -> SubCaptures<'t> { - SubCaptures { idx: 0, caps: self, } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, caps: self, } - } - - /// Expands all instances of `$name` in `text` to the corresponding capture - /// group `name`. - /// - /// `name` may be an integer corresponding to the index of the - /// capture group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. - /// - /// If `name` isn't a valid capture group (whether the name doesn't exist or - /// isn't a valid index), then it is replaced with the empty string. - /// - /// To write a literal `$` use `$$`. - pub fn expand(&self, text: &str) -> String { - // How evil can you get? - // FIXME: Don't use regexes for this. It's completely unnecessary. - let re = Regex::new(r"(^|[^$]|\b)\$(\w+)").unwrap(); - let text = re.replace_all(text, |&mut: refs: &Captures| -> String { - let pre = refs.at(1).unwrap_or(""); - let name = refs.at(2).unwrap_or(""); - format!("{}{}", pre, - match name.parse::() { - None => self.name(name).unwrap_or("").to_string(), - Some(i) => self.at(i).unwrap_or("").to_string(), - }) - }); - let re = Regex::new(r"\$\$").unwrap(); - re.replace_all(&text[], NoExpand("$")) - } - - /// Returns the number of captured groups. - #[inline] - pub fn len(&self) -> uint { self.locs.len() / 2 } - - /// Returns if there are no captured groups. - #[inline] - pub fn is_empty(&self) -> bool { self.len() == 0 } -} - -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'t` is the lifetime of the matched text. -#[derive(Clone)] -pub struct SubCaptures<'t> { - idx: uint, - caps: &'t Captures<'t>, -} - -impl<'t> Iterator for SubCaptures<'t> { - type Item = &'t str; - - fn next(&mut self) -> Option<&'t str> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.at(self.idx - 1).unwrap_or("")) - } else { - None - } - } -} - -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original string matched. -/// -/// `'t` is the lifetime of the matched text. -#[derive(Clone)] -pub struct SubCapturesPos<'t> { - idx: uint, - caps: &'t Captures<'t>, -} - -impl<'t> Iterator for SubCapturesPos<'t> { - type Item = Option<(uint, uint)>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.pos(self.idx - 1)) - } else { - None - } - } -} - -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. -/// -/// The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the matched string. -#[derive(Clone)] -pub struct FindCaptures<'r, 't> { - re: &'r Regex, - search: &'t str, - last_match: Option, - last_end: uint, -} - -impl<'r, 't> Iterator for FindCaptures<'r, 't> { - type Item = Captures<'t>; - - fn next(&mut self) -> Option> { - if self.last_end > self.search.len() { - return None - } - - let caps = exec_slice(self.re, Submatches, self.search, - self.last_end, self.search.len()); - let (s, e) = - if !has_match(&caps) { - return None - } else { - (caps[0].unwrap(), caps[1].unwrap()) - }; - - // Don't accept empty matches immediately following a match. - // i.e., no infinite loops please. - if e == s && Some(self.last_end) == self.last_match { - self.last_end += 1; - return self.next() - } - self.last_end = e; - self.last_match = Some(self.last_end); - Captures::new(self.re, self.search, caps) - } -} - -/// An iterator over all non-overlapping matches for a particular string. -/// -/// The iterator yields a tuple of integers corresponding to the start and end -/// of the match. The indices are byte offsets. The iterator stops when no more -/// matches can be found. -/// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the matched string. -#[derive(Clone)] -pub struct FindMatches<'r, 't> { - re: &'r Regex, - search: &'t str, - last_match: Option, - last_end: uint, -} - -impl<'r, 't> Iterator for FindMatches<'r, 't> { - type Item = (uint, uint); - - fn next(&mut self) -> Option<(uint, uint)> { - if self.last_end > self.search.len() { - return None - } - - let caps = exec_slice(self.re, Location, self.search, - self.last_end, self.search.len()); - let (s, e) = - if !has_match(&caps) { - return None - } else { - (caps[0].unwrap(), caps[1].unwrap()) - }; - - // Don't accept empty matches immediately following a match. - // i.e., no infinite loops please. - if e == s && Some(self.last_end) == self.last_match { - self.last_end += 1; - return self.next() - } - self.last_end = e; - self.last_match = Some(self.last_end); - Some((s, e)) - } -} - -fn exec(re: &Regex, which: MatchKind, input: &str) -> CaptureLocs { - exec_slice(re, which, input, 0, input.len()) -} - -fn exec_slice(re: &Regex, which: MatchKind, - input: &str, s: uint, e: uint) -> CaptureLocs { - match *re { - Dynamic(ExDynamic { ref prog, .. }) => vm::run(which, prog, input, s, e), - Native(ExNative { ref prog, .. }) => (*prog)(which, input, s, e), - } -} - -#[inline] -fn has_match(caps: &CaptureLocs) -> bool { - caps.len() >= 2 && caps[0].is_some() && caps[1].is_some() -} diff --git a/src/libregex/test/bench.rs b/src/libregex/test/bench.rs deleted file mode 100644 index 17521ff7ea5..00000000000 --- a/src/libregex/test/bench.rs +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. -#![allow(non_snake_case)] - -use std::rand::{Rng, thread_rng}; -use stdtest::Bencher; -use std::iter::repeat; - -use regex::{Regex, NoExpand}; - -fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) { - b.iter(|| if !re.is_match(text) { panic!("no match") }); -} - -#[bench] -fn no_exponential(b: &mut Bencher) { - let n = 100; - let re = Regex::new(format!("{}{}", - repeat("a?").take(n).collect::(), - repeat("a").take(n).collect::()).as_slice()).unwrap(); - let text = repeat("a").take(n).collect::(); - bench_assert_match(b, re, text.as_slice()); -} - -#[bench] -fn literal(b: &mut Bencher) { - let re = regex!("y"); - let text = format!("{}y", repeat("x").take(50).collect::()); - bench_assert_match(b, re, text.as_slice()); -} - -#[bench] -fn not_literal(b: &mut Bencher) { - let re = regex!(".y"); - let text = format!("{}y", repeat("x").take(50).collect::()); - bench_assert_match(b, re, text.as_slice()); -} - -#[bench] -fn match_class(b: &mut Bencher) { - let re = regex!("[abcdw]"); - let text = format!("{}w", repeat("xxxx").take(20).collect::()); - bench_assert_match(b, re, text.as_slice()); -} - -#[bench] -fn match_class_in_range(b: &mut Bencher) { - // 'b' is between 'a' and 'c', so the class range checking doesn't help. - let re = regex!("[ac]"); - let text = format!("{}c", repeat("bbbb").take(20).collect::()); - bench_assert_match(b, re, text.as_slice()); -} - -#[bench] -fn replace_all(b: &mut Bencher) { - let re = regex!("[cjrw]"); - let text = "abcdefghijklmnopqrstuvwxyz"; - // FIXME: This isn't using the $name expand stuff. - // It's possible RE2/Go is using it, but currently, the expand in this - // crate is actually compiling a regex, so it's incredibly slow. - b.iter(|| re.replace_all(text, NoExpand(""))); -} - -#[bench] -fn anchored_literal_short_non_match(b: &mut Bencher) { - let re = regex!("^zbc(d|e)"); - let text = "abcdefghijklmnopqrstuvwxyz"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn anchored_literal_long_non_match(b: &mut Bencher) { - let re = regex!("^zbc(d|e)"); - let text = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::(); - b.iter(|| re.is_match(text.as_slice())); -} - -#[bench] -fn anchored_literal_short_match(b: &mut Bencher) { - let re = regex!("^.bc(d|e)"); - let text = "abcdefghijklmnopqrstuvwxyz"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn anchored_literal_long_match(b: &mut Bencher) { - let re = regex!("^.bc(d|e)"); - let text = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::(); - b.iter(|| re.is_match(text.as_slice())); -} - -#[bench] -fn one_pass_short_a(b: &mut Bencher) { - let re = regex!("^.bc(d|e)*$"); - let text = "abcddddddeeeededd"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn one_pass_short_a_not(b: &mut Bencher) { - let re = regex!(".bc(d|e)*$"); - let text = "abcddddddeeeededd"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn one_pass_short_b(b: &mut Bencher) { - let re = regex!("^.bc(?:d|e)*$"); - let text = "abcddddddeeeededd"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn one_pass_short_b_not(b: &mut Bencher) { - let re = regex!(".bc(?:d|e)*$"); - let text = "abcddddddeeeededd"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn one_pass_long_prefix(b: &mut Bencher) { - let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); - let text = "abcdefghijklmnopqrstuvwxyz"; - b.iter(|| re.is_match(text)); -} - -#[bench] -fn one_pass_long_prefix_not(b: &mut Bencher) { - let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); - let text = "abcdefghijklmnopqrstuvwxyz"; - b.iter(|| re.is_match(text)); -} - -macro_rules! throughput { - ($name:ident, $regex:expr, $size:expr) => ( - #[bench] - fn $name(b: &mut Bencher) { - let text = gen_text($size); - b.bytes = $size; - b.iter(|| if $regex.is_match(text.as_slice()) { panic!("match") }); - } - ); -} - -fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } -fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } -fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } -fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } - -fn gen_text(n: uint) -> String { - let mut rng = thread_rng(); - let mut bytes = rng.gen_ascii_chars().map(|n| n as u8).take(n) - .collect::>(); - for (i, b) in bytes.iter_mut().enumerate() { - if i % 20 == 0 { - *b = b'\n' - } - } - String::from_utf8(bytes).unwrap() -} - -throughput!{easy0_32, easy0(), 32} -throughput!{easy0_1K, easy0(), 1<<10} -throughput!{easy0_32K, easy0(), 32<<10} - -throughput!{easy1_32, easy1(), 32} -throughput!{easy1_1K, easy1(), 1<<10} -throughput!{easy1_32K, easy1(), 32<<10} - -throughput!{medium_32, medium(), 32} -throughput!{medium_1K, medium(), 1<<10} -throughput!{medium_32K,medium(), 32<<10} - -throughput!{hard_32, hard(), 32} -throughput!{hard_1K, hard(), 1<<10} -throughput!{hard_32K,hard(), 32<<10} diff --git a/src/libregex/test/matches.rs b/src/libregex/test/matches.rs deleted file mode 100644 index 7508f4c50a2..00000000000 --- a/src/libregex/test/matches.rs +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// ignore-tidy-linelength - -// DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests' -// on 2014-04-23 01:33:36.539280. - -// Tests from basic.dat -mat!{match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))} -mat!{match_basic_4, r"a...b", r"abababbb", Some((2, 7))} -mat!{match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))} -mat!{match_basic_6, r"\)", r"()", Some((1, 2))} -mat!{match_basic_7, r"a]", r"a]a", Some((0, 2))} -mat!{match_basic_9, r"\}", r"}", Some((0, 1))} -mat!{match_basic_10, r"\]", r"]", Some((0, 1))} -mat!{match_basic_12, r"]", r"]", Some((0, 1))} -mat!{match_basic_15, r"^a", r"ax", Some((0, 1))} -mat!{match_basic_16, r"\^a", r"a^a", Some((1, 3))} -mat!{match_basic_17, r"a\^", r"a^", Some((0, 2))} -mat!{match_basic_18, r"a$", r"aa", Some((1, 2))} -mat!{match_basic_19, r"a\$", r"a$", Some((0, 2))} -mat!{match_basic_20, r"^$", r"", Some((0, 0))} -mat!{match_basic_21, r"$^", r"", Some((0, 0))} -mat!{match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))} -mat!{match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))} -mat!{match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))} -mat!{match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))} -mat!{match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3))} -mat!{match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))} -mat!{match_basic_28, r"a{0}b", r"ab", Some((1, 2))} -mat!{match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))} -mat!{match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))} -mat!{match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1))} -mat!{match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4))} -mat!{match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))} -mat!{match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2))} -mat!{match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1))} -mat!{match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))} -mat!{match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2))} -mat!{match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))} -mat!{match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))} -mat!{match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2))} -mat!{match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))} -mat!{match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))} -mat!{match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))} -mat!{match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))} -mat!{match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))} -mat!{match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2))} -mat!{match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2))} -mat!{match_basic_49, r"ab|a", r"xabc", Some((1, 3))} -mat!{match_basic_50, r"ab|a", r"xxabc", Some((2, 4))} -mat!{match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4))} -mat!{match_basic_52, r"[^-]", r"--a", Some((2, 3))} -mat!{match_basic_53, r"[a-]*", r"--a", Some((0, 3))} -mat!{match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))} -mat!{match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17))} -mat!{match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17))} -mat!{match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))} -mat!{match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))} -mat!{match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))} -mat!{match_basic_65, r" -", r" -", Some((0, 1))} -mat!{match_basic_66, r" -", r" -", Some((0, 1))} -mat!{match_basic_67, r"[^a]", r" -", Some((0, 1))} -mat!{match_basic_68, r" -a", r" -a", Some((0, 2))} -mat!{match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3))} -mat!{match_basic_70, r"xxx", r"xxx", Some((0, 3))} -mat!{match_basic_71, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6))} -mat!{match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3))} -mat!{match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11))} -mat!{match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1))} -mat!{match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2))} -mat!{match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81))} -mat!{match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25))} -mat!{match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22))} -mat!{match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11))} -mat!{match_basic_80, r".*", r"", Some((0, 2))} -mat!{match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57))} -mat!{match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))} -mat!{match_basic_84, r"^", r"", Some((0, 0))} -mat!{match_basic_85, r"$", r"", Some((0, 0))} -mat!{match_basic_86, r"^$", r"", Some((0, 0))} -mat!{match_basic_87, r"^a$", r"a", Some((0, 1))} -mat!{match_basic_88, r"abc", r"abc", Some((0, 3))} -mat!{match_basic_89, r"abc", r"xabcy", Some((1, 4))} -mat!{match_basic_90, r"abc", r"ababc", Some((2, 5))} -mat!{match_basic_91, r"ab*c", r"abc", Some((0, 3))} -mat!{match_basic_92, r"ab*bc", r"abc", Some((0, 3))} -mat!{match_basic_93, r"ab*bc", r"abbc", Some((0, 4))} -mat!{match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))} -mat!{match_basic_95, r"ab+bc", r"abbc", Some((0, 4))} -mat!{match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6))} -mat!{match_basic_97, r"ab?bc", r"abbc", Some((0, 4))} -mat!{match_basic_98, r"ab?bc", r"abc", Some((0, 3))} -mat!{match_basic_99, r"ab?c", r"abc", Some((0, 3))} -mat!{match_basic_100, r"^abc$", r"abc", Some((0, 3))} -mat!{match_basic_101, r"^abc", r"abcc", Some((0, 3))} -mat!{match_basic_102, r"abc$", r"aabc", Some((1, 4))} -mat!{match_basic_103, r"^", r"abc", Some((0, 0))} -mat!{match_basic_104, r"$", r"abc", Some((3, 3))} -mat!{match_basic_105, r"a.c", r"abc", Some((0, 3))} -mat!{match_basic_106, r"a.c", r"axc", Some((0, 3))} -mat!{match_basic_107, r"a.*c", r"axyzc", Some((0, 5))} -mat!{match_basic_108, r"a[bc]d", r"abd", Some((0, 3))} -mat!{match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))} -mat!{match_basic_110, r"a[b-d]", r"aac", Some((1, 3))} -mat!{match_basic_111, r"a[-b]", r"a-", Some((0, 2))} -mat!{match_basic_112, r"a[b-]", r"a-", Some((0, 2))} -mat!{match_basic_113, r"a]", r"a]", Some((0, 2))} -mat!{match_basic_114, r"a[]]b", r"a]b", Some((0, 3))} -mat!{match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))} -mat!{match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))} -mat!{match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))} -mat!{match_basic_118, r"ab|cd", r"abc", Some((0, 2))} -mat!{match_basic_119, r"ab|cd", r"abcd", Some((0, 2))} -mat!{match_basic_120, r"a\(b", r"a(b", Some((0, 3))} -mat!{match_basic_121, r"a\(*b", r"ab", Some((0, 2))} -mat!{match_basic_122, r"a\(*b", r"a((b", Some((0, 4))} -mat!{match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1))} -mat!{match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3))} -mat!{match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))} -mat!{match_basic_126, r"a*", r"aaa", Some((0, 3))} -mat!{match_basic_128, r"(a*)*", r"-", Some((0, 0)), None} -mat!{match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))} -mat!{match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None} -mat!{match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))} -mat!{match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2))} -mat!{match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))} -mat!{match_basic_135, r"[^ab]*", r"cde", Some((0, 3))} -mat!{match_basic_137, r"(^)*", r"-", Some((0, 0)), None} -mat!{match_basic_138, r"a*", r"", Some((0, 0))} -mat!{match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))} -mat!{match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))} -mat!{match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))} -mat!{match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))} -mat!{match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None} -mat!{match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))} -mat!{match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))} -mat!{match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))} -mat!{match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))} -mat!{match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))} -mat!{match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))} -mat!{match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))} -mat!{match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))} -mat!{match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))} -mat!{match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))} -mat!{match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4))} -mat!{match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))} -mat!{match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))} -mat!{match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4))} -mat!{match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))} -mat!{match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))} -mat!{match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5))} -mat!{match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), Some((0, 2)), Some((1, 2))} -mat!{match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", r"reffgz", Some((1, 6)), Some((1, 6))} -mat!{match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1))} -mat!{match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14))} -mat!{match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5))} -mat!{match_basic_167, r"abcd", r"abcd", Some((0, 4))} -mat!{match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))} -mat!{match_basic_169, r"a[-]?c", r"ac", Some((0, 3))} -mat!{match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12))} -mat!{match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12))} -mat!{match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12))} -mat!{match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11))} -mat!{match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12))} -mat!{match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11))} -mat!{match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15))} -mat!{match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12))} -mat!{match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11))} -mat!{match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13))} -mat!{match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12))} -mat!{match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12))} -mat!{match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))} -mat!{match_basic_188, r"^.+$", r"vivi", Some((0, 4))} -mat!{match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))} -mat!{match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", Some((0, 19)), Some((0, 7)), Some((16, 19))} -mat!{match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))} -mat!{match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11))} -mat!{match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), None, Some((0, 3))} -mat!{match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7))} -mat!{match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))} -mat!{match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))} -mat!{match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))} -mat!{match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))} -mat!{match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))} -mat!{match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7))} -mat!{match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3))} -mat!{match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))} -mat!{match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))} -mat!{match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3))} -mat!{match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))} -mat!{match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))} -mat!{match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))} -mat!{match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3))} -mat!{match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_214, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))} -mat!{match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))} -mat!{match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))} -mat!{match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))} -mat!{match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))} -mat!{match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))} -mat!{match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))} -mat!{match_basic_221, r"\\000", r"\000", Some((0, 4))} - -// Tests from nullsubexpr.dat -mat!{match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None} -mat!{match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))} -mat!{match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))} -mat!{match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_17, r"(a+)+", r"x", None} -mat!{match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None} -mat!{match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))} -mat!{match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None} -mat!{match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5))} -mat!{match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))} -mat!{match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None} -mat!{match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6))} -mat!{match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None} -mat!{match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2))} -mat!{match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1))} -mat!{match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))} -mat!{match_nullsubexpr_71, r"(a*)*(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))} -mat!{match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))} -mat!{match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))} -mat!{match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))} -mat!{match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))} -mat!{match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2))} -mat!{match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2))} - -// Tests from repetition.dat -mat!{match_repetition_10, r"((..)|(.))", r"", None} -mat!{match_repetition_11, r"((..)|(.))((..)|(.))", r"", None} -mat!{match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None} -mat!{match_repetition_14, r"((..)|(.)){1}", r"", None} -mat!{match_repetition_15, r"((..)|(.)){2}", r"", None} -mat!{match_repetition_16, r"((..)|(.)){3}", r"", None} -mat!{match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))} -mat!{match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))} -mat!{match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None} -mat!{match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None} -mat!{match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))} -mat!{match_repetition_25, r"((..)|(.)){2}", r"a", None} -mat!{match_repetition_26, r"((..)|(.)){3}", r"a", None} -mat!{match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))} -mat!{match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2))} -mat!{match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None} -mat!{match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2))} -mat!{match_repetition_36, r"((..)|(.)){3}", r"aa", None} -mat!{match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3))} -mat!{match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3))} -mat!{match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))} -mat!{match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3))} -mat!{match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))} -mat!{match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4))} -mat!{match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4))} -mat!{match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_63, r"((..)|(.))", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5))} -mat!{match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))} -mat!{match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))} -mat!{match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None} -mat!{match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None} -mat!{match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None} -mat!{match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None} -mat!{match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None} -mat!{match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))} -mat!{match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))} -mat!{match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))} -mat!{match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))} -mat!{match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None} -mat!{match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))} -mat!{match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))} -mat!{match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None} -mat!{match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))} -mat!{match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None} -mat!{match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None} -mat!{match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} -mat!{match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))} - diff --git a/src/libregex/test/mod.rs b/src/libregex/test/mod.rs deleted file mode 100644 index e11094b1174..00000000000 --- a/src/libregex/test/mod.rs +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -macro_rules! regex { - ($re:expr) => ( - match ::regex::Regex::new($re) { - Ok(re) => re, - Err(err) => panic!("{:?}", err), - } - ); -} - -#[path = "bench.rs"] -mod dynamic_bench; -#[path = "tests.rs"] -mod dynamic_tests; - diff --git a/src/libregex/test/native_static.rs b/src/libregex/test/native_static.rs deleted file mode 100644 index 62e14731c20..00000000000 --- a/src/libregex/test/native_static.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use regex::Regex; -static RE: Regex = regex!(r"\d+"); - -#[test] -fn static_splitn() { - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = RE.splitn(text, 2).collect(); - assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); -} - -#[test] -fn static_split() { - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = RE.split(text).collect(); - assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); -} diff --git a/src/libregex/test/tests.rs b/src/libregex/test/tests.rs deleted file mode 100644 index b69420ac05b..00000000000 --- a/src/libregex/test/tests.rs +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// ignore-tidy-linelength -// ignore-lexer-test FIXME #15679 - -use regex::{Regex, NoExpand}; - -#[test] -fn splitn() { - let re = regex!(r"\d+"); - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = re.splitn(text, 2).collect(); - assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); -} - -#[test] -fn split() { - let re = regex!(r"\d+"); - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = re.split(text).collect(); - assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); -} - -#[test] -fn empty_regex_empty_match() { - let re = regex!(""); - let ms = re.find_iter("").collect::>(); - assert_eq!(ms, vec![(0, 0)]); -} - -#[test] -fn empty_regex_nonempty_match() { - let re = regex!(""); - let ms = re.find_iter("abc").collect::>(); - assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); -} - -#[test] -fn quoted_bracket_set() { - let re = regex!(r"([\x{5b}\x{5d}])"); - let ms = re.find_iter("[]").collect::>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); - let re = regex!(r"([\[\]])"); - let ms = re.find_iter("[]").collect::>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -#[test] -fn first_range_starts_with_left_bracket() { - let re = regex!(r"([[-z])"); - let ms = re.find_iter("[]").collect::>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -#[test] -fn range_ends_with_escape() { - let re = regex!(r"([\[-\x{5d}])"); - let ms = re.find_iter("[]").collect::>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -macro_rules! replace { - ($name:ident, $which:ident, $re:expr, - $search:expr, $replace:expr, $result:expr) => ( - #[test] - fn $name() { - let re = regex!($re); - assert_eq!(re.$which($search, $replace), String::from_str($result)); - } - ); -} - -replace!{rep_first, replace, r"\d", "age: 26", "Z", "age: Z6"} -replace!{rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z"} -replace!{rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ"} -replace!{rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1"} -replace!{rep_double_dollar, replace, - r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1"} -replace!{rep_no_expand, replace, - r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1"} -replace!{rep_named, replace_all, - r"(?P\S+)\s+(?P\S+)(?P\s*)", - "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3"} -replace!{rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", - "", "trim me"} - -macro_rules! noparse { - ($name:ident, $re:expr) => ( - #[test] - fn $name() { - let re = $re; - match Regex::new(re) { - Err(_) => {}, - Ok(_) => panic!("Regex '{}' should cause a parse error.", re), - } - } - ); -} - -noparse!{fail_double_repeat, "a**"} -noparse!{fail_no_repeat_arg, "*"} -noparse!{fail_no_repeat_arg_begin, "^*"} -noparse!{fail_incomplete_escape, "\\"} -noparse!{fail_class_incomplete, "[A-"} -noparse!{fail_class_not_closed, "[A"} -noparse!{fail_class_no_begin, r"[\A]"} -noparse!{fail_class_no_end, r"[\z]"} -noparse!{fail_class_no_boundary, r"[\b]"} -noparse!{fail_open_paren, "("} -noparse!{fail_close_paren, ")"} -noparse!{fail_invalid_range, "[a-Z]"} -noparse!{fail_empty_capture_name, "(?P<>a)"} -noparse!{fail_empty_capture_exp, "(?P)"} -noparse!{fail_bad_capture_name, "(?P)"} -noparse!{fail_bad_flag, "(?a)a"} -noparse!{fail_empty_alt_before, "|a"} -noparse!{fail_empty_alt_after, "a|"} -noparse!{fail_counted_big_exact, "a{1001}"} -noparse!{fail_counted_big_min, "a{1001,}"} -noparse!{fail_counted_no_close, "a{1001"} -noparse!{fail_unfinished_cap, "(?"} -noparse!{fail_unfinished_escape, "\\"} -noparse!{fail_octal_digit, r"\8"} -noparse!{fail_hex_digit, r"\xG0"} -noparse!{fail_hex_short, r"\xF"} -noparse!{fail_hex_long_digits, r"\x{fffg}"} -noparse!{fail_flag_bad, "(?a)"} -noparse!{fail_flag_empty, "(?)"} -noparse!{fail_double_neg, "(?-i-i)"} -noparse!{fail_neg_empty, "(?i-)"} -noparse!{fail_empty_group, "()"} -noparse!{fail_dupe_named, "(?P.)(?P.)"} -noparse!{fail_range_end_no_class, "[a-[:lower:]]"} -noparse!{fail_range_end_no_begin, r"[a-\A]"} -noparse!{fail_range_end_no_end, r"[a-\z]"} -noparse!{fail_range_end_no_boundary, r"[a-\b]"} -noparse!{fail_repeat_no_expr, r"-|+"} - -macro_rules! mat { - ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( - #[test] - fn $name() { - let text = $text; - let expected: Vec> = vec!($($loc)+); - let r = regex!($re); - let got = match r.captures(text) { - Some(c) => c.iter_pos().collect::>>(), - None => vec!(None), - }; - // The test set sometimes leave out capture groups, so truncate - // actual capture groups to match test set. - let mut sgot = got.as_slice(); - if sgot.len() > expected.len() { - sgot = &sgot[..expected.len()] - } - if expected != sgot { - panic!("For RE '{}' against '{}', expected '{:?}' but got '{:?}'", - $re, text, expected, sgot); - } - } - ); -} - -// Some crazy expressions from regular-expressions.info. -mat!{match_ranges, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", - "num: 255", Some((5, 8))} -mat!{match_ranges_not, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", - "num: 256", None} -mat!{match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))} -mat!{match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))} -mat!{match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))} -mat!{match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None} -mat!{match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", - "mine is jam.slam@gmail.com ", Some((8, 26))} -mat!{match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", - "mine is jam.slam@gmail ", None} -mat!{match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", - "mine is jam.slam@gmail.com ", Some((8, 26))} -mat!{match_date1, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-01-01", Some((0, 10))} -mat!{match_date2, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-00-01", None} -mat!{match_date3, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-13-01", None} - -// Exercise the flags. -mat!{match_flag_case, "(?i)abc", "ABC", Some((0, 3))} -mat!{match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))} -mat!{match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None} -mat!{match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))} -mat!{match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))} -mat!{match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None} -mat!{match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))} -mat!{match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))} -mat!{match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))} -mat!{match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))} -mat!{match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))} - -// Some Unicode tests. -// A couple of these are commented out because something in the guts of macro expansion is creating -// invalid byte strings. -//mat!{uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3))} -mat!{uni_one, r"\pN", "Ⅰ", Some((0, 3))} -mat!{uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))} -mat!{uni_not, r"\PN+", "abⅠ", Some((0, 2))} -mat!{uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))} -mat!{uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))} -mat!{uni_case, r"(?i)Δ", "δ", Some((0, 2))} -//mat!{uni_case_not, r"Δ", "δ", None} -mat!{uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))} -mat!{uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))} -mat!{uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))} -mat!{uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))} - -// Test the Unicode friendliness of Perl character classes. -mat!{uni_perl_w, r"\w+", "dδd", Some((0, 4))} -mat!{uni_perl_w_not, r"\w+", "⥡", None} -mat!{uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))} -mat!{uni_perl_d, r"\d+", "1२३9", Some((0, 8))} -mat!{uni_perl_d_not, r"\d+", "Ⅱ", None} -mat!{uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))} -mat!{uni_perl_s, r"\s+", " ", Some((0, 3))} -mat!{uni_perl_s_not, r"\s+", "☃", None} -mat!{uni_perl_s_neg, r"\S+", "☃", Some((0, 3))} - -// And do the same for word boundaries. -mat!{uni_boundary_none, r"\d\b", "6δ", None} -mat!{uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))} - -// A whole mess of tests from Glenn Fowler's regex test suite. -// Generated by the 'src/etc/regex-match-tests' program. -mod matches; diff --git a/src/libregex/testdata/LICENSE b/src/libregex/testdata/LICENSE deleted file mode 100644 index f47dbf4c449..00000000000 --- a/src/libregex/testdata/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -The following license covers testregex.c and all associated test data. - -Permission is hereby granted, free of charge, to any person obtaining a -copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, and/or sell copies of the -Software, and to permit persons to whom the Software is furnished to do -so, subject to the following disclaimer: - -THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/libregex/testdata/README b/src/libregex/testdata/README deleted file mode 100644 index 33b0ba17ed7..00000000000 --- a/src/libregex/testdata/README +++ /dev/null @@ -1,17 +0,0 @@ -Test data was taken from the Go distribution, which was in turn taken from the -testregex test suite: - - http://www2.research.att.com/~astopen/testregex/testregex.html - -The LICENSE in this directory corresponds to the LICENSE that the data was -released under. - -The tests themselves were modified for RE2/Go. A couple were modified further -by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. -(Yes, it seems like RE2/Go includes failing test cases.) This may or may not -have been a bad idea, but I think being consistent with an established Regex -library is worth something. - -Note that these files are read by 'src/etc/regexp-match-tests' and turned into -Rust tests found in 'src/libregexp/tests/matches.rs'. - diff --git a/src/libregex/testdata/basic.dat b/src/libregex/testdata/basic.dat deleted file mode 100644 index e55efaeec06..00000000000 --- a/src/libregex/testdata/basic.dat +++ /dev/null @@ -1,221 +0,0 @@ -NOTE all standard compliant implementations should pass these : 2002-05-31 - -BE abracadabra$ abracadabracadabra (7,18) -BE a...b abababbb (2,7) -BE XXXXXX ..XXXXXX (2,8) -E \) () (1,2) -BE a] a]a (0,2) -B } } (0,1) -E \} } (0,1) -BE \] ] (0,1) -B ] ] (0,1) -E ] ] (0,1) -B { { (0,1) -B } } (0,1) -BE ^a ax (0,1) -BE \^a a^a (1,3) -BE a\^ a^ (0,2) -BE a$ aa (1,2) -BE a\$ a$ (0,2) -BE ^$ NULL (0,0) -E $^ NULL (0,0) -E a($) aa (1,2)(2,2) -E a*(^a) aa (0,1)(0,1) -E (..)*(...)* a (0,0) -E (..)*(...)* abcd (0,4)(2,4) -E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) -E (ab)c|abc abc (0,3)(0,2) -E a{0}b ab (1,2) -E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) -E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) -E a{9876543210} NULL BADBR -E ((a|a)|a) a (0,1)(0,1)(0,1) -E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) -E a*(a.|aa) aaaa (0,4)(2,4) -E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) -E (a|b)?.* b (0,1)(0,1) -E (a|b)c|a(b|c) ac (0,2)(0,1) -E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) -E (a|b)*c|(a|ab)*c abc (0,3)(1,2) -E (a|b)*c|(a|ab)*c xc (1,2) -E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) -E a?(ab|ba)ab abab (0,4)(0,2) -E a?(ac{0}b|ba)ab abab (0,4)(0,2) -E ab|abab abbabab (0,2) -E aba|bab|bba baaabbbaba (5,8) -E aba|bab baaabbbaba (6,9) -E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) -E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) -E ab|a xabc (1,3) -E ab|a xxabc (2,4) -Ei (Ab|cD)* aBcD (0,4)(2,4) -BE [^-] --a (2,3) -BE [a-]* --a (0,3) -BE [a-m-]* --amoma-- (0,4) -E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) -E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) -{E [[:upper:]] A (0,1) [[]] not supported -E [[:lower:]]+ `az{ (1,3) -E [[:upper:]]+ @AZ[ (1,3) -# No collation in Go -#BE [[-]] [[-]] (2,4) -#BE [[.NIL.]] NULL ECOLLATE -#BE [[=aleph=]] NULL ECOLLATE -} -BE$ \n \n (0,1) -BEn$ \n \n (0,1) -BE$ [^a] \n (0,1) -BE$ \na \na (0,2) -E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) -BE xxx xxx (0,3) -E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) -E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) -E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) -E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) -E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) -E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) -E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) -E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) -E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) -BE$ .* \x01\x7f (0,2) -E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) -L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH -E a*a*a*a*a*b aaaaaaaaab (0,10) -BE ^ NULL (0,0) -BE $ NULL (0,0) -BE ^$ NULL (0,0) -BE ^a$ a (0,1) -BE abc abc (0,3) -BE abc xabcy (1,4) -BE abc ababc (2,5) -BE ab*c abc (0,3) -BE ab*bc abc (0,3) -BE ab*bc abbc (0,4) -BE ab*bc abbbbc (0,6) -E ab+bc abbc (0,4) -E ab+bc abbbbc (0,6) -E ab?bc abbc (0,4) -E ab?bc abc (0,3) -E ab?c abc (0,3) -BE ^abc$ abc (0,3) -BE ^abc abcc (0,3) -BE abc$ aabc (1,4) -BE ^ abc (0,0) -BE $ abc (3,3) -BE a.c abc (0,3) -BE a.c axc (0,3) -BE a.*c axyzc (0,5) -BE a[bc]d abd (0,3) -BE a[b-d]e ace (0,3) -BE a[b-d] aac (1,3) -BE a[-b] a- (0,2) -BE a[b-] a- (0,2) -BE a] a] (0,2) -BE a[]]b a]b (0,3) -BE a[^bc]d aed (0,3) -BE a[^-b]c adc (0,3) -BE a[^]b]c adc (0,3) -E ab|cd abc (0,2) -E ab|cd abcd (0,2) -E a\(b a(b (0,3) -E a\(*b ab (0,2) -E a\(*b a((b (0,4) -E ((a)) abc (0,1)(0,1)(0,1) -E (a)b(c) abc (0,3)(0,1)(2,3) -E a+b+c aabbabc (4,7) -E a* aaa (0,3) -#E (a*)* - (0,0)(0,0) -E (a*)* - (0,0)(?,?) RE2/Go -E (a*)+ - (0,0)(0,0) -#E (a*|b)* - (0,0)(0,0) -E (a*|b)* - (0,0)(?,?) RE2/Go -E (a+|b)* ab (0,2)(1,2) -E (a+|b)+ ab (0,2)(1,2) -E (a+|b)? ab (0,1)(0,1) -BE [^ab]* cde (0,3) -#E (^)* - (0,0)(0,0) -E (^)* - (0,0)(?,?) RE2/Go -BE a* NULL (0,0) -E ([abc])*d abbbcd (0,6)(4,5) -E ([abc])*bcd abcd (0,4)(0,1) -E a|b|c|d|e e (0,1) -E (a|b|c|d|e)f ef (0,2)(0,1) -#E ((a*|b))* - (0,0)(0,0)(0,0) -E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go -BE abcd*efg abcdefg (0,7) -BE ab* xabyabbbz (1,3) -BE ab* xayabbbz (1,2) -E (ab|cd)e abcde (2,5)(2,4) -BE [abhgefdc]ij hij (0,3) -E (a|b)c*d abcd (1,4)(1,2) -E (ab|ab*)bc abc (0,3)(0,1) -E a([bc]*)c* abc (0,3)(1,3) -E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) -E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) -E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) -E a[bcd]*dcdcde adcdcde (0,7) -E (ab|a)b*c abc (0,3)(0,2) -E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) -BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) -E ^a(bc+|b[eh])g|.h$ abh (1,3) -E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) -E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) -E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) -E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) -BE multiple words multiple words yeah (0,14) -E (.*)c(.*) abcde (0,5)(0,2)(3,5) -BE abcd abcd (0,4) -E a(bc)d abcd (0,4)(1,3) -E a[-]?c ac (0,3) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) -E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) -E a+(b|c)*d+ aabcdd (0,6)(3,4) -E ^.+$ vivi (0,4) -E ^(.+)$ vivi (0,4)(0,4) -E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) -E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) -E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) -E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) -E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) -E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) -E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) -E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) -E ((foo)|bar)!bas bar!bas (0,7)(0,3) -E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) -E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) -E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) -E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) -E (foo|(bar))!bas foo!bas (0,7)(0,3) -E (foo|bar)!bas bar!bas (0,7)(0,3) -E (foo|bar)!bas foo!bar!bas (4,11)(4,7) -E (foo|bar)!bas foo!bas (0,7)(0,3) -E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) -E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) -E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) -E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) -E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) -E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) -E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) -E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) -E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) -E .*(/XXX).* /XXX (0,4)(0,4) -E .*(\\XXX).* \XXX (0,4)(0,4) -E \\XXX \XXX (0,4) -E .*(/000).* /000 (0,4)(0,4) -E .*(\\000).* \000 (0,4)(0,4) -E \\000 \000 (0,4) diff --git a/src/libregex/testdata/nullsubexpr.dat b/src/libregex/testdata/nullsubexpr.dat deleted file mode 100644 index 2e18fbb9170..00000000000 --- a/src/libregex/testdata/nullsubexpr.dat +++ /dev/null @@ -1,79 +0,0 @@ -NOTE null subexpression matches : 2002-06-06 - -E (a*)* a (0,1)(0,1) -#E SAME x (0,0)(0,0) -E SAME x (0,0)(?,?) RE2/Go -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) -E (a*)+ a (0,1)(0,1) -E SAME x (0,0)(0,0) -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) -E (a+)* a (0,1)(0,1) -E SAME x (0,0) -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) -E (a+)+ a (0,1)(0,1) -E SAME x NOMATCH -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) - -E ([a]*)* a (0,1)(0,1) -#E SAME x (0,0)(0,0) -E SAME x (0,0)(?,?) RE2/Go -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) -E ([a]*)+ a (0,1)(0,1) -E SAME x (0,0)(0,0) -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaax (0,6)(0,6) -E ([^b]*)* a (0,1)(0,1) -#E SAME b (0,0)(0,0) -E SAME b (0,0)(?,?) RE2/Go -E SAME aaaaaa (0,6)(0,6) -E SAME aaaaaab (0,6)(0,6) -E ([ab]*)* a (0,1)(0,1) -E SAME aaaaaa (0,6)(0,6) -E SAME ababab (0,6)(0,6) -E SAME bababa (0,6)(0,6) -E SAME b (0,1)(0,1) -E SAME bbbbbb (0,6)(0,6) -E SAME aaaabcde (0,5)(0,5) -E ([^a]*)* b (0,1)(0,1) -E SAME bbbbbb (0,6)(0,6) -#E SAME aaaaaa (0,0)(0,0) -E SAME aaaaaa (0,0)(?,?) RE2/Go -E ([^ab]*)* ccccxx (0,6)(0,6) -#E SAME ababab (0,0)(0,0) -E SAME ababab (0,0)(?,?) RE2/Go - -E ((z)+|a)* zabcde (0,2)(1,2) - -#{E a+? aaaaaa (0,1) no *? +? mimimal match ops -#E (a) aaa (0,1)(0,1) -#E (a*?) aaa (0,0)(0,0) -#E (a)*? aaa (0,0) -#E (a*?)*? aaa (0,0) -#} - -B \(a*\)*\(x\) x (0,1)(0,0)(0,1) -B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) -B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) -B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) -B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) -B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) -B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) -B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) - -#E (a*)*(x) x (0,1)(0,0)(0,1) -E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go -E (a*)*(x) ax (0,2)(0,1)(1,2) -E (a*)*(x) axa (0,2)(0,1)(1,2) - -E (a*)+(x) x (0,1)(0,0)(0,1) -E (a*)+(x) ax (0,2)(0,1)(1,2) -E (a*)+(x) axa (0,2)(0,1)(1,2) - -E (a*){2}(x) x (0,1)(0,0)(0,1) -E (a*){2}(x) ax (0,2)(1,1)(1,2) -E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/src/libregex/testdata/repetition.dat b/src/libregex/testdata/repetition.dat deleted file mode 100644 index 3bb21211800..00000000000 --- a/src/libregex/testdata/repetition.dat +++ /dev/null @@ -1,163 +0,0 @@ -NOTE implicit vs. explicit repetitions : 2009-02-02 - -# Glenn Fowler -# conforming matches (column 4) must match one of the following BREs -# NOMATCH -# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* -# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* -# i.e., each 3-tuple has two identical elements and one (?,?) - -E ((..)|(.)) NULL NOMATCH -E ((..)|(.))((..)|(.)) NULL NOMATCH -E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH - -E ((..)|(.)){1} NULL NOMATCH -E ((..)|(.)){2} NULL NOMATCH -E ((..)|(.)){3} NULL NOMATCH - -E ((..)|(.))* NULL (0,0) - -E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) -E ((..)|(.))((..)|(.)) a NOMATCH -E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH - -E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) -E ((..)|(.)){2} a NOMATCH -E ((..)|(.)){3} a NOMATCH - -E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) - -E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) -E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH - -E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) -E ((..)|(.)){3} aa NOMATCH - -E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) - -E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) -E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) - -E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) -#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) -E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go -E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) - -#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) -E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go - -E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) -E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) - -E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) -#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) -E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go - -E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) - -E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) -E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) - -E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) -#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) -E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go - -#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) -E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go - -E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) -E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) - -E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) -E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) -E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) - -E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) - -NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 - -# These test a bug in OS X / FreeBSD / NetBSD, and libtree. -# Linux/GLIBC gets the {8,} and {8,8} wrong. - -:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) -:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) -:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) -:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) -:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) -:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) -:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) -:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) -:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) -#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) -:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) -:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) -:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) -:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) -:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) -:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) -:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go -#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) -:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go -:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) - -# These test a fixed bug in my regex-tdfa that did not keep the expanded -# form properly grouped, so right association did the wrong thing with -# these ambiguous patterns (crafted just to test my code when I became -# suspicious of my implementation). The first subexpression should use -# "ab" then "a" then "bcd". - -# OS X / FreeBSD / NetBSD badly fail many of these, with impossible -# results like (0,6)(4,5)(6,6). - -:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) -:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) -:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH -:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) -:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) -:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH -:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) -:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) - -# The above worked on Linux/GLIBC but the following often fail. -# They also trip up OS X / FreeBSD / NetBSD: - -#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) -:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH -#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) -:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH -#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) -:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go -#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) -:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/src/libregex/vm.rs b/src/libregex/vm.rs deleted file mode 100644 index 9605536a052..00000000000 --- a/src/libregex/vm.rs +++ /dev/null @@ -1,582 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// FIXME: Currently, the VM simulates an NFA. It would be nice to have another -// VM that simulates a DFA. -// -// According to Russ Cox[1], a DFA performs better than an NFA, principally -// because it reuses states previously computed by the machine *and* doesn't -// keep track of capture groups. The drawback of a DFA (aside from its -// complexity) is that it can't accurately return the locations of submatches. -// The NFA *can* do that. (This is my understanding anyway.) -// -// Cox suggests that a DFA ought to be used to answer "does this match" and -// "where does it match" questions. (In the latter, the starting position of -// the match is computed by executing the regex backwards.) Cox also suggests -// that a DFA should be run when asking "where are the submatches", which can -// 1) quickly answer "no" is there's no match and 2) discover the substring -// that matches, which means running the NFA on smaller input. -// -// Currently, the NFA simulation implemented below does some dirty tricks to -// avoid tracking capture groups when they aren't needed (which only works -// for 'is_match', not 'find'). This is a half-measure, but does provide some -// perf improvement. -// -// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go. -// -// [1] - http://swtch.com/~rsc/regex/regex3.html - -pub use self::MatchKind::*; -pub use self::StepState::*; - -use std::cmp; -use std::cmp::Ordering::{self, Less, Equal, Greater}; -use std::mem; -use std::iter::repeat; -use std::slice::SliceExt; -use compile::{ - Program, - Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, - Save, Jump, Split, -}; -use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; -use unicode::regex::PERLW; - -pub type CaptureLocs = Vec>; - -/// Indicates the type of match to be performed by the VM. -#[derive(Copy)] -pub enum MatchKind { - /// Only checks if a match exists or not. Does not return location. - Exists, - /// Returns the start and end indices of the entire match in the input - /// given. - Location, - /// Returns the start and end indices of each submatch in the input given. - Submatches, -} - -/// Runs an NFA simulation on the compiled expression given on the search text -/// `input`. The search begins at byte index `start` and ends at byte index -/// `end`. (The range is specified here so that zero-width assertions will work -/// correctly when searching for successive non-overlapping matches.) -/// -/// The `which` parameter indicates what kind of capture information the caller -/// wants. There are three choices: match existence only, the location of the -/// entire match or the locations of the entire match in addition to the -/// locations of each submatch. -pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str, - start: uint, end: uint) -> CaptureLocs { - Nfa { - which: which, - prog: prog, - input: input, - start: start, - end: end, - ic: 0, - chars: CharReader::new(input), - }.run() -} - -struct Nfa<'r, 't> { - which: MatchKind, - prog: &'r Program, - input: &'t str, - start: uint, - end: uint, - ic: uint, - chars: CharReader<'t>, -} - -/// Indicates the next action to take after a single non-empty instruction -/// is processed. -#[derive(Copy)] -pub enum StepState { - /// This is returned if and only if a Match instruction is reached and - /// we only care about the existence of a match. It instructs the VM to - /// quit early. - StepMatchEarlyReturn, - /// Indicates that a match was found. Thus, the rest of the states in the - /// *current* queue should be dropped (i.e., leftmost-first semantics). - /// States in the "next" queue can still be processed. - StepMatch, - /// No match was found. Continue with the next state in the queue. - StepContinue, -} - -impl<'r, 't> Nfa<'r, 't> { - fn run(&mut self) -> CaptureLocs { - let ncaps = match self.which { - Exists => 0, - Location => 1, - Submatches => self.prog.num_captures(), - }; - let mut matched = false; - let ninsts = self.prog.insts.len(); - let mut clist = &mut Threads::new(self.which, ninsts, ncaps); - let mut nlist = &mut Threads::new(self.which, ninsts, ncaps); - - let mut groups: Vec<_> = repeat(None).take(ncaps * 2).collect(); - - // Determine if the expression starts with a '^' so we can avoid - // simulating .*? - // Make sure multi-line mode isn't enabled for it, otherwise we can't - // drop the initial .*? - let prefix_anchor = - match self.prog.insts[1] { - EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, - _ => false, - }; - - self.ic = self.start; - let mut next_ic = self.chars.set(self.start); - while self.ic <= self.end { - if clist.size == 0 { - // We have a match and we're done exploring alternatives. - // Time to quit. - if matched { - break - } - - // If there are no threads to try, then we'll have to start - // over at the beginning of the regex. - // BUT, if there's a literal prefix for the program, try to - // jump ahead quickly. If it can't be found, then we can bail - // out early. - if self.prog.prefix.len() > 0 && clist.size == 0 { - let needle = self.prog.prefix.as_bytes(); - let haystack = &self.input.as_bytes()[self.ic..]; - match find_prefix(needle, haystack) { - None => break, - Some(i) => { - self.ic += i; - next_ic = self.chars.set(self.ic); - } - } - } - } - - // This simulates a preceding '.*?' for every regex by adding - // a state starting at the current position in the input for the - // beginning of the program only if we don't already have a match. - if clist.size == 0 || (!prefix_anchor && !matched) { - self.add(clist, 0, groups.as_mut_slice()) - } - - // Now we try to read the next character. - // As a result, the 'step' method will look at the previous - // character. - self.ic = next_ic; - next_ic = self.chars.advance(); - - for i in range(0, clist.size) { - let pc = clist.pc(i); - let step_state = self.step(groups.as_mut_slice(), nlist, - clist.groups(i), pc); - match step_state { - StepMatchEarlyReturn => return vec![Some(0), Some(0)], - StepMatch => { matched = true; break }, - StepContinue => {}, - } - } - mem::swap(&mut clist, &mut nlist); - nlist.empty(); - } - match self.which { - Exists if matched => vec![Some(0), Some(0)], - Exists => vec![None, None], - Location | Submatches => groups, - } - } - - fn step(&self, groups: &mut [Option], nlist: &mut Threads, - caps: &mut [Option], pc: uint) - -> StepState { - match self.prog.insts[pc] { - Match => { - match self.which { - Exists => { - return StepMatchEarlyReturn - } - Location => { - groups[0] = caps[0]; - groups[1] = caps[1]; - return StepMatch - } - Submatches => { - for (slot, val) in groups.iter_mut().zip(caps.iter()) { - *slot = *val; - } - return StepMatch - } - } - } - OneChar(c, flags) => { - if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { - self.add(nlist, pc+1, caps); - } - } - CharClass(ref ranges, flags) => { - if self.chars.prev.is_some() { - let c = self.chars.prev.unwrap(); - let negate = flags & FLAG_NEGATED > 0; - let casei = flags & FLAG_NOCASE > 0; - let found = ranges.as_slice(); - let found = found.binary_search_by(|&rc| class_cmp(casei, c, rc)).is_ok(); - if found ^ negate { - self.add(nlist, pc+1, caps); - } - } - } - Any(flags) => { - if flags & FLAG_DOTNL > 0 - || !self.char_eq(false, self.chars.prev, '\n') { - self.add(nlist, pc+1, caps) - } - } - EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) - | Save(_) | Jump(_) | Split(_, _) => {}, - } - StepContinue - } - - fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option]) { - if nlist.contains(pc) { - return - } - // We have to add states to the threads list even if their empty. - // TL;DR - It prevents cycles. - // If we didn't care about cycles, we'd *only* add threads that - // correspond to non-jumping instructions (OneChar, Any, Match, etc.). - // But, it's possible for valid regexs (like '(a*)*') to result in - // a cycle in the instruction list. e.g., We'll keep chasing the Split - // instructions forever. - // So we add these instructions to our thread queue, but in the main - // VM loop, we look for them but simply ignore them. - // Adding them to the queue prevents them from being revisited so we - // can avoid cycles (and the inevitable stack overflow). - // - // We make a minor optimization by indicating that the state is "empty" - // so that its capture groups are not filled in. - match self.prog.insts[pc] { - EmptyBegin(flags) => { - let multi = flags & FLAG_MULTI > 0; - nlist.add(pc, groups, true); - if self.chars.is_begin() - || (multi && self.char_is(self.chars.prev, '\n')) { - self.add(nlist, pc + 1, groups) - } - } - EmptyEnd(flags) => { - let multi = flags & FLAG_MULTI > 0; - nlist.add(pc, groups, true); - if self.chars.is_end() - || (multi && self.char_is(self.chars.cur, '\n')) { - self.add(nlist, pc + 1, groups) - } - } - EmptyWordBoundary(flags) => { - nlist.add(pc, groups, true); - if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { - self.add(nlist, pc + 1, groups) - } - } - Save(slot) => { - nlist.add(pc, groups, true); - match self.which { - Location if slot <= 1 => { - let old = groups[slot]; - groups[slot] = Some(self.ic); - self.add(nlist, pc + 1, groups); - groups[slot] = old; - } - Submatches => { - let old = groups[slot]; - groups[slot] = Some(self.ic); - self.add(nlist, pc + 1, groups); - groups[slot] = old; - } - Exists | Location => self.add(nlist, pc + 1, groups), - } - } - Jump(to) => { - nlist.add(pc, groups, true); - self.add(nlist, to, groups) - } - Split(x, y) => { - nlist.add(pc, groups, true); - self.add(nlist, x, groups); - self.add(nlist, y, groups); - } - Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { - nlist.add(pc, groups, false); - } - } - } - - // FIXME: For case insensitive comparisons, it uses the uppercase - // character and tests for equality. IIUC, this does not generalize to - // all of Unicode. I believe we need to check the entire fold for each - // character. This will be easy to add if and when it gets added to Rust's - // standard library. - #[inline] - fn char_eq(&self, casei: bool, textc: Option, regc: char) -> bool { - match textc { - None => false, - Some(textc) => { - regc == textc - || (casei && regc.to_uppercase() == textc.to_uppercase()) - } - } - } - - #[inline] - fn char_is(&self, textc: Option, regc: char) -> bool { - textc == Some(regc) - } -} - -/// CharReader is responsible for maintaining a "previous" and a "current" -/// character. This one-character lookahead is necessary for assertions that -/// look one character before or after the current position. -pub struct CharReader<'t> { - /// The previous character read. It is None only when processing the first - /// character of the input. - pub prev: Option, - /// The current character. - pub cur: Option, - input: &'t str, - next: uint, -} - -impl<'t> CharReader<'t> { - /// Returns a new CharReader that advances through the input given. - /// Note that a CharReader has no knowledge of the range in which to search - /// the input. - pub fn new(input: &'t str) -> CharReader<'t> { - CharReader { - prev: None, - cur: None, - input: input, - next: 0, - } - } - - /// Sets the previous and current character given any arbitrary byte - /// index (at a Unicode codepoint boundary). - #[inline] - pub fn set(&mut self, ic: uint) -> uint { - self.prev = None; - self.cur = None; - self.next = 0; - - if self.input.len() == 0 { - return 1 - } - if ic > 0 { - let i = cmp::min(ic, self.input.len()); - let prev = self.input.char_range_at_reverse(i); - self.prev = Some(prev.ch); - } - if ic < self.input.len() { - let cur = self.input.char_range_at(ic); - self.cur = Some(cur.ch); - self.next = cur.next; - self.next - } else { - self.input.len() + 1 - } - } - - /// Does the same as `set`, except it always advances to the next - /// character in the input (and therefore does half as many UTF8 decodings). - #[inline] - pub fn advance(&mut self) -> uint { - self.prev = self.cur; - if self.next < self.input.len() { - let cur = self.input.char_range_at(self.next); - self.cur = Some(cur.ch); - self.next = cur.next; - } else { - self.cur = None; - self.next = self.input.len() + 1; - } - self.next - } - - /// Returns true if and only if this is the beginning of the input - /// (ignoring the range of the input to search). - #[inline] - pub fn is_begin(&self) -> bool { self.prev.is_none() } - - /// Returns true if and only if this is the end of the input - /// (ignoring the range of the input to search). - #[inline] - pub fn is_end(&self) -> bool { self.cur.is_none() } - - /// Returns true if and only if the current position is a word boundary. - /// (Ignoring the range of the input to search.) - pub fn is_word_boundary(&self) -> bool { - if self.is_begin() { - return is_word(self.cur) - } - if self.is_end() { - return is_word(self.prev) - } - (is_word(self.cur) && !is_word(self.prev)) - || (is_word(self.prev) && !is_word(self.cur)) - } -} - -struct Thread { - pc: uint, - groups: Vec>, -} - -struct Threads { - which: MatchKind, - queue: Vec, - sparse: Vec, - size: uint, -} - -impl Threads { - // This is using a wicked neat trick to provide constant time lookup - // for threads in the queue using a sparse set. A queue of threads is - // allocated once with maximal size when the VM initializes and is reused - // throughout execution. That is, there should be zero allocation during - // the execution of a VM. - // - // See http://research.swtch.com/sparse for the deets. - fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads { - Threads { - which: which, - queue: range(0, num_insts).map(|_| { - Thread { pc: 0, groups: repeat(None).take(ncaps * 2).collect() } - }).collect(), - sparse: repeat(0u).take(num_insts).collect(), - size: 0, - } - } - - fn add(&mut self, pc: uint, groups: &[Option], empty: bool) { - let t = &mut self.queue[self.size]; - t.pc = pc; - match (empty, self.which) { - (_, Exists) | (true, _) => {}, - (false, Location) => { - t.groups[0] = groups[0]; - t.groups[1] = groups[1]; - } - (false, Submatches) => { - for (slot, val) in t.groups.iter_mut().zip(groups.iter()) { - *slot = *val; - } - } - } - self.sparse[pc] = self.size; - self.size += 1; - } - - #[inline] - fn contains(&self, pc: uint) -> bool { - let s = self.sparse[pc]; - s < self.size && self.queue[s].pc == pc - } - - #[inline] - fn empty(&mut self) { - self.size = 0; - } - - #[inline] - fn pc(&self, i: uint) -> uint { - self.queue[i].pc - } - - #[inline] - fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option] { - let q = &mut self.queue[i]; - q.groups.as_mut_slice() - } -} - -/// Returns true if the character is a word character, according to the -/// (Unicode friendly) Perl character class '\w'. -/// Note that this is only use for testing word boundaries. The actual '\w' -/// is encoded as a CharClass instruction. -pub fn is_word(c: Option) -> bool { - let c = match c { - None => return false, - Some(c) => c, - }; - // Try the common ASCII case before invoking binary search. - match c { - '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true, - _ => PERLW.binary_search_by(|&(start, end)| { - if c >= start && c <= end { - Equal - } else if start > c { - Greater - } else { - Less - } - }).is_ok() - } -} - -/// Given a character and a single character class range, return an ordering -/// indicating whether the character is less than the start of the range, -/// in the range (inclusive) or greater than the end of the range. -/// -/// If `casei` is `true`, then this ordering is computed case insensitively. -/// -/// This function is meant to be used with a binary search. -#[inline] -fn class_cmp(casei: bool, mut textc: char, - (mut start, mut end): (char, char)) -> Ordering { - if casei { - // FIXME: This is pretty ridiculous. All of this case conversion - // can be moved outside this function: - // 1) textc should be uppercased outside the bsearch. - // 2) the character class itself should be uppercased either in the - // parser or the compiler. - // FIXME: This is too simplistic for correct Unicode support. - // See also: char_eq - textc = textc.to_uppercase(); - start = start.to_uppercase(); - end = end.to_uppercase(); - } - if textc >= start && textc <= end { - Equal - } else if start > textc { - Greater - } else { - Less - } -} - -/// Returns the starting location of `needle` in `haystack`. -/// If `needle` is not in `haystack`, then `None` is returned. -/// -/// Note that this is using a naive substring algorithm. -#[inline] -pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option { - let (hlen, nlen) = (haystack.len(), needle.len()); - if nlen > hlen || nlen == 0 { - return None - } - for (offset, window) in haystack.windows(nlen).enumerate() { - if window == needle { - return Some(offset) - } - } - None -} diff --git a/src/librustc/lib.rs b/src/librustc/lib.rs index 377e5dd39ff..b961200f335 100644 --- a/src/librustc/lib.rs +++ b/src/librustc/lib.rs @@ -37,7 +37,6 @@ extern crate fmt_macros; extern crate getopts; extern crate graphviz; extern crate libc; -extern crate regex; extern crate rustc_llvm; extern crate rustc_back; extern crate serialize; diff --git a/src/librustc/session/mod.rs b/src/librustc/session/mod.rs index 4150335abc3..f90a60c9754 100644 --- a/src/librustc/session/mod.rs +++ b/src/librustc/session/mod.rs @@ -15,8 +15,6 @@ use metadata::filesearch; use session::search_paths::PathKind; use util::nodemap::NodeMap; -use regex::Regex; - use syntax::ast::NodeId; use syntax::codemap::Span; use syntax::diagnostic::{self, Emitter}; @@ -253,50 +251,54 @@ fn split_msg_into_multilines(msg: &str) -> Option { !msg.contains("structure constructor specifies a structure of type") { return None } - - let first = Regex::new(r"[( ]expected").unwrap(); - let second = Regex::new(r" found").unwrap(); - let third = Regex::new( - r"\((values differ|lifetime|cyclic type of infinite size)").unwrap(); + let first = msg.match_indices("expected").filter(|s| { + s.0 > 0 && (msg.char_at_reverse(s.0) == ' ' || + msg.char_at_reverse(s.0) == '(') + }).map(|(a, b)| (a - 1, b)); + let second = msg.match_indices("found").filter(|s| { + msg.char_at_reverse(s.0) == ' ' + }).map(|(a, b)| (a - 1, b)); let mut new_msg = String::new(); let mut head = 0u; // Insert `\n` before expected and found. - for (pos1, pos2) in first.find_iter(msg).zip( - second.find_iter(msg)) { + for (pos1, pos2) in first.zip(second) { new_msg = new_msg + - // A `(` may be preceded by a space and it should be trimmed - msg[head..pos1.0].trim_right() + // prefix - "\n" + // insert before first - &msg[pos1.0..pos1.1] + // insert what first matched - &msg[pos1.1..pos2.0] + // between matches - "\n " + // insert before second - // 123 - // `expected` is 3 char longer than `found`. To align the types, `found` gets - // 3 spaces prepended. - &msg[pos2.0..pos2.1]; // insert what second matched + // A `(` may be preceded by a space and it should be trimmed + msg[head..pos1.0].trim_right() + // prefix + "\n" + // insert before first + &msg[pos1.0..pos1.1] + // insert what first matched + &msg[pos1.1..pos2.0] + // between matches + "\n " + // insert before second + // 123 + // `expected` is 3 char longer than `found`. To align the types, + // `found` gets 3 spaces prepended. + &msg[pos2.0..pos2.1]; // insert what second matched head = pos2.1; } let mut tail = &msg[head..]; + let third = tail.find_str("(values differ") + .or(tail.find_str("(lifetime")) + .or(tail.find_str("(cyclic type of infinite size")); // Insert `\n` before any remaining messages which match. - for pos in third.find_iter(tail).take(1) { - // The end of the message may just be wrapped in `()` without `expected`/`found`. - // Push this also to a new line and add the final tail after. + if let Some(pos) = third { + // The end of the message may just be wrapped in `()` without + // `expected`/`found`. Push this also to a new line and add the + // final tail after. new_msg = new_msg + - // `(` is usually preceded by a space and should be trimmed. - tail[..pos.0].trim_right() + // prefix - "\n" + // insert before paren - &tail[pos.0..]; // append the tail + // `(` is usually preceded by a space and should be trimmed. + tail[..pos].trim_right() + // prefix + "\n" + // insert before paren + &tail[pos..]; // append the tail tail = ""; } new_msg.push_str(tail); - - return Some(new_msg) + return Some(new_msg); } pub fn build_session(sopts: config::Options, diff --git a/src/libtest/lib.rs b/src/libtest/lib.rs index 7226c6423b8..793483754ee 100644 --- a/src/libtest/lib.rs +++ b/src/libtest/lib.rs @@ -38,7 +38,6 @@ #![allow(unstable)] extern crate getopts; -extern crate regex; extern crate serialize; extern crate "serialize" as rustc_serialize; extern crate term; @@ -53,7 +52,6 @@ use self::OutputLocation::*; use stats::Stats; use getopts::{OptGroup, optflag, optopt}; -use regex::Regex; use serialize::Encodable; use term::Terminal; use term::color::{Color, RED, YELLOW, GREEN, CYAN}; @@ -279,7 +277,7 @@ pub enum ColorConfig { } pub struct TestOpts { - pub filter: Option, + pub filter: Option, pub run_ignored: bool, pub run_tests: bool, pub run_benchmarks: bool, @@ -365,11 +363,7 @@ pub fn parse_opts(args: &[String]) -> Option { if matches.opt_present("h") { usage(args[0].as_slice()); return None; } let filter = if matches.free.len() > 0 { - let s = matches.free[0].as_slice(); - match Regex::new(s) { - Ok(re) => Some(re), - Err(e) => return Some(Err(format!("could not parse /{}/: {:?}", s, e))) - } + Some(matches.free[0].clone()) } else { None }; @@ -833,9 +827,10 @@ pub fn filter_tests(opts: &TestOpts, tests: Vec) -> Vec filtered, - Some(ref re) => { - filtered.into_iter() - .filter(|test| re.is_match(test.desc.name.as_slice())).collect() + Some(ref filter) => { + filtered.into_iter().filter(|test| { + test.desc.name.as_slice().contains(&filter[]) + }).collect() } }; @@ -1230,16 +1225,6 @@ mod tests { assert!(res == TrFailed); } - #[test] - fn first_free_arg_should_be_a_filter() { - let args = vec!("progname".to_string(), "some_regex_filter".to_string()); - let opts = match parse_opts(args.as_slice()) { - Some(Ok(o)) => o, - _ => panic!("Malformed arg in first_free_arg_should_be_a_filter") - }; - assert!(opts.filter.expect("should've found filter").is_match("some_regex_filter")) - } - #[test] fn parse_ignored_flag() { let args = vec!("progname".to_string(), @@ -1336,37 +1321,6 @@ mod tests { } } - #[test] - pub fn filter_tests_regex() { - let mut opts = TestOpts::new(); - opts.filter = Some(::regex::Regex::new("a.*b.+c").unwrap()); - - let mut names = ["yes::abXc", "yes::aXXXbXXXXc", - "no::XYZ", "no::abc"]; - names.sort(); - - fn test_fn() {} - let tests = names.iter().map(|name| { - TestDescAndFn { - desc: TestDesc { - name: DynTestName(name.to_string()), - ignore: false, - should_fail: ShouldFail::No, - }, - testfn: DynTestFn(Thunk::new(test_fn)) - } - }).collect(); - let filtered = filter_tests(&opts, tests); - - let expected: Vec<&str> = - names.iter().map(|&s| s).filter(|name| name.starts_with("yes")).collect(); - - assert_eq!(filtered.len(), expected.len()); - for (test, expected_name) in filtered.iter().zip(expected.iter()) { - assert_eq!(test.desc.name.as_slice(), *expected_name); - } - } - #[test] pub fn test_metricmap_compare() { let mut m1 = MetricMap::new(); diff --git a/src/rustbook/book.rs b/src/rustbook/book.rs index 20346449fd1..3047e93137f 100644 --- a/src/rustbook/book.rs +++ b/src/rustbook/book.rs @@ -13,7 +13,6 @@ use std::io::BufferedReader; use std::iter; use std::iter::AdditiveIterator; -use regex::Regex; pub struct BookItem { pub title: String, @@ -94,8 +93,6 @@ pub fn parse_summary(input: R, src: &Path) -> Result[\t ]*)\*[:space:]*\[(?P.*)\]\((?P<path>.*)\)"; - let item_re = Regex::new(regex).unwrap(); let mut top_items = vec!(); let mut stack = vec!(); let mut errors = vec!(); @@ -117,45 +114,51 @@ pub fn parse_summary<R: Reader>(input: R, src: &Path) -> Result<Book, Vec<String } }; - item_re.captures(&line[]).map(|cap| { - let given_path = cap.name("path"); - let title = cap.name("title").unwrap().to_string(); + let star_idx = match line.find_str("*") { Some(i) => i, None => continue }; - let path_from_root = match src.join(given_path.unwrap()).path_relative_from(src) { - Some(p) => p, - None => { - errors.push(format!("paths in SUMMARY.md must be relative, \ - but path '{}' for section '{}' is not.", - given_path.unwrap(), title)); - Path::new("") - } - }; - let path_to_root = Path::new(iter::repeat("../") - .take(path_from_root.components().count() - 1) - .collect::<String>()); - let item = BookItem { - title: title, - path: path_from_root, - path_to_root: path_to_root, - children: vec!(), - }; - let level = cap.name("indent").unwrap().chars().map(|c| { - match c { - ' ' => 1us, - '\t' => 4, - _ => unreachable!() - } - }).sum() / 4 + 1; + let start_bracket = star_idx + line[star_idx..].find_str("[").unwrap(); + let end_bracket = start_bracket + line[start_bracket..].find_str("](").unwrap(); + let start_paren = end_bracket + 1; + let end_paren = start_paren + line[start_paren..].find_str(")").unwrap(); - if level > stack.len() + 1 { - errors.push(format!("section '{}' is indented too deeply; \ - found {}, expected {} or less", - item.title, level, stack.len() + 1)); - } else if level <= stack.len() { - collapse(&mut stack, &mut top_items, level); + let given_path = &line[start_paren + 1 .. end_paren]; + let title = line[start_bracket + 1..end_bracket].to_string(); + let indent = &line[..star_idx]; + + let path_from_root = match src.join(given_path).path_relative_from(src) { + Some(p) => p, + None => { + errors.push(format!("paths in SUMMARY.md must be relative, \ + but path '{}' for section '{}' is not.", + given_path, title)); + Path::new("") } - stack.push(item) - }); + }; + let path_to_root = Path::new(iter::repeat("../") + .take(path_from_root.components().count() - 1) + .collect::<String>()); + let item = BookItem { + title: title, + path: path_from_root, + path_to_root: path_to_root, + children: vec!(), + }; + let level = indent.chars().map(|c| { + match c { + ' ' => 1us, + '\t' => 4, + _ => unreachable!() + } + }).sum() / 4 + 1; + + if level > stack.len() + 1 { + errors.push(format!("section '{}' is indented too deeply; \ + found {}, expected {} or less", + item.title, level, stack.len() + 1)); + } else if level <= stack.len() { + collapse(&mut stack, &mut top_items, level); + } + stack.push(item) } if errors.is_empty() { diff --git a/src/rustbook/build.rs b/src/rustbook/build.rs index 50a6ad43aee..93601c0f61b 100644 --- a/src/rustbook/build.rs +++ b/src/rustbook/build.rs @@ -22,8 +22,6 @@ use book::{Book, BookItem}; use css; use javascript; -use regex::Regex; - use rustdoc; struct Build; @@ -81,9 +79,6 @@ fn render(book: &Book, tgt: &Path) -> CliResult<()> { let out_path = tgt.join(item.path.dirname()); - let regex = r"\[(?P<title>[^]]*)\]\((?P<url_stem>[^)]*)\.(?P<ext>md|markdown)\)"; - let md_urls = Regex::new(regex).unwrap(); - let src; if os::args().len() < 3 { src = os::getcwd().unwrap().clone(); @@ -94,7 +89,7 @@ fn render(book: &Book, tgt: &Path) -> CliResult<()> { let markdown_data = try!(File::open(&src.join(&item.path)).read_to_string()); let preprocessed_path = tmp.path().join(item.path.filename().unwrap()); { - let urls = md_urls.replace_all(&markdown_data[], "[$title]($url_stem.html)"); + let urls = markdown_data.replace(".md)", ".html)"); try!(File::create(&preprocessed_path) .write_str(&urls[])); } diff --git a/src/rustbook/main.rs b/src/rustbook/main.rs index ea72c653087..cbd29004097 100644 --- a/src/rustbook/main.rs +++ b/src/rustbook/main.rs @@ -11,8 +11,6 @@ #![feature(slicing_syntax, box_syntax)] #![allow(unstable)] -extern crate regex; - extern crate rustdoc; use std::os; diff --git a/src/test/bench/shootout-regex-dna.rs b/src/test/bench/shootout-regex-dna.rs deleted file mode 100644 index 074c0592312..00000000000 --- a/src/test/bench/shootout-regex-dna.rs +++ /dev/null @@ -1,126 +0,0 @@ -// The Computer Language Benchmarks Game -// http://benchmarksgame.alioth.debian.org/ -// -// contributed by the Rust Project Developers - -// Copyright (c) 2014 The Rust Project Developers -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in -// the documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of "The Computer Language Benchmarks Game" nor -// the name of "The Computer Language Shootout Benchmarks" nor the -// names of its contributors may be used to endorse or promote -// products derived from this software without specific prior -// written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -// OF THE POSSIBILITY OF SUCH DAMAGE. - -// ignore-stage1 -// ignore-cross-compile #12102 - -#![feature(box_syntax)] - -extern crate regex; - -use std::io; -use regex::{NoExpand, Regex}; -use std::sync::{Arc, Future}; - -macro_rules! regex { - ($e:expr) => (Regex::new($e).unwrap()) -} - -fn count_matches(seq: &str, variant: &Regex) -> int { - let mut n = 0; - for _ in variant.find_iter(seq) { - n += 1; - } - n -} - -fn main() { - let mut rdr = if std::os::getenv("RUST_BENCH").is_some() { - let fd = io::File::open(&Path::new("shootout-k-nucleotide.data")); - box io::BufferedReader::new(fd) as Box<io::Reader> - } else { - box io::stdin() as Box<io::Reader> - }; - let mut seq = rdr.read_to_string().unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand("")); - let seq_arc = Arc::new(seq.clone()); // copy before it moves - let clen = seq.len(); - - let mut seqlen = Future::spawn(move|| { - let substs = vec![ - (regex!("B"), "(c|g|t)"), - (regex!("D"), "(a|g|t)"), - (regex!("H"), "(a|c|t)"), - (regex!("K"), "(g|t)"), - (regex!("M"), "(a|c)"), - (regex!("N"), "(a|c|g|t)"), - (regex!("R"), "(a|g)"), - (regex!("S"), "(c|g)"), - (regex!("V"), "(a|c|g)"), - (regex!("W"), "(a|t)"), - (regex!("Y"), "(c|t)"), - ]; - let mut seq = seq; - for (re, replacement) in substs.into_iter() { - seq = re.replace_all(seq.as_slice(), NoExpand(replacement)); - } - seq.len() - }); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - let (mut variant_strs, mut counts) = (vec!(), vec!()); - for variant in variants.into_iter() { - let seq_arc_copy = seq_arc.clone(); - variant_strs.push(variant.to_string()); - counts.push(Future::spawn(move|| { - count_matches(seq_arc_copy.as_slice(), &variant) - })); - } - - for (i, variant) in variant_strs.iter().enumerate() { - println!("{} {}", variant, counts[i].get()); - } - println!(""); - println!("{}", ilen); - println!("{}", clen); - println!("{}", seqlen.get()); -} diff --git a/src/test/run-pass/rust-log-filter.rs b/src/test/run-pass/rust-log-filter.rs index 28d47f7aa9b..f7fa204d453 100644 --- a/src/test/run-pass/rust-log-filter.rs +++ b/src/test/run-pass/rust-log-filter.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -// exec-env:RUST_LOG=rust-log-filter/f.o +// exec-env:RUST_LOG=rust-log-filter/foo #![allow(unknown_features)] #![feature(box_syntax)] @@ -42,18 +42,14 @@ pub fn main() { let _t = Thread::spawn(move|| { log::set_logger(logger); - // our regex is "f.o" - // ensure it is a regex, and isn't anchored info!("foo"); info!("bar"); info!("foo bar"); info!("bar foo"); - info!("f1o"); }); assert_eq!(rx.recv().unwrap().as_slice(), "foo"); assert_eq!(rx.recv().unwrap().as_slice(), "foo bar"); assert_eq!(rx.recv().unwrap().as_slice(), "bar foo"); - assert_eq!(rx.recv().unwrap().as_slice(), "f1o"); assert!(rx.recv().is_err()); }