/* Source: mirror of https://github.com/n-hys/bash.git (618 lines, 14 KiB, C). */
/* strmatch.c -- ksh-like extended pattern matching for the shell and filename
   globbing. */
|
|
|
|
/* Copyright (C) 1991-2020 Free Software Foundation, Inc.

   This file is part of GNU Bash, the Bourne Again SHell.

   Bash is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   Bash is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
*/
|
|
|
|
#include <config.h>
|
|
|
|
#include <stdio.h> /* for debugging */
|
|
|
|
#include "strmatch.h"
|
|
#include <chartypes.h>
|
|
|
|
#include "bashansi.h"
|
|
#include "shmbutil.h"
|
|
#include "xmalloc.h"
|
|
|
|
#include <errno.h>
|
|
|
|
#if !defined (errno)
|
|
extern int errno;
|
|
#endif
|
|
|
|
#if FNMATCH_EQUIV_FALLBACK
|
|
/* We don't include <fnmatch.h> in order to avoid namespace collisions; the
|
|
internal strmatch still uses the FNM_ constants. */
|
|
extern int fnmatch (const char *, const char *, int);
|
|
#endif
|
|
|
|
/* First, compile `sm_loop.c' for single-byte characters.  These macros
   select the character types and constant spellings for that pass. */
#define CHAR    unsigned char   /* element type of pattern and string text */
#define U_CHAR  unsigned char   /* unsigned companion of CHAR */
#define XCHAR   char            /* plain character type */
#define INT     int             /* integer companion of CHAR */
#define L(CS)   CS              /* literals are used exactly as written */
#define INVALID -1              /* collsym() result for an unknown symbol */
|
|
|
|
/* Single-byte string equality helpers.  Any earlier definitions are dropped
   first.  Both compare the first characters before calling strcmp/strncmp. */
#undef STREQ
#undef STREQN
#define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0)
#define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0)
|
|
|
|
/* Initial value for glob_asciirange unless the build already supplied one. */
#ifndef GLOBASCII_DEFAULT
#  define GLOBASCII_DEFAULT 0
#endif

/* When non-zero (and collation is not being forced), charcmp/charcmp_wc
   order characters by their numeric value instead of calling
   strcoll/wcscoll. */
int glob_asciirange = GLOBASCII_DEFAULT;
|
|
|
|
#if FNMATCH_EQUIV_FALLBACK
|
|
/* Construct a string s1 = "S" and a pattern s2 = "[[=P=]]" and pass them
   to fnmatch to see if the single-byte characters S and P collate as
   members of the same equivalence class.  We can't really do this portably
   any other way.  The return value is fnmatch's own: 0 means S matched. */
static int
_fnmatch_fallback (s, p)
     int s, p;                  /* string char, patchar */
{
  char s1[2];                   /* string */
  char s2[8];                   /* constructed pattern */

  s1[0] = (unsigned char)s;
  s1[1] = '\0';

  /* reconstruct the pattern */
  s2[0] = s2[1] = '[';
  s2[2] = '=';
  s2[3] = (unsigned char)p;
  s2[4] = '=';
  s2[5] = s2[6] = ']';
  s2[7] = '\0';

  return (fnmatch ((const char *)s2, (const char *)s1, 0));
}
|
|
#endif
|
|
|
|
/* We use strcoll(3) for range comparisons in bracket expressions,
   even though it can have unwanted side effects in locales
   other than POSIX or US.  For instance, in the de locale, [A-Z] matches
   all characters.  If GLOB_ASCIIRANGE is non-zero, and we're not forcing
   the use of strcoll (e.g., for explicit collating symbols), we use
   straight ordering as if in the C locale. */
|
|
|
|
#if defined (HAVE_STRCOLL)
|
|
/* Helper functions for collating symbol equivalence. */
|
|
|
|
/* Return 0 if C1 == C2 or collates equally if FORCECOLL is non-zero. */
|
|
static int
|
|
charcmp (c1, c2, forcecoll)
|
|
int c1, c2;
|
|
int forcecoll;
|
|
{
|
|
static char s1[2] = { ' ', '\0' };
|
|
static char s2[2] = { ' ', '\0' };
|
|
int ret;
|
|
|
|
/* Eight bits only. Period. */
|
|
c1 &= 0xFF;
|
|
c2 &= 0xFF;
|
|
|
|
if (c1 == c2)
|
|
return (0);
|
|
|
|
if (forcecoll == 0 && glob_asciirange)
|
|
return (c1 - c2);
|
|
|
|
s1[0] = c1;
|
|
s2[0] = c2;
|
|
|
|
return (strcoll (s1, s2));
|
|
}
|
|
|
|
/* Ordering used for bracket-expression ranges.  Returns charcmp's result
   when that is non-zero; when charcmp reports a tie, falls back to C1 - C2
   so that distinct characters never compare equal (a total ordering). */
static int
rangecmp (c1, c2, forcecoll)
     int c1, c2;
     int forcecoll;
{
  int r;

  r = charcmp (c1, c2, forcecoll);

  /* We impose a total ordering here by returning c1-c2 if charcmp returns 0 */
  if (r != 0)
    return r;
  return (c1 - c2);             /* impose total ordering */
}
|
|
#else /* !HAVE_STRCOLL */
|
|
/* No strcoll(3): order by raw character value; the third argument is unused. */
# define rangecmp(c1, c2, f) ((int)(c1) - (int)(c2))
|
|
#endif /* !HAVE_STRCOLL */
|
|
|
|
#if defined (HAVE_STRCOLL)
|
|
/* Returns 1 if chars C and EQUIV collate equally in the current locale,
   0 otherwise.  charcmp is asked first with collation forced; if it reports
   a difference and FNMATCH_EQUIV_FALLBACK is enabled, the C library's
   fnmatch gets the final say via _fnmatch_fallback. */
static int
collequiv (c, equiv)
     int c, equiv;
{
  if (charcmp (c, equiv, 1) == 0)
    return 1;

#if FNMATCH_EQUIV_FALLBACK
  return (_fnmatch_fallback (c, equiv) == 0);
#else
  return 0;
#endif
}
|
|
#else
|
|
/* No strcoll(3): only identical characters count as equivalent. */
# define collequiv(c, equiv) ((c) == (equiv))
|
|
#endif
|
|
|
|
/* Parameterize `collsyms.h' for single-byte collating symbols.  collsym()
   walks the resulting `posix_collsyms' array of `struct _collsym' entries.
   (The header itself is not visible here; presumably it uses these three
   names to declare the structure and the table.) */
#define _COLLSYM        _collsym
#define __COLLSYM       __collsym
#define POSIXCOLL       posix_collsyms
#include "collsyms.h"
|
|
|
|
static int
|
|
collsym (s, len)
|
|
CHAR *s;
|
|
int len;
|
|
{
|
|
register struct _collsym *csp;
|
|
char *x;
|
|
|
|
x = (char *)s;
|
|
for (csp = posix_collsyms; csp->name; csp++)
|
|
{
|
|
if (STREQN(csp->name, x, len) && csp->name[len] == '\0')
|
|
return (csp->code);
|
|
}
|
|
if (len == 1)
|
|
return s[0];
|
|
return INVALID;
|
|
}
|
|
|
|
/* unibyte character classification */
/* Fallback isascii for systems that provide neither a macro nor (according
   to HAVE_ISASCII) a function: true for values 0 through 0177 inclusive. */
#if !defined (isascii) && !defined (HAVE_ISASCII)
#  define isascii(c)    ((unsigned int)(c) <= 0177)
#endif
|
|
|
|
/* Identifiers for the character classes understood by the single-byte
   matcher.  The numeric values double as indices into cclass_name[], so the
   two lists must be kept in the same order.  CC_NO_CLASS (0) means
   "not a recognized class name". */
enum char_class
  {
    CC_NO_CLASS = 0,
    CC_ASCII, CC_ALNUM, CC_ALPHA, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
    CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_WORD, CC_XDIGIT
  };
|
|
|
|
/* Class names, indexed by enum char_class value.  Slot 0 (CC_NO_CLASS) is
   an empty placeholder so that a real class never has index 0. */
static char const *const cclass_name[] =
  {
    "",
    "ascii", "alnum", "alpha", "blank", "cntrl", "digit", "graph",
    "lower", "print", "punct", "space", "upper", "word", "xdigit"
  };

/* Number of entries in cclass_name[], including the placeholder. */
#define N_CHAR_CLASS (sizeof(cclass_name) / sizeof (cclass_name[0]))
|
|
|
|
static enum char_class
|
|
is_valid_cclass (name)
|
|
const char *name;
|
|
{
|
|
enum char_class ret;
|
|
int i;
|
|
|
|
ret = CC_NO_CLASS;
|
|
|
|
for (i = 1; i < N_CHAR_CLASS; i++)
|
|
{
|
|
if (STREQ (name, cclass_name[i]))
|
|
{
|
|
ret = (enum char_class)i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int
|
|
cclass_test (c, char_class)
|
|
int c;
|
|
enum char_class char_class;
|
|
{
|
|
int result;
|
|
|
|
switch (char_class)
|
|
{
|
|
case CC_ASCII:
|
|
result = isascii (c);
|
|
break;
|
|
case CC_ALNUM:
|
|
result = ISALNUM (c);
|
|
break;
|
|
case CC_ALPHA:
|
|
result = ISALPHA (c);
|
|
break;
|
|
case CC_BLANK:
|
|
result = ISBLANK (c);
|
|
break;
|
|
case CC_CNTRL:
|
|
result = ISCNTRL (c);
|
|
break;
|
|
case CC_DIGIT:
|
|
result = ISDIGIT (c);
|
|
break;
|
|
case CC_GRAPH:
|
|
result = ISGRAPH (c);
|
|
break;
|
|
case CC_LOWER:
|
|
result = ISLOWER (c);
|
|
break;
|
|
case CC_PRINT:
|
|
result = ISPRINT (c);
|
|
break;
|
|
case CC_PUNCT:
|
|
result = ISPUNCT (c);
|
|
break;
|
|
case CC_SPACE:
|
|
result = ISSPACE (c);
|
|
break;
|
|
case CC_UPPER:
|
|
result = ISUPPER (c);
|
|
break;
|
|
case CC_WORD:
|
|
result = (ISALNUM (c) || c == '_');
|
|
break;
|
|
case CC_XDIGIT:
|
|
result = ISXDIGIT (c);
|
|
break;
|
|
default:
|
|
result = -1;
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static int
|
|
is_cclass (c, name)
|
|
int c;
|
|
const char *name;
|
|
{
|
|
enum char_class char_class;
|
|
int result;
|
|
|
|
char_class = is_valid_cclass (name);
|
|
if (char_class == CC_NO_CLASS)
|
|
return -1;
|
|
|
|
result = cclass_test (c, char_class);
|
|
return (result);
|
|
}
|
|
|
|
/* Now include `sm_loop.c' for single-byte characters. */
|
|
/* The result of FOLD is an `unsigned char'.  C is narrowed to unsigned char
   and, when FNM_CASEFOLD is set in the `flags' variable visible where the
   macro is used, lowercased with TOLOWER.  The argument is parenthesized
   inside the casts so that an expression argument is converted as a whole
   rather than having the cast bind to its first operand only. */
# define FOLD(c) ((flags & FNM_CASEFOLD) \
        ? TOLOWER ((unsigned char)(c)) \
        : ((unsigned char)(c)))
|
|
|
|
/* Parameters for the single-byte instantiation of `sm_loop.c'.  The first
   group names the functions and the structure that instantiation defines
   (internal_strmatch is the entry point xstrmatch calls); the second group
   maps the generic string primitives to their char versions; the third
   hooks in the comparison and classification helpers defined in this file.
   (sm_loop.c is not visible here -- confirm the exact use of each name
   there.) */
#define FCT                     internal_strmatch
#define GMATCH                  gmatch
#define COLLSYM                 collsym
#define PARSE_COLLSYM           parse_collsym
#define BRACKMATCH              brackmatch
#define PATSCAN                 glob_patscan
#define STRCOMPARE              strcompare
#define EXTMATCH                extmatch
#define DEQUOTE_PATHNAME        udequote_pathname
#define STRUCT                  smat_struct

#define STRCHR(S, C)            strchr((S), (C))
#define MEMCHR(S, C, N)         memchr((S), (C), (N))
#define STRCOLL(S1, S2)         strcoll((S1), (S2))
#define STRLEN(S)               strlen(S)
#define STRCMP(S1, S2)          strcmp((S1), (S2))

#define RANGECMP(C1, C2, F)     rangecmp((C1), (C2), (F))
#define COLLEQUIV(C1, C2)       collequiv((C1), (C2))
#define CTYPE_T                 enum char_class
#define IS_CCLASS(C, S)         is_cclass((C), (S))
#include "sm_loop.c"
|
|
|
|
#if HANDLE_MULTIBYTE
|
|
|
|
/* Character types and constant spellings for the wide-character pass over
   `sm_loop.c'.  These reuse the names defined for the single-byte pass;
   presumably sm_loop.c #undefs them when it finishes -- confirm there. */
# define CHAR           wchar_t         /* element type of pattern/string text */
# define U_CHAR         wint_t          /* companion type able to hold WEOF */
# define XCHAR          wchar_t         /* plain wide character type */
# define INT            wint_t          /* integer companion of CHAR */
# define L(CS)          L##CS           /* turn a literal into a wide literal */
# define INVALID        WEOF            /* collwcsym() result for an unknown symbol */
|
|
|
|
/* Wide-character string equality helpers, replacing the single-byte
   definitions.  STREQ relies on wcscmp alone; STREQN checks the first
   characters before calling wcsncmp. */
# undef STREQ
# undef STREQN
# define STREQ(s1, s2) ((wcscmp (s1, s2) == 0))
# define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0)
|
|
|
|
extern char *mbsmbchar PARAMS((const char *));
|
|
|
|
#if FNMATCH_EQUIV_FALLBACK
|
|
/* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them
   to fnmatch to see if wide characters c1 and c2 collate as members of the
   same equivalence class.  We can't really do this portably any other way.
   Both characters are first converted to multibyte form with wctomb.
   Returns 2 if either conversion fails, otherwise fnmatch's own result
   (0 means c1 matched). */
static int
_fnmatch_fallback_wc (c1, c2)
     wchar_t c1, c2;            /* string char, patchar */
{
  char w1[MB_LEN_MAX+1];        /* string */
  char w2[MB_LEN_MAX+8];        /* constructed pattern */
  int l1, l2;

  l1 = wctomb (w1, c1);
  if (l1 == -1)
    return (2);
  w1[l1] = '\0';

  /* reconstruct the pattern: "[[=" + multibyte c2 + "=]]" */
  w2[0] = w2[1] = '[';
  w2[2] = '=';
  l2 = wctomb (w2+3, c2);
  if (l2 == -1)
    return (2);
  w2[l2+3] = '=';
  w2[l2+4] = w2[l2+5] = ']';
  w2[l2+6] = '\0';

  return (fnmatch ((const char *)w2, (const char *)w1, 0));
}
|
|
#endif
|
|
|
|
static int
|
|
charcmp_wc (c1, c2, forcecoll)
|
|
wint_t c1, c2;
|
|
int forcecoll;
|
|
{
|
|
static wchar_t s1[2] = { L' ', L'\0' };
|
|
static wchar_t s2[2] = { L' ', L'\0' };
|
|
int r;
|
|
|
|
if (c1 == c2)
|
|
return 0;
|
|
|
|
if (forcecoll == 0 && glob_asciirange && c1 <= UCHAR_MAX && c2 <= UCHAR_MAX)
|
|
return ((int)(c1 - c2));
|
|
|
|
s1[0] = c1;
|
|
s2[0] = c2;
|
|
|
|
return (wcscoll (s1, s2));
|
|
}
|
|
|
|
/* Ordering used for wide-character bracket-expression ranges.  Returns
   charcmp_wc's result when that is non-zero.  On a tie, returns C1 - C2 to
   impose a total ordering, but only when FORCECOLL is zero; with FORCECOLL
   set a collation tie is reported as 0.
   NOTE(review): the single-byte rangecmp applies the C1 - C2 tie-break
   regardless of FORCECOLL -- confirm the difference is intended. */
static int
rangecmp_wc (c1, c2, forcecoll)
     wint_t c1, c2;
     int forcecoll;
{
  int r;

  r = charcmp_wc (c1, c2, forcecoll);

  if (r != 0 || forcecoll)
    return r;
  return ((int)(c1 - c2));      /* impose total ordering */
}
|
|
|
|
/* Returns 1 if wide chars C and EQUIV collate equally in the current locale,
   0 otherwise.  charcmp_wc is asked first with collation forced; if it
   reports a difference and FNMATCH_EQUIV_FALLBACK is enabled, the C
   library's fnmatch gets the final say via _fnmatch_fallback_wc. */
static int
collequiv_wc (c, equiv)
     wint_t c, equiv;
{
#if FNMATCH_EQUIV_FALLBACK
  wchar_t s, p;                 /* only needed by the fallback path */
#endif

  if (charcmp_wc (c, equiv, 1) == 0)
    return 1;

#if FNMATCH_EQUIV_FALLBACK
  /* We check explicitly for success (fnmatch returns 0) to avoid problems if
     our local definition of FNM_NOMATCH (strmatch.h) doesn't match the
     system's (fnmatch.h).  We don't care about error return values here. */

  /* Go through wchar_t variables so the arguments have exactly the type the
     old-style definition of _fnmatch_fallback_wc expects. */
  s = c;
  p = equiv;
  return (_fnmatch_fallback_wc (s, p) == 0);
#else
  return 0;
#endif
}
|
|
|
|
/* Helper function for collating symbol. */
/* Parameterize `collsyms.h' a second time for wide characters.  collwcsym()
   walks the resulting `posix_collwcsyms' array of `struct _collwcsym'
   entries.  (The header is not visible here; presumably it #undefs these
   names after each inclusion -- confirm there.) */
# define _COLLSYM       _collwcsym
# define __COLLSYM      __collwcsym
# define POSIXCOLL      posix_collwcsyms
# include "collsyms.h"
|
|
|
|
static wint_t
|
|
collwcsym (s, len)
|
|
wchar_t *s;
|
|
int len;
|
|
{
|
|
register struct _collwcsym *csp;
|
|
|
|
for (csp = posix_collwcsyms; csp->name; csp++)
|
|
{
|
|
if (STREQN(csp->name, s, len) && csp->name[len] == L'\0')
|
|
return (csp->code);
|
|
}
|
|
if (len == 1)
|
|
return s[0];
|
|
return INVALID;
|
|
}
|
|
|
|
/* Test whether wide character WC belongs to the character class called NAME.
   Returns -1 if the class is unknown to wctype(), if NAME cannot be
   converted to a multibyte string, or if memory cannot be allocated;
   otherwise non-zero for a member and 0 for a non-member.

   Two names get special treatment:
     "ascii"  if wctype() does not know it, WC is a member when wctob() maps
              it to a single byte no greater than 0x7F;
     "word"   is tested as "alnum" plus the underscore. */
static int
is_wcclass (wc, name)
     wint_t wc;
     wchar_t *name;
{
  const wchar_t *src;
  char *mbs;
  mbstate_t state;
  size_t mbssize, mbslength;
  wctype_t desc;
  int want_word;

  if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0))
    {
      int c;

      if ((c = wctob (wc)) == EOF)
        return 0;
      else
        return (c <= 0x7F);
    }

  want_word = (wcscmp (name, L"word") == 0);
  if (want_word)
    name = L"alnum";

  /* wctype() wants a multibyte name, so convert NAME into a scratch buffer
     sized for the worst case.  wcsrtombs advances its source pointer, hence
     the separate SRC. */
  src = name;
  mbssize = wcslen (name) * MB_CUR_MAX + 1;

  memset (&state, '\0', sizeof (mbstate_t));
  mbs = (char *) malloc (mbssize);
  if (mbs == 0)
    return -1;
  mbslength = wcsrtombs (mbs, &src, mbssize, &state);

  if (mbslength == (size_t)-1 || mbslength == (size_t)-2)
    {
      free (mbs);
      return -1;
    }
  desc = wctype (mbs);
  free (mbs);

  if (desc == (wctype_t)0)
    return -1;

  if (want_word)
    return (iswctype (wc, desc) || wc == L'_');
  else
    return (iswctype (wc, desc));
}
|
|
|
|
/* Return 1 if there are no char class [:class:] expressions (degenerate case)
|
|
or only posix-specified (C locale supported) char class expressions in
|
|
PATTERN. These are the ones where it's safe to punt to the single-byte
|
|
code, since wide character support allows locale-defined char classes.
|
|
This only uses single-byte code, but is only needed to support multibyte
|
|
locales. */
|
|
static int
|
|
posix_cclass_only (pattern)
|
|
char *pattern;
|
|
{
|
|
char *p, *p1;
|
|
char cc[16]; /* sufficient for all valid posix char class names */
|
|
enum char_class valid;
|
|
|
|
p = pattern;
|
|
while (p = strchr (p, '['))
|
|
{
|
|
if (p[1] != ':')
|
|
{
|
|
p++;
|
|
continue;
|
|
}
|
|
p += 2; /* skip past "[:" */
|
|
/* Find end of char class expression */
|
|
for (p1 = p; *p1; p1++)
|
|
if (*p1 == ':' && p1[1] == ']')
|
|
break;
|
|
if (*p1 == 0) /* no char class expression found */
|
|
break;
|
|
/* Find char class name and validate it against posix char classes */
|
|
if ((p1 - p) >= sizeof (cc))
|
|
return 0;
|
|
bcopy (p, cc, p1 - p);
|
|
cc[p1 - p] = '\0';
|
|
valid = is_valid_cclass (cc);
|
|
if (valid == CC_NO_CLASS)
|
|
return 0; /* found unrecognized char class name */
|
|
|
|
p = p1 + 2; /* found posix char class name */
|
|
}
|
|
|
|
return 1; /* no char class names or only posix */
|
|
}
|
|
|
|
/* Now include `sm_loop.c' for multibyte characters. */
|
|
/* Wide-character FOLD: yields towlower (C) only when FNM_CASEFOLD is set in
   `flags' and iswupper (C) is true; otherwise yields C unchanged. */
#define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c))
|
|
/* Parameters for the wide-character instantiation of `sm_loop.c'.  The first
   group names the functions and the structure that instantiation defines
   (internal_wstrmatch is the entry point xstrmatch calls); the second group
   maps the generic string primitives to their wchar_t versions; the third
   hooks in the wide comparison and classification helpers defined in this
   file.  (sm_loop.c is not visible here -- confirm the exact use of each
   name there.) */
#define FCT                     internal_wstrmatch
#define GMATCH                  gmatch_wc
#define COLLSYM                 collwcsym
#define PARSE_COLLSYM           parse_collwcsym
#define BRACKMATCH              brackmatch_wc
#define PATSCAN                 glob_patscan_wc
#define STRCOMPARE              wscompare
#define EXTMATCH                extmatch_wc
#define DEQUOTE_PATHNAME        wcdequote_pathname
#define STRUCT                  wcsmat_struct

#define STRCHR(S, C)            wcschr((S), (C))
#define MEMCHR(S, C, N)         wmemchr((S), (C), (N))
#define STRCOLL(S1, S2)         wcscoll((S1), (S2))
#define STRLEN(S)               wcslen(S)
#define STRCMP(S1, S2)          wcscmp((S1), (S2))

#define RANGECMP(C1, C2, F)     rangecmp_wc((C1), (C2), (F))
#define COLLEQUIV(C1, C2)       collequiv_wc((C1), (C2))
#define CTYPE_T                 enum char_class
#define IS_CCLASS(C, S)         is_wcclass((C), (S))
#include "sm_loop.c"
|
|
|
|
#endif /* HANDLE_MULTIBYTE */
|
|
|
|
/* Match STRING against PATTERN under FLAGS and return the matcher's result.

   Without HANDLE_MULTIBYTE this is just internal_strmatch.  With it, the
   single-byte matcher is still used when
     - MB_CUR_MAX is 1, or
     - mbsmbchar returns 0 for both STRING and PATTERN (presumably meaning
       neither holds a multibyte character) and PATTERN uses only posix
       character class names, or
     - either argument cannot be converted to a wide string by xdupmbstowcs.
   Otherwise both arguments are converted, internal_wstrmatch is run on the
   wide copies, and the copies are freed. */
int
xstrmatch (pattern, string, flags)
     char *pattern;
     char *string;
     int flags;
{
#if HANDLE_MULTIBYTE
  int ret;
  size_t n;
  wchar_t *wpattern, *wstring;

  if (MB_CUR_MAX == 1)
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  if (mbsmbchar (string) == 0 && mbsmbchar (pattern) == 0 && posix_cclass_only (pattern))
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  n = xdupmbstowcs (&wpattern, NULL, pattern);
  if (n == (size_t)-1 || n == (size_t)-2)
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  n = xdupmbstowcs (&wstring, NULL, string);
  if (n == (size_t)-1 || n == (size_t)-2)
    {
      /* The pattern was converted but the string was not: release the
         pattern copy before falling back. */
      free (wpattern);
      return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
    }

  ret = internal_wstrmatch (wpattern, wstring, flags);

  free (wpattern);
  free (wstring);

  return ret;
#else
  return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
#endif /* !HANDLE_MULTIBYTE */
}
|