Commit f59e4a25 authored by Botond Baranyi
Browse files

implemented the '@nocase' modifier (artf699765, artf724076)



Change-Id: I6b9396893c7a4b4ad01800748b6cb79851f79bde
Signed-off-by: Botond Baranyi <botond.baranyi@ericsson.com>
parent 36ef98be
......@@ -35,7 +35,8 @@ GENERATED_SOURCES := pattern_la.cc pattern_p.cc pattern_uni.cc config_preproc_la
# Sources in the CVS
STATIC_SOURCES := memory.c new.cc userinfo.c path.c config_preproc.cc Quadruple.cc Path2.cc ModuleVersion.cc JSON_Tokenizer.cc
STATIC_SOURCES := memory.c new.cc userinfo.c path.c config_preproc.cc Quadruple.cc \
Path2.cc ModuleVersion.cc JSON_Tokenizer.cc UnicharPattern.cc
ifndef MINGW
STATIC_SOURCES += NetworkHandler.cc
......
......@@ -7,6 +7,7 @@
*
* Contributors:
* Balasko, Jeno
* Baranyi, Botond
* Raduly, Csaba
* Zalanyi, Balazs Andor
*
......@@ -64,6 +65,13 @@ void Quad::set(unsigned char group, unsigned char plane, unsigned char row,
u.comp.cell = cell;
}
void Quad::set_hexrepr(const char* hex_repr) {
  // hex_repr is read as 8 characters: each consecutive pair (taken as offsets
  // from 'A') supplies the upper and lower 4 bits of one component, in the
  // order group, plane, row, cell
  int components[4];
  for (int idx = 0; idx < 4; ++idx) {
    const int high_half = hex_repr[2 * idx] - 'A';
    const int low_half = hex_repr[2 * idx + 1] - 'A';
    components[idx] = (high_half << 4) + low_half;
  }
  u.comp.group = components[0];
  u.comp.plane = components[1];
  u.comp.row = components[2];
  u.comp.cell = components[3];
}
const Quad Quad::operator-(const Quad& rhs) const {
  // subtract the raw values of the two quadruples and wrap the difference
  // in a new Quad
  Quad difference(u.value - rhs.u.value);
  return difference;
}
......
......@@ -7,6 +7,7 @@
*
* Contributors:
* Balasko, Jeno
* Baranyi, Botond
* Raduly, Csaba
* Zalanyi, Balazs Andor
*
......@@ -72,6 +73,8 @@ public:
* @param c Value to set.
*/
void set(int field, unsigned char c);
void set_hexrepr(const char* hex_repr);
const Quad operator-(const Quad& rhs) const;
const Quad& operator=(const Quad& rhs);
......
/******************************************************************************
* Copyright (c) 2000-2016 Ericsson Telecom AB
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Baranyi, Botond – initial implementation
*
******************************************************************************/
#include "UnicharPattern.hh"
#include "pattern.hh"
#include "memory.h"
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
//////////////////////////////////////////////
//////////// the global instance /////////////
//////////////////////////////////////////////
// Defining the object here runs the UnicharPattern constructor, which tries to
// read the case mappings from the file 'etc/CaseFolding.txt' under the
// directory named by the environment variable TTCN3_DIR.
UnicharPattern unichar_pattern;
//////////////////////////////////////////////
////////////// helper functions //////////////
//////////////////////////////////////////////
/** Removes whitespace from the beginning and end of the input string.
  *
  * Trailing whitespace is overwritten with '\0' in place; leading whitespace
  * is skipped by returning a pointer into the same buffer.
  *
  * @param str null-terminated, writable string (may be NULL)
  * @return pointer to the first non-whitespace character inside \a str (or to
  * its terminating '\0' if the string is empty or contains only whitespace),
  * NULL if \a str is NULL */
static char* remove_spaces(char* str)
{
  if (str == NULL) {
    return NULL;
  }
  // the argument of isspace() must be representable as unsigned char,
  // otherwise characters with negative 'char' values cause undefined behavior
  while (isspace(static_cast<unsigned char>(*str))) {
    ++str;
  }
  // trim from the end, but never step in front of the (already advanced)
  // start of the string; this also covers empty and all-whitespace inputs
  size_t len = strlen(str);
  while (len > 0 && isspace(static_cast<unsigned char>(str[len - 1]))) {
    --len;
    str[len] = '\0';
  }
  return str;
}
/** Exception class
  *
  * Signals that a character handed to hexchar_to_char (directly or through
  * hexstr_to_char) is not a hexadecimal digit */
class NotHexException {};

/** Returns the numeric value (0-15) of a hexadecimal digit; both uppercase and
  * lowercase letters are accepted.
  * @throw NotHexException if the character is not a hexadecimal digit */
static unsigned char hexchar_to_char(const char c)
{
  int digit_value;
  if ('0' <= c && c <= '9') {
    digit_value = c - '0';
  }
  else if ('A' <= c && c <= 'F') {
    digit_value = 10 + (c - 'A');
  }
  else if ('a' <= c && c <= 'f') {
    digit_value = 10 + (c - 'a');
  }
  else {
    throw NotHexException();
  }
  return static_cast<unsigned char>(digit_value);
}
/** Combines the first two characters of the input, interpreted as hexadecimal
  * digits, into the byte they spell out (first digit is the upper 4 bits).
  * Any exception thrown by hexchar_to_char for an invalid digit passes through. */
static unsigned char hexstr_to_char(const char* hex_str)
{
  const unsigned char upper_digit = hexchar_to_char(hex_str[0]);
  const unsigned char lower_digit = hexchar_to_char(hex_str[1]);
  return (upper_digit << 4) | lower_digit;
}
//////////////////////////////////////////////
// member functions of class UnicharPattern //
//////////////////////////////////////////////
/** Reads the case mappings from the file 'etc/CaseFolding.txt' located under
  * the directory named by the environment variable TTCN3_DIR and stores them
  * in the linked list starting at 'mappings_head'.
  *
  * Each useful line of the file has the form "<code>; <status>; <mapping>; ...",
  * where the codes are 4 or 5 hexadecimal digits and everything after a '#'
  * is a comment. Only the lines with status 'C' or 'S' are stored.
  *
  * If the environment variable is missing, the file cannot be opened or its
  * format is invalid, a warning is displayed and the list is left empty. */
UnicharPattern::UnicharPattern() : mappings_head(NULL)
{
  // if anything goes wrong while parsing the case mappings file, just delete the
  // partial results, display a warning, and treat all patterns as case-sensitive
  const char* ttcn3_dir = getenv("TTCN3_DIR");
  if (ttcn3_dir == NULL) {
    TTCN_pattern_warning("Environment variable TTCN3_DIR not present. "
      "Case-insensitive universal charstring patterns are disabled.\n");
    return;
  }
  size_t ttcn3_dir_len = strlen(ttcn3_dir);
  bool ends_with_slash = ttcn3_dir_len > 0 && ttcn3_dir[ttcn3_dir_len - 1] == '/';
  char* mappings_file = mprintf("%s%setc/CaseFolding.txt", ttcn3_dir,
    ends_with_slash ? "" : "/");
  FILE* fp = fopen(mappings_file, "r");
  if (fp == NULL) {
    TTCN_pattern_warning("Cannot open file '%s' for reading. "
      "Case-insensitive universal charstring patterns are disabled.\n", mappings_file);
    Free(mappings_file);
    return;
  }
  // the file name was only needed for opening the file and for the warning
  Free(mappings_file);

  // this always points to the last element of the list
  mapping_t* mappings_tail = NULL;

  // read one line at a time
  char line[1024];
  while (fgets(line, sizeof(line), fp) != NULL) {
    // ignore everything after the '#' (this is the 'comment' indicator)
    char* line_end = strchr(line, '#');
    if (line_end != NULL) {
      *line_end = '\0';
    }

    // each column ends with a ';', use that as the separator for strtok
    char* from_str = remove_spaces(strtok(line, ";"));
    size_t from_str_len = from_str != NULL ? strlen(from_str) : 0;
    if (from_str_len == 0) {
      // nothing but comments and spaces in this line
      continue;
    }
    // all character codes are 4 or 5 digits long
    if (from_str_len < 4 || from_str_len > 5) {
      TTCN_pattern_warning("Invalid format of case folding file (code column). "
        "Case-insensitive universal charstring patterns are disabled.\n");
      fclose(fp);
      clean_up();
      return;
    }

    char* status = remove_spaces(strtok(NULL, ";"));
    // the status is one character long
    if (status == NULL || strlen(status) != 1) {
      TTCN_pattern_warning("Invalid format of case folding file (status column). "
        "Case-insensitive universal charstring patterns are disabled.\n");
      fclose(fp);
      clean_up();
      return;
    }
    else if (status[0] != 'C' && status[0] != 'S') {
      // only use the lines with statuses 'C' and 'S', ignore the rest
      continue;
    }

    char* to_str = remove_spaces(strtok(NULL, ";"));
    size_t to_str_len = to_str != NULL ? strlen(to_str) : 0;
    if (to_str_len < 4 || to_str_len > 5) {
      TTCN_pattern_warning("Invalid format of case folding file (mapping column). "
        "Case-insensitive universal charstring patterns are disabled.\n");
      fclose(fp);
      clean_up();
      return;
    }

    // create the new element (it is already linked into the list, so clean_up()
    // also deletes it if the conversion below fails)
    if (mappings_tail == NULL) {
      mappings_head = new mapping_t;
      mappings_tail = mappings_head;
    }
    else {
      mappings_tail->next = new mapping_t;
      mappings_tail = mappings_tail->next;
    }
    mappings_tail->next = NULL;

    // try to convert the extracted tokens to their character codes;
    // in a 5 digit code the first digit is the value of the 'plane' byte, so it
    // has to be converted from its hexadecimal digit form, too (the last 4
    // digits are the 'row' and 'cell' bytes)
    try {
      mappings_tail->from.set(0,
        from_str_len == 5 ? hexchar_to_char(from_str[0]) : 0,
        hexstr_to_char(from_str + from_str_len - 4),
        hexstr_to_char(from_str + from_str_len - 2));
      mappings_tail->to.set(0,
        to_str_len == 5 ? hexchar_to_char(to_str[0]) : 0,
        hexstr_to_char(to_str + to_str_len - 4),
        hexstr_to_char(to_str + to_str_len - 2));
    }
    catch (const NotHexException&) {
      // one of the tokens contained a non-hex character
      TTCN_pattern_warning("Invalid format of case folding file (character code). "
        "Case-insensitive universal charstring patterns are disabled.\n");
      fclose(fp);
      clean_up();
      return;
    }
  }
  fclose(fp);
}
void UnicharPattern::clean_up()
{
while (mappings_head != NULL) {
mapping_t* temp = mappings_head;
mappings_head = mappings_head->next;
delete temp;
}
}
UnicharPattern::mapping_t* UnicharPattern::find_mapping(const Quad& q) const
{
  // linear search: the first element whose 'from' character equals q is returned
  for (mapping_t* node = mappings_head; node != NULL; node = node->next) {
    if (node->from == q) {
      return node;
    }
  }
  // no mapping is stored for this character
  return NULL;
}
Quad UnicharPattern::convert_quad_to_lowercase(const Quad& q) const
{
  mapping_t* found = find_mapping(q);
  if (found == NULL) {
    // characters without a stored mapping are returned unchanged
    return q;
  }
  return found->to;
}
void UnicharPattern::convert_regex_str_to_lowercase(char* str) const
{
if (mappings_head != NULL) {
size_t len = strlen(str) / 8;
for (size_t i = 0; i < len; ++i) {
// the class 'Quad' contains the logic to convert to and from regex strings
Quad q;
q.set_hexrepr(str + 8 * i);
mapping_t* mapping = find_mapping(q);
if (mapping != NULL) {
// this call actually saves the specified Quad's regex string to the
// specified location in the string
Quad::get_hexrepr(mapping->to, str + 8 * i);
}
}
}
}
/******************************************************************************
* Copyright (c) 2000-2016 Ericsson Telecom AB
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Baranyi, Botond – initial implementation
*
******************************************************************************/
#ifndef UNICHARPATTERN_HH
#define UNICHARPATTERN_HH
#include "Quadruple.hh"
/** Helper class for handling case-insensitive universal charstring patterns
  * (this includes all patterns used in universal charstring templates and
  * universal charstring subtypes, and the universal charstring version of
  * the predefined function 'regexp', as long as they have the '@nocase' modifier)
  *
  * Only one (global) instance of this class is created, which is used to convert
  * the uppercase letters in patterns and the strings matched by the patterns
  * to lowercase.
  *
  * The instance is initialized with a table at its construction, which contains
  * the case mappings of Unicode characters (read from the file CaseFolding.txt,
  * from the official Unicode site).
  *
  * This class does simple case foldings (from the folding types described in
  * CaseFolding.txt), so only the mappings with statuses 'C' and 'S' are used.
  *
  * The class owns the elements of its mapping list (they are deleted by the
  * destructor), therefore instances cannot be copied. */
class UnicharPattern {
  /** structure containing one character's mapping (linked list) */
  struct mapping_t {
    /** character mapped from (uppercase letter) */
    Quad from;
    /** character mapped to (lowercase letter) */
    Quad to;
    /** pointer to the next element in the list */
    mapping_t* next;
  };

  /** pointer to the head of the linked list of mappings */
  mapping_t* mappings_head;

  /** deletes the mappings list */
  void clean_up();

  /** finds and returns the mapping list element with the 'from' character
    * equivalent to the parameter */
  mapping_t* find_mapping(const Quad& q) const;

  /** copy constructor - declared, but not implemented: a member-wise copy would
    * share the list with the original and the list would be deleted twice */
  UnicharPattern(const UnicharPattern&);

  /** assignment operator - declared, but not implemented (see the copy
    * constructor) */
  UnicharPattern& operator=(const UnicharPattern&);

public:

  /** constructor - reads the case mappings from a text file and stores them
    * in the linked list */
  UnicharPattern();

  /** destructor - deletes the list */
  ~UnicharPattern() { clean_up(); }

  /** converts the specified character to lowercase (if it's an uppercase letter),
    * and returns the result */
  Quad convert_quad_to_lowercase(const Quad& q) const;

  /** goes through the null-terminated regex string parameter and changes each
    * uppercase letter to its lowercase equivalent
    * @param str a universal charstring in regex format (meaning that every universal
    * character is coded as 8 characters from 'A' to 'P', each representing a
    * hexadecimal digit in the universal character's code) */
  void convert_regex_str_to_lowercase(char* str) const;
};
/** The one instance of the universal charstring pattern helper class. */
extern UnicharPattern unichar_pattern;
#endif /* UNICHARPATTERN_HH */
......@@ -36,7 +36,7 @@
* to true, so no errors are reported for the extended ASCII characters. */
extern char* TTCN_pattern_to_regexp(const char* p_pattern, bool utf8 = false);
extern char* TTCN_pattern_to_regexp_uni(const char* p_pattern,
extern char* TTCN_pattern_to_regexp_uni(const char* p_pattern, bool p_nocase,
int** groups = 0);
/* defined elsewhere (can be different in compiler/runtime) */
......
......@@ -45,6 +45,7 @@
#include "pattern.hh"
#include "Quadruple.hh"
#include "UnicharPattern.hh"
union YYSTYPE;
/* defined in lexer c-file: */
......@@ -66,6 +67,8 @@
static int user_groups;
static bool nocase;
#define YYERROR_VERBOSE
static void yyprint(FILE *file, int type, const YYSTYPE& value);
......@@ -381,12 +384,13 @@ RE_OneCharPos:
{
unsigned char c = $1;
if ($1 <= 0) TTCN_pattern_error("Character with code %u "
"(0x%02x) cannot be used in a pattern for type charstring.", $1, $1);
$$ = Quad::get_hexrepr(c);
"(0x%02x) cannot be used in a pattern for type universal charstring.", $1, $1);
$$ = Quad::get_hexrepr(nocase ? tolower(c) : c);
}
| RE_Quadruple
{
$$ = Quad::get_hexrepr($1.value);
$$ = Quad::get_hexrepr(nocase ?
unichar_pattern.convert_quad_to_lowercase($1.value).get_value() : $1.value);
}
| RE_Set
{
......@@ -513,10 +517,11 @@ RE_Set_Range_Char:
| TOK_Char
{
if ($1 <= 0) TTCN_pattern_error("Character with code %u "
"(0x%02x) cannot be used in a pattern for type charstring.", $1, $1);
$$.value = $1;
"(0x%02x) cannot be used in a pattern for type universal charstring.", $1, $1);
$$.value = nocase ? tolower($1) : $1;
}
| RE_Quadruple { $$.value = $1.value; }
| RE_Quadruple { $$.value = nocase ?
unichar_pattern.convert_quad_to_lowercase($1.value).get_value() : $1.value; }
;
RE_Set_NoRange_Char:
......@@ -582,13 +587,14 @@ RE_Quadruple:
* Interface
*********************************************************************/
char* TTCN_pattern_to_regexp_uni(const char* p_pattern, int** groups)
char* TTCN_pattern_to_regexp_uni(const char* p_pattern, bool p_nocase, int** groups)
{
/* if you want to debug */
//pattern_unidebug=1;
ret_val=NULL;
user_groups = 0;
nocase = p_nocase;
yy_buffer_state *flex_buffer = pattern_yy_scan_string(p_pattern);
if(flex_buffer == NULL) {
......
......@@ -107,7 +107,7 @@ TCOV2LCOV_OBJECTS := $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TCOV2LCOV_SOURCES)
OBJECTS := $(COMPILER_OBJECTS) $(MFGEN_OBJECTS) $(TCOV2LCOV_OBJECTS)
# Used by both the compiler and makefilegen
COMMON_OBJECTS := $(addprefix ../common/, memory.o path.o userinfo.o JSON_Tokenizer.o)
COMMON_OBJECTS := $(addprefix ../common/, memory.o path.o userinfo.o)
TCOV2LCOV_COMMON_OBJECTS := ../common/memory.o
......@@ -121,7 +121,8 @@ ifdef REGEX_DIR
endif
COMPILER_COMMON_OBJECTS := $(COMMON_OBJECTS) \
$(addprefix ../common/, new.o pattern_la.o pattern_p.o pattern_uni.o Quadruple.o ModuleVersion.o)
$(addprefix ../common/, new.o pattern_la.o pattern_p.o pattern_uni.o Quadruple.o \
ModuleVersion.o JSON_Tokenizer.o UnicharPattern.o)
ifeq ($(USAGE_STATS), yes)
COMPILER_COMMON_OBJECTS += ../common/usage_stats.o
......
......@@ -28,6 +28,7 @@
#include <stdint.h>
#include "../common/memory.h"
#include "../common/pattern.hh"
#include "../common/UnicharPattern.hh"
#include <iostream>
// used by regex
......@@ -606,7 +607,7 @@ namespace Common {
}
string* regexp(const string& instr, const string& expression,
const Int& groupno)
const Int& groupno, bool nocase)
{
string *retval=0;
......@@ -627,7 +628,8 @@ namespace Common {
}
regex_t posix_regexp;
int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED |
(nocase ? REG_ICASE : 0));
Free(posix_str);
if(ret_val!=0) {
/* regexp error */
......@@ -671,7 +673,7 @@ namespace Common {
}
ustring* regexp(const ustring& instr, const ustring& expression,
const Int& groupno)
const Int& groupno, bool nocase)
{
ustring *retval=0;
......@@ -685,7 +687,7 @@ namespace Common {
verb_level &= ~(1|2);
int* user_groups;
char *posix_str = TTCN_pattern_to_regexp_uni(
expression.get_stringRepr_for_pattern().c_str(), &user_groups);
expression.get_stringRepr_for_pattern().c_str(), nocase, &user_groups);
if (user_groups == 0)
FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
verb_level = orig_verb_level;
......@@ -718,15 +720,18 @@ namespace Common {
regmatch_t* pmatch = (regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
char* tmp = instr.convert_to_regexp_form();
if (nocase) {
unichar_pattern.convert_regex_str_to_lowercase(tmp);
}
string instr_conv(tmp);
Free(tmp);
ret_val = regexec(&posix_regexp, instr_conv.c_str(), nmatch+1, pmatch, 0);
if(ret_val == 0) {
if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1) {
retval = new ustring(
instr_conv.substr(pmatch[nmatch].rm_so,
pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so)
.convert_stringRepr_for_pattern());
retval = new ustring(instr.extract_matched_section(pmatch[nmatch].rm_so,
pmatch[nmatch].rm_eo));
} else { retval = new ustring(); }
}
Free(pmatch);
......
......@@ -62,9 +62,9 @@ namespace Common {
extern int_val_t* float2int(const Real& value, const Location& loc);
extern string* float2str(const Real& value);
extern string* regexp(const string& instr, const string& expression,
const Int& groupno);
const Int& groupno, bool nocase);
extern ustring* regexp(const ustring& instr, const ustring& expression,
const Int& groupno);
const Int& groupno, bool nocase);
extern string* remove_bom(const string& encoded_value);
extern string* get_stringencoding(const string& encoded_value);
extern ustring decode_utf8(const string & ostr, CharCoding::CharCodingType expected_coding);
......
......@@ -262,6 +262,7 @@ namespace Common {
u.expr.ti1=p.u.expr.ti1->clone();
u.expr.t2=p.u.expr.t2->clone();
u.expr.v3=p.u.expr.v3->clone();
u.expr.b4=p.u.expr.b4;
break;
case OPTYPE_DECOMP: // v1 v2 v3
u.expr.v1=p.u.expr.v1->clone();
......@@ -1184,8 +1185,9 @@ namespace Common {
} // switch
}
// ti1 t2 v3
Value::Value(operationtype_t p_optype, TemplateInstance *p_ti1, TemplateInstance *p_t2, Value *p_v3)
// ti1 t2 v3 b4
Value::Value(operationtype_t p_optype, TemplateInstance *p_ti1,
TemplateInstance *p_t2, Value *p_v3, bool p_b4)
: GovernedSimple(S_V), valuetype(V_EXPR), my_governor(0)
{
u.expr.v_optype=p_optype;
......@@ -1195,7 +1197,8 @@ namespace Common {
if(!p_ti1 || !p_t2 || !p_v3) FATAL_ERROR("Value::Value()");
u.expr.ti1 = p_ti1;
u.expr.t2 = p_t2;
u.expr.v3=p_v3;
u.expr.v3 = p_v3;
u.expr.b4 = p_b4;
break;
default:
FATAL_ERROR("Value::Value()");
......@@ -8128,14 +8131,14 @@ error:
if (v1->valuetype == V_CSTR) {
const string& s1 = v1->get_val_str();
const string& s2 = v2->get_val_str();
string *result = regexp(s1, s2, i3);
string *result = regexp(s1, s2, i3, u.expr.b4);
clean_up();
valuetype = V_CSTR;
set_val_str(result);
} if (v1->valuetype == V_USTR) {
const ustring& s1 = v1->get_val_ustr();
const ustring& s2 = v2->get_val_ustr();
ustring *result = regexp(s1, s2, i3);
ustring *result = regexp(s1, s2, i3, u.expr.b4);
clean_up();
valuetype = V_USTR;
set_val_ustr(result);
......@@ -10783,7 +10786,11 @@ error:
return ret_val;
}
case OPTYPE_REGEXP: {
string ret_val("regexp(");
string ret_val("regexp");
if (u.expr.b4) {
ret_val += " @nocase ";
}
ret_val += "(";
u.expr.ti1->append_stringRepr(ret_val);
ret_val += ", ";
u.expr.t2->append_stringRepr(ret_val);
......@@ -12545,7 +12552,7 @@ error:
else u.expr.t2->generate_code(expr);
expr->expr = mputstr(expr->expr, ", ");
u.expr.v3->generate_code_expr_mandatory(expr);
expr->expr = mputc(expr->expr, ')');
expr->expr = mputprintf(expr->expr, ", %s)", u.expr.b4 ? "TRUE" : "FALSE");
}
void Value::generate_code_expr_replace(expression_struct *expr)
......
......@@ -213,7 +213,7 @@ namespace Common {
OPTYPE_DECODE, // r1 r2
OPTYPE_SUBSTR, // ti1 v2 v3
OPTYPE_REGEXP, // ti1 t2 v3
OPTYPE_REGEXP, // ti1 t2 v3 b4
OPTYPE_DECOMP, // v1 v2 v3 66
OPTYPE_REPLACE, // ti1 v2 v3 ti4
......@@ -428,8 +428,9 @@ namespace Common {
Value(operationtype_t p_optype, TemplateInstance *p_ti1, Value *p_v2);
/** Constructor used by V_EXPR "ti1 v2 v3" */
Value(operationtype_t p_optype, TemplateInstance *p_ti1, Value *p_v2, Value *p_v3);
/** Constructor used by V_EXPR "ti1 t2 v3" */
Value(operationtype_t p_optype, TemplateInstance *p_ti1, TemplateInstance *p_t2, Value *p_v3);
/** Constructor used by V_EXPR "ti1 t2 v3 b4" */
Value(operationtype_t p_optype, TemplateInstance *p_ti1, TemplateInstance *p_t2,
Value *p_v3, bool p_b4);
/** Constructor used by V_EXPR "ti1 v2 v3 ti4" */
Value(operationtype_t p_optype, TemplateInstance *p_ti1, Value *p_v2,
Value *p_v3, TemplateInstance *p_ti4);
......
......@@ -405,31 +405,6 @@ string string::get_stringRepr() const
return ret_val;