Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
PredefFunc.cc 35.56 KiB
/******************************************************************************
 * Copyright (c) 2000-2021 Ericsson Telecom AB
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.html
 *
 * Contributors:
 *   Baji, Laszlo
 *   Balasko, Jeno
 *   Baranyi, Botond
 *   Kovacs, Ferenc
 *   Raduly, Csaba
 *   Zalanyi, Balazs Andor
 *
 ******************************************************************************/
#include "PredefFunc.hh"
#include "error.h"
#include "Int.hh"
#include "Real.hh"
#include "Setting.hh"
#include "string.hh"
#include "ustring.hh"
#include "CompilerError.hh"
#include <stdio.h>
#include <sys/types.h>
#include <regex.h>
#include <stdint.h>
#include "../common/memory.h"
#include "../common/pattern.hh"
#include "../common/UnicharPattern.hh"
#include <iostream>
#include <locale.h>

// used by regex
#define ERRMSG_BUFSIZE 512

namespace Common {

  static const char utf32be[] = {'0','0','0','0','F','E','F','F',0};
  static const char utf32le[] = {'F','F','F','E','0','0','0','0',0};
  static const char utf16be[] = {'F','E','F','F',0};
  static const char utf16le[] = {'F','F','F','E',0};
  static const char utf8[]    = {'E','F','B','B','B','F',0};

  static inline unsigned char get_bit_value(char c, unsigned char bit_value)
  {
    switch (c) {
    case '0':
      return 0;
    case '1':
      return bit_value;
    default:
      FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c);
      return 0;
    }
  }

  char toupper (const char c)
  {
    if (('A' <= c && 'F' >= c) ||
        ('0' <= c && '9' >= c)) return c;
    switch (c)
    {
      case 'a' : return 'A';
      case 'b' : return 'B';
      case 'c' : return 'C';
      case 'd' : return 'D';
      case 'e' : return 'E';
      case 'f' : return 'F';
      default:
        FATAL_ERROR("%c cannot be converted to hex character", c);
        break;
    }
  }

  char hexdigit_to_char(unsigned char hexdigit)
  {
    if (hexdigit < 10) return '0' + hexdigit;
    else if (hexdigit < 16) return 'A' + hexdigit - 10;
    else {
      FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit);
      return '\0'; // to avoid warning
    }
  }

  unsigned char char_to_hexdigit(char c)
  {
    if (c >= '0' && c <= '9') return c - '0';
    else if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    else if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    else {
      FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c);
      return 0; // to avoid warning
    }
  }

  string uchar2str(unsigned char uchar)
  {
    char str[2];
    str[0] = hexdigit_to_char(uchar / 16);
    str[1] = hexdigit_to_char(uchar % 16);
    return string(2, str);
  }

  unsigned char str2uchar(const char& c1, const char& c2)
  {
    unsigned char uc = 0;
    uc = char_to_hexdigit(c1);
    uc <<= 4;
    uc += char_to_hexdigit(c2);
    return uc;
  }

  int_val_t rem(const int_val_t& left, const int_val_t& right)
  {
    return (left - right * (left / right));
  }

  int_val_t mod(const int_val_t& left, const int_val_t& right)
  {
    int_val_t r = right < 0 ? -right : right;
    if (left > 0) {
      return rem(left, r);
    } else {
      int_val_t result = rem(left, r);
      return result == 0 ? result : result + r;
    }
  }

  string* to_uppercase(const string& value)
  {
    string *s = new string(value);
    for (size_t i = 0; i < s->size(); i++) {
      char& c=(*s)[i];
      if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
    }
    return s;
  }
  string* not4b_bit(const string& bstr)
  {
    string *s=new string(bstr);
    for(size_t i=0; i<s->size(); i++) {
      char& c=(*s)[i];
      switch(c) {
      case '0': c='1'; break;
      case '1': c='0'; break;
      default:
        FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
      } // switch c
    } // for i
    return s;
  }

  string* not4b_hex(const string& hstr)
  {
    string *s=new string(hstr);
    for(size_t i=0; i<s->size(); i++) {
      char& c=(*s)[i];
      switch(c) {
      case '0': c='F'; break;
      case '1': c='E'; break;
      case '2': c='D'; break;
      case '3': c='C'; break;
      case '4': c='B'; break;
      case '5': c='A'; break;
      case '6': c='9'; break;
      case '7': c='8'; break;
      case '8': c='7'; break;
      case '9': c='6'; break;
      case 'A': c='5'; break;
      case 'B': c='4'; break;
      case 'C': c='3'; break;
      case 'D': c='2'; break;
      case 'E': c='1'; break;
      case 'F': c='0'; break;
      case 'a': c='5'; break;
      case 'b': c='4'; break;
      case 'c': c='3'; break;
      case 'd': c='2'; break;
      case 'e': c='1'; break;
      case 'f': c='0'; break;
      default:
        FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
      } // switch c
    } // for i
    return s;
  }

  string* and4b(const string& left, const string& right)
  {
    string *s=new string(left);
    for(size_t i=0; i<s->size(); i++) {
      char& c=(*s)[i];
      c=hexdigit_to_char(char_to_hexdigit(c) & char_to_hexdigit(right[i]));
    } // for i
    return s;
  }

  string* or4b(const string& left, const string& right)
  {
    string *s=new string(left);
    for(size_t i=0; i<s->size(); i++) {
      char& c=(*s)[i];
      c=hexdigit_to_char(char_to_hexdigit(c) | char_to_hexdigit(right[i]));
    } // for i
    return s;
  }
  string* xor4b(const string& left, const string& right)
  {
    string *s=new string(left);
    for(size_t i=0; i<s->size(); i++) {
      char& c=(*s)[i];
      c=hexdigit_to_char(char_to_hexdigit(c) ^ char_to_hexdigit(right[i]));
    } // for i
    return s;
  }

  string* shift_left(const string& value, const Int& count)
  {
    if (count > 0) {
      string *s = new string;
      if (count < static_cast<Int>(value.size())) *s = value.substr(count);
      s->resize(value.size(), '0');
      return s;
    } else if (count < 0) return shift_right(value, -count);
    else return new string(value);
  }

  string* shift_right(const string& value, const Int& count)
  {
    if (count > 0) {
      string *s = new string;
      if (count < static_cast<Int>(value.size())) {
	s->resize(count, '0');
	*s += value.substr(0, value.size()-count);
      } else s->resize(value.size(), '0');
      return s;
    } else if (count < 0) return shift_left(value, -count);
    else return new string(value);
  }

  string* rotate_left(const string& value, const Int& p_count)
  {
    size_t size = value.size();
    if (size == 0) return new string(value);
    else if (p_count < 0) return rotate_right(value, -p_count);
    size_t count = p_count % size;
    if (count == 0) return new string(value);
    else return new string(value.substr(count) + value.substr(0, count));
  }

  string* rotate_right(const string& value, const Int& p_count)
  {
    size_t size = value.size();
    if (size == 0) return new string(value);
    else if (p_count < 0) return rotate_left(value, -p_count);
    size_t count = p_count % size;
    if (count == 0) return new string(value);
    else return new string(value.substr(size - count) +
      value.substr(0, size - count));
  }


  ustring* rotate_left(const ustring& value, const Int& p_count)
  {
    size_t size = value.size();
    if (size == 0) return new ustring(value);
    else if (p_count < 0) return rotate_right(value, -p_count);
    size_t count = p_count % size;
    if (count == 0) return new ustring(value);
    else return new ustring(value.substr(count) + value.substr(0, count));
  }

  ustring* rotate_right(const ustring& value, const Int& p_count)
  {
    size_t size = value.size();
    if (size == 0) return new ustring(value);
    else if (p_count < 0) return rotate_left(value, -p_count);
    size_t count = p_count % size;
    if (count == 0) return new ustring(value);
    else return new ustring(value.substr(size - count) +
      value.substr(0, size - count));
  }

  int_val_t* bit2int(const string& bstr)
  {
    size_t nof_bits = bstr.size();
    // skip the leading zeros
    size_t start_index = 0;
    while (start_index < nof_bits && bstr[start_index] == '0') start_index++;
    int_val_t *ret_val = new int_val_t((Int)0);
    for (size_t i = start_index; i < nof_bits; i++) {
      *ret_val <<= 1;
      if (bstr[i] == '1') *ret_val += 1;
    }
    return ret_val;
  }

  int_val_t* hex2int(const string& hstr)
  {
    size_t nof_digits = hstr.size();
    size_t start_index = 0;
    // Skip the leading zeros.
    while (start_index < nof_digits && hstr[start_index] == '0')
      start_index++;
    int_val_t *ret_val = new int_val_t((Int)0);
    for (size_t i = start_index; i < nof_digits; i++) {
      *ret_val <<= 4;
      *ret_val += char_to_hexdigit(hstr[i]);
    }
    return ret_val;
  }

  Int unichar2int(const ustring& ustr)
  {
    if (ustr.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
    const ustring::universal_char& uchar = ustr.u_str()[0];
    Int ret_val = (uchar.group << 24) | (uchar.plane << 16) | (uchar.row << 8) |
      uchar.cell;
    return ret_val;
  }

  string *int2bit(const int_val_t& value, const Int& length)
  {
    if (length < 0) FATAL_ERROR("int2bit(): negative length");
    size_t string_length = static_cast<size_t>(length);
    if (static_cast<Int>(string_length) != length ||
	string_length > string::max_string_len)
      FATAL_ERROR("int2bit(): length is too large");
    if (value < 0) FATAL_ERROR("int2bit(): negative value");
    string *bstr = new string;
    bstr->resize(string_length);
    int_val_t tmp_value = value;
    for (size_t i = 1; i <= string_length; i++) {
      (*bstr)[string_length - i] = (tmp_value & 1).get_val() ? '1' : '0';
      tmp_value >>= 1;
    }
    if (tmp_value != 0)
      FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
	value.t_str().c_str(), (unsigned long)string_length);
    return bstr;
  }

  static const char hdigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

  string *int2hex(const int_val_t& value, const Int& length)
  {
    if (length < 0)
      FATAL_ERROR("int2hex(): negative length");
    size_t string_length = static_cast<size_t>(length);
    if (static_cast<Int>(string_length) != length ||
	string_length > string::max_string_len)
      FATAL_ERROR("int2hex(): length is too large");
    if (value < 0) FATAL_ERROR("int2hex(): negative value");
    string *hstr = new string;
    hstr->resize(string_length);
    int_val_t tmp_value = value;
    for (size_t i = 1; i <= string_length; i++) {
      (*hstr)[string_length - i] = hdigits[(tmp_value & 0x0f).get_val()];
      tmp_value >>= 4;
    }
    if (tmp_value != 0) {
      FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
        value.t_str().c_str(), (unsigned long)string_length);
    }
    return hstr;
  }

  ustring *int2unichar(const Int& value)
  {
    if (value < 0 || value > 2147483647)
      FATAL_ERROR("int2unichar(): invalid argument");
    unsigned char group = (value >> 24) & 0xFF,
      plane = (value >> 16) & 0xFF,
      row = (value >> 8) & 0xFF,
      cell = value & 0xFF;
    return new ustring(group, plane, row, cell);
  }

  string *oct2char(const string& ostr)
  {
    string *cstr = new string;
    size_t ostr_size = ostr.size();
    if (ostr_size % 2)
      FATAL_ERROR("oct2char(): argument has odd length: %lu",
        (unsigned long) ostr_size);
    size_t cstr_size = ostr_size / 2;
    cstr->resize(cstr_size);
    const char *ostr_ptr = ostr.c_str();
    for (size_t i = 0; i < cstr_size; i++) {
      unsigned char c = 16 * char_to_hexdigit(ostr_ptr[2 * i]) +
      char_to_hexdigit(ostr_ptr[2 * i + 1]);
      if (c > 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
                               "non-ascii character: %d", c);
      (*cstr)[i] = c;
    }
    return cstr;
  }

  string *char2oct(const string& cstr)
  {
    string *ostr = new string;
    size_t cstr_size = cstr.size();
    ostr->resize(cstr_size * 2, '0');
    const char *cstr_ptr = cstr.c_str();
    for (size_t i = 0; i < cstr_size; i++) {
      unsigned char c = cstr_ptr[i];
      (*ostr)[2 * i] = hexdigit_to_char(c / 16);
      (*ostr)[2 * i + 1] = hexdigit_to_char(c % 16);
    }
    return ostr;
  }

  string *bit2hex(const string& bstr)
  {
    size_t size=bstr.size();
    size_t hsize=(size+3)/4;
    string *hstr = new string;
    string *bstr4=NULL;
    if(size%4) {
      bstr4=new string;
      bstr4->resize(hsize*4,'0');
      bstr4->replace(4-(size%4),size,bstr);
    }
    hstr->resize(hsize,'0');
    string b4(4,"0000");
    for(size_t i=0;i<hsize;i++) {
      unsigned int u;
      if(size%4)b4=bstr4->substr(i*4,4);
      else b4=bstr.substr(i*4,4);
      if(b4[0]=='1')u=8;else u=0;
      if(b4[1]=='1')u+=4;
      if(b4[2]=='1')u+=2;
      if(b4[3]=='1')u++;
      (*hstr)[i]=hdigits[u];
    }
    if(bstr4!=NULL)delete bstr4;
    return hstr;
  }

  string *hex2oct(const string& hstr)
  {
    if(hstr.size()%2==0)return new string(hstr);
    else {
      string *ostr=new string("0");
      (*ostr)+=hstr;
      return ostr;
    }
  }

  string *asn_hex2oct(const string& hstr)
  {
    string *ostr = new string(hstr);
    size_t size = ostr->size();
    if (size % 2) ostr->resize(size + 1, '0');
    return ostr;
  }

  string *bit2oct(const string& bstr)
  {
    string *s1,*s2;
    s1=bit2hex(bstr);
    s2=hex2oct(*s1);
    delete s1;
    return s2;
  }

  string *asn_bit2oct(const string& bstr)
  {
    size_t size = bstr.size();
    string *ostr = new string;
    ostr->resize(((size+7)/8)*2);
    for(size_t i=0, j=0; i<size; ) {
      unsigned char digit1=0, digit2=0;
      digit1 += get_bit_value(bstr[i++], 8);
      if (i < size) {
        digit1 += get_bit_value(bstr[i++], 4);
        if (i < size) {
          digit1 += get_bit_value(bstr[i++], 2);
          if (i < size) {
            digit1 += get_bit_value(bstr[i++], 1);
            if (i < size) {
              digit2 += get_bit_value(bstr[i++], 8);
              if (i < size) {
                digit2 += get_bit_value(bstr[i++], 4);
                if (i < size) {
                  digit2 += get_bit_value(bstr[i++], 2);
                  if (i < size) digit2 += get_bit_value(bstr[i++], 1);
                }
              }
            }
          }
        }
      }
      (*ostr)[j++] = hexdigit_to_char(digit1);
      (*ostr)[j++] = hexdigit_to_char(digit2);
    }
    return ostr;
  }

  string *hex2bit(const string& hstr)
  {
    size_t size=hstr.size();
    string *bstr = new string;
    bstr->resize(4*size);
    for(size_t i=0; i<size; i++) {
      switch(hstr[i]) {
      case '0':
        bstr->replace(4*i, 4, "0000");
        break;
      case '1':
        bstr->replace(4*i, 4, "0001");
        break;
      case '2':
        bstr->replace(4*i, 4, "0010");
        break;
      case '3':
        bstr->replace(4*i, 4, "0011");
        break;
      case '4':
        bstr->replace(4*i, 4, "0100");
        break;
      case '5':
        bstr->replace(4*i, 4, "0101");
        break;
      case '6':
        bstr->replace(4*i, 4, "0110");
        break;
      case '7':
        bstr->replace(4*i, 4, "0111");
        break;
      case '8':
        bstr->replace(4*i, 4, "1000");
        break;
      case '9':
        bstr->replace(4*i, 4, "1001");
        break;
      case 'A':
      case 'a':
        bstr->replace(4*i, 4, "1010");
        break;
      case 'B':
      case 'b':
        bstr->replace(4*i, 4, "1011");
        break;
      case 'C':
      case 'c':
        bstr->replace(4*i, 4, "1100");
        break;
      case 'D':
      case 'd':
        bstr->replace(4*i, 4, "1101");
        break;
      case 'E':
      case 'e':
        bstr->replace(4*i, 4, "1110");
        break;
      case 'F':
      case 'f':
        bstr->replace(4*i, 4, "1111");
        break;
      default:
        FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
                    "digit in hexstring value");
      }
    }
    return bstr;
  }

  int_val_t* float2int(const Real& value, const Location& loc)
  {
    // We shouldn't mimic generality with `Int'.
    if (value >= (Real)LLONG_MIN && value <= (Real)LLONG_MAX)
      return new int_val_t((Int)value);
    char buf[512] = "";
    snprintf(buf, 511, "%f", value);
    char *dot = strchr(buf, '.');
    if (!dot) FATAL_ERROR("Conversion of float value `%f' to integer failed", value);
    else memset(dot, 0, sizeof(buf) - (dot - buf));
    return new int_val_t(buf, loc);
  }

/* TTCN-3 float values that have absolute value smaller than this are
   displayed in exponential notation. Same as in core/Float.hh */
#ifndef MIN_DECIMAL_FLOAT
#define MIN_DECIMAL_FLOAT		1.0E-4
#endif
/* TTCN-3 float values that have absolute value larger or equal than
   this are displayed in exponential notation. Same as in
   core/Float.hh */
#ifndef MAX_DECIMAL_FLOAT
#define MAX_DECIMAL_FLOAT		1.0E+10
#endif

  string *float2str(const Real& value)
  {
    if (value == REAL_INFINITY) {
      return new string("infinity");
    }
    if (value == -REAL_INFINITY) {
      return new string("-infinity");
    }
    if (value != value) {
      return new string("not_a_number");
    }
    char str_buf[64];
    bool f = (value > -MAX_DECIMAL_FLOAT && value <= -MIN_DECIMAL_FLOAT)
      || (value >= MIN_DECIMAL_FLOAT && value <   MAX_DECIMAL_FLOAT)
      || (value == 0.0);
    const char* loc = setlocale(LC_ALL, NULL);
    setlocale(LC_NUMERIC, "C"); // use default locale for displaying numbers
    snprintf(str_buf, 64, f ? "%f" : "%e", value);
    setlocale(LC_NUMERIC, loc);
    return new string(str_buf);
  }

  string* regexp(const string& instr, const string& expression,
                 const Int& groupno, bool nocase)
  {
    string* retval = regexp_internal(instr, expression, groupno, nocase);
    if (retval != NULL) {
      return retval;
    }
    return new string();
  }
  string* regexp_internal(const string& instr, const string& expression,
                          const Int& groupno, bool nocase)
  {
    string *retval=0;

    if(groupno<0) {
      FATAL_ERROR("regexp(): groupno must be a non-negative integer");
      return retval;
    }
    // do not report the warnings again
    // they were already reported while checking the operands
    unsigned orig_verb_level = verb_level;
    verb_level &= ~(1|2);
    char *posix_str=TTCN_pattern_to_regexp(expression.c_str());
    verb_level = orig_verb_level;
    if(posix_str==NULL) {
      FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
                  expression.c_str());
      return retval;
    }

    regex_t posix_regexp;
    int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED |
      (nocase ? REG_ICASE : 0));
    Free(posix_str);
    if(ret_val!=0) {
      /* regexp error */
      char msg[ERRMSG_BUFSIZE];
      regerror(ret_val, &posix_regexp, msg, sizeof(msg));
      FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
      return retval;
    }

    size_t nmatch=groupno+1;
    if(nmatch>posix_regexp.re_nsub) {
      FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
                  "contains only %lu group(s).", (unsigned long) (nmatch - 1),
		  (unsigned long) posix_regexp.re_nsub);
      return retval;
    }
    regmatch_t* pmatch=(regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
    ret_val=regexec(&posix_regexp, instr.c_str(), nmatch+1, pmatch, 0);
    if(ret_val==0) {
      if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1)
        retval = new string(instr.substr(pmatch[nmatch].rm_so,
          pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so));
    }
    Free(pmatch);
    if(ret_val!=0) {
      if(ret_val==REG_NOMATCH) {
        regfree(&posix_regexp);
      }
      else {
        /* regexp error */
        char msg[ERRMSG_BUFSIZE];
        regerror(ret_val, &posix_regexp, msg, sizeof(msg));
        FATAL_ERROR("regexp(): regexec() failed: %s", msg);
      }
    }
    else regfree(&posix_regexp);

    return retval;
  }

  ustring* regexp(const ustring& instr, const ustring& expression,
    const Int& groupno, bool nocase)
  {
    ustring *retval=0;

    if(groupno<0) {
      FATAL_ERROR("regexp(): groupno must be a non-negative integer");
      return retval;
    }
    // do not report the warnings again
    // they were already reported while checking the operands
    unsigned orig_verb_level = verb_level;
    verb_level &= ~(1|2);
    int* user_groups;
    char *posix_str = TTCN_pattern_to_regexp_uni(
      expression.get_stringRepr_for_pattern().c_str(), nocase, &user_groups);
    if (user_groups == 0)
      FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
    verb_level = orig_verb_level;
    if(posix_str==NULL) {
      FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
        expression.get_stringRepr().c_str());
      return retval;
    }

    regex_t posix_regexp;
    int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
    Free(posix_str);
    if(ret_val!=0) {
      /* regexp error */
      char msg[ERRMSG_BUFSIZE];
      regerror(ret_val, &posix_regexp, msg, sizeof(msg));
      FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
      return retval;
    }

    size_t nmatch=user_groups[groupno+1]+1;
    if(nmatch>posix_regexp.re_nsub) {
      FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
        "contains only %lu group(s).", (unsigned long) (groupno),
        (unsigned long) user_groups[0]);
      return retval;
    }

    Free(user_groups);

    regmatch_t* pmatch = (regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
    char* tmp = instr.convert_to_regexp_form();
    
    if (nocase) {
      unichar_pattern.convert_regex_str_to_lowercase(tmp);
    }
    
    string instr_conv(tmp);
    Free(tmp);
    ret_val = regexec(&posix_regexp, instr_conv.c_str(), nmatch+1, pmatch, 0);
    if(ret_val == 0) {
      if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1) {
        retval = new ustring(instr.extract_matched_section(pmatch[nmatch].rm_so,
          pmatch[nmatch].rm_eo));
      } else { retval = new ustring(); }
    }
    Free(pmatch);
    if(ret_val!=0) {
      if(ret_val==REG_NOMATCH) {
        regfree(&posix_regexp);
        retval=new ustring();
      }
      else {
        /* regexp error */
        char msg[ERRMSG_BUFSIZE];
        regerror(ret_val, &posix_regexp, msg, sizeof(msg));
        FATAL_ERROR("regexp(): regexec() failed: %s", msg);
      }
    }
    else regfree(&posix_regexp);

    return retval;
  }

string* remove_bom(const string& encoded_value)
{
  size_t length = encoded_value.size();
  if (0 == length) return new string();
  if (length % 2) {
    ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
                "shall be divisible by 2", static_cast<int>(length));
    return new string(encoded_value);
  }

  int length_of_BOM = 0;
  string str_uppercase(encoded_value);
  size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
  for (size_t i = 0; i < enough; ++i) {
    str_uppercase[i] = (char)toupper(encoded_value[i]);
  }

  if      (str_uppercase.find(utf32be, 0) < length) length_of_BOM = sizeof(utf32be)-1;
  else if (str_uppercase.find(utf32le, 0) < length) length_of_BOM = sizeof(utf32le)-1;
  else if (str_uppercase.find(utf16be, 0) < length) length_of_BOM = sizeof(utf16be)-1;
  else if (str_uppercase.find(utf16le, 0) < length) length_of_BOM = sizeof(utf16le)-1;
  else if (str_uppercase.find(utf8,    0) < length) length_of_BOM = sizeof(utf8)-1;
  else return new string(encoded_value); // no BOM found

  return new string(encoded_value.substr(length_of_BOM, length));
}

static CharCoding::CharCodingType is_ascii (size_t length, const unsigned char* strptr)
{
  const unsigned char nonASCII = 1 << 7;// MSB is 1 in case of non ASCII character  
  CharCoding::CharCodingType ret = CharCoding::ASCII;
  for (size_t i = 0; i < length; ++i) {
    if ( strptr[i] & nonASCII) {
      ret = CharCoding::UNKNOWN;
      break;
    }
  }
  return ret;
}

static CharCoding::CharCodingType is_utf8(size_t length, const unsigned char* strptr)
{
  const unsigned char MSB = 1 << 7; // MSB is 1 in case of non ASCII character  
  const unsigned char MSBmin1 = 1 << 6; // 0100 0000   
  size_t i = 0;
  while (length > i) {
    if ( strptr[i] & MSB) { // non ASCII char
    unsigned char maskUTF8 = 1 << 6; // 111x xxxx shows how many additional bytes are there
      if (!(strptr[i] & maskUTF8)) return CharCoding::UNKNOWN; // accepted 11xxx xxxx but received 10xx xxxx
      unsigned int noofUTF8 = 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
      while (strptr[i] & maskUTF8) {
        ++noofUTF8;
        maskUTF8 >>= 1; // shift right the mask
      }
      // the second and third (and so on) UTF-8 byte looks like 10xx xxxx      
      while (0 < noofUTF8 ) {
        ++i;
        if (i >= length || !(strptr[i] & MSB) || (strptr[i] & MSBmin1)) { // if not like this: 10xx xxxx
          return CharCoding::UNKNOWN;
        }
        --noofUTF8;
      }
    }
    ++i;
  }
  return CharCoding::UTF_8;
}

string* get_stringencoding(const string& encoded_value)
{
  size_t length = encoded_value.size();
  if (0 == length) return new string("<unknown>");
  if (length % 2) {
    ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
                "shall be divisible by 2", static_cast<int>(length));
    return new string("<unknown>");
  }

  string str_uppercase(encoded_value);
  size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
  for (size_t i = 0; i < enough; ++i) {
    str_uppercase[i] = (char)toupper(encoded_value[i]);
  }

  if      (str_uppercase.find(utf32be, 0) < length) return new string("UTF-32BE");
  else if (str_uppercase.find(utf32le, 0) < length) return new string("UTF-32LE");
  else if (str_uppercase.find(utf16be, 0) < length) return new string("UTF-16BE");
  else if (str_uppercase.find(utf16le, 0) < length) return new string("UTF-16LE");
  else if (str_uppercase.find(utf8,    0) < length) return new string("UTF-8");

  unsigned char *uc_str = new unsigned char[length/2];
  string ret;
  for (size_t i = 0; i < length / 2; ++i) {
    uc_str[i] = str2uchar(encoded_value[2 * i], encoded_value[2 * i + 1]);
  }
  if (is_ascii (length / 2, uc_str) == CharCoding::ASCII) ret = "ASCII";
  else if (CharCoding::UTF_8 == is_utf8 (length / 2, uc_str)) ret = "UTF-8";
  else ret = "<unknown>";

  delete [] uc_str;
  return new string(ret);
}

static size_t check_BOM(CharCoding::CharCodingType expected_coding, size_t n_uc, unsigned char* uc_str)
{
  if (0 == n_uc) return 0;

  switch (expected_coding) {
    case CharCoding::UTF32:
    case CharCoding::UTF32BE:
    case CharCoding::UTF32LE:
      if (4 > n_uc) {
        ERROR("decode_utf32(): The string is shorter than the expected BOM");
        return 0;
      }
      break;
    case CharCoding::UTF16:
    case CharCoding::UTF16BE:
    case CharCoding::UTF16LE:
      if (2 > n_uc) {
        ERROR("decode_utf16(): The string is shorter than the expected BOM");
        return 0;
      }
      break;
    default: break;
  }

  //BOM indicates that the byte order is determined by a byte order mark, 
  //if present at the beginning the length of BOM is returned.
  bool badBOM = false;
  string errmsg;
  string caller;
  switch (expected_coding) {
    case CharCoding::UTF32BE:
    case CharCoding::UTF32:
      if (0x00 == uc_str[0] && 0x00 == uc_str[1] && 0xFE == uc_str[2] && 0xFF == uc_str[3]) 
        return 4;
      badBOM = true;
      caller = "decode_utf32()";
      errmsg = "UTF-32BE";
      break;
    case CharCoding::UTF32LE:
      if (0xFF == uc_str[0] && 0xFE == uc_str[1] && 0x00 == uc_str[2] && 0x00 == uc_str[3])
        return 4;
      badBOM = true;
      caller = "decode_utf32()";
      errmsg = "UTF-32LE";
      break;
    case CharCoding::UTF16BE:
    case CharCoding::UTF16:
      if (0xFE == uc_str[0] && 0xFF == uc_str[1])
        return 2;
      badBOM = true;
      caller = "decode_utf16()";
      errmsg = "UTF-16BE";
      break;
    case CharCoding::UTF16LE:
      if (0xFF == uc_str[0] && 0xFE == uc_str[1])
        return 2;
      badBOM = true;
      caller = "decode_utf16()";
      errmsg = "UTF-16LE";
      break;
    case CharCoding::UTF_8:
      if (0xEF == uc_str[0] && 0xBB == uc_str[1] && 0xBF == uc_str[2])
        return 3;
      return 0;
    default:
      if (CharCoding::UTF32 == expected_coding || CharCoding::UTF16 == expected_coding) {
        const char* str = CharCoding::UTF32 == expected_coding ? "UTF-32" : "UTF-16";
        ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
               "expects it to define the endianness", str, str);
      }
      else {
        ERROR("Wrong string. No BOM detected");
      }
    }
  if (badBOM) ERROR("%s: Wrong %s string. The expected coding could not be verified",
                    caller.c_str(), errmsg.c_str());
  return 0;
}

static void fill_continuing_octets(int n_continuing, unsigned char *continuing_ptr,
                            size_t n_uc, const unsigned char* uc_str, int start_pos, 
                            int uchar_pos)
{
  for (int i = 0; i < n_continuing; i++) {
    if (start_pos + i < static_cast<int>(n_uc)) {
      unsigned char octet = uc_str[start_pos + i];
      if ((octet & 0xC0) != 0x80) {
        ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
              "not a valid continuing octet.", uchar_pos, start_pos + i, octet);
        return;
      }
      continuing_ptr[i] = octet & 0x3F;
    } 
    else {
      if (start_pos + i == static_cast<int>(n_uc)) {
        if (i > 0) {
    // only a part of octets is missing
          ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
                "of %d continuing octets %s missing from the end of the stream.",
                uchar_pos, start_pos + i, n_continuing - i, n_continuing,
                n_continuing - i > 1 ? "are" : "is");
          return;
        }
        else {
          // all octets are missing
          ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
                "continuing octet%s missing from the end of the stream.", uchar_pos,
                start_pos, n_continuing, n_continuing > 1 ? "s are" : " is");
          return;
        }
      }
      continuing_ptr[i] = 0;
    }
  }
}

ustring decode_utf8(const string & ostr, CharCoding::CharCodingType /*expected_coding*/)
{
  size_t length = ostr.size();
  if (0 == length) return ustring();
  if (length % 2) {
    ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
          "shall be divisible by 2", static_cast<int>(length));
    return ustring();
  }

  unsigned char *uc_str = new unsigned char[length/2];
  for (size_t i = 0; i < length / 2; ++i) {
    uc_str[i] = str2uchar(ostr[2 * i], ostr[2 * i + 1]);
  }
  ustring ucstr;
  size_t start = check_BOM(CharCoding::UTF_8, length /2, uc_str);

  for (size_t i = start; i < length / 2;) {
    // perform the decoding character by character
    if (uc_str[i] <= 0x7F) {
      // character encoded on a single octet: 0xxxxxxx (7 useful bits)
      unsigned char g = 0;
      unsigned char p = 0;
      unsigned char r = 0;
      unsigned char c = uc_str[i];
      ucstr += ustring(g, p, r, c);
      ++i;
    }
    else if (uc_str[i] <= 0xBF) {
      // continuing octet (10xxxxxx) without leading octet ==> malformed
      ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
             "octet %02X without leading octet.", static_cast<int>(ucstr.size()),
             static_cast<int>(i), uc_str[i]);
      goto dec_error;
    }
    else if (uc_str[i] <= 0xDF) {
      // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
      unsigned char octets[2];
      octets[0] = uc_str[i] & 0x1F;
      fill_continuing_octets(1, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
      unsigned char g = 0;
      unsigned char p = 0;
      unsigned char r = octets[0] >> 2;
      unsigned char c = octets[0] << 6 | octets[1];
      if (r == 0x00 && c < 0x80) {
        ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
              "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr.size()),
              static_cast<int>(i), c);
        goto dec_error;
      }
      ucstr += ustring(g, p, r, c);
      i += 2;
    } 
    else if (uc_str[i] <= 0xEF) {
      // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
      // (16 useful bits)
      unsigned char octets[3];
      octets[0] = uc_str[i] & 0x0F;
      fill_continuing_octets(2, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
      unsigned char g = 0;
      unsigned char p = 0;
      unsigned char r = octets[0] << 4 | octets[1] >> 2;
      unsigned char c = octets[1] << 6 | octets[2];
      if (r < 0x08) {
        ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
              "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
              static_cast<int>(i), r, c);
        goto dec_error;
      }
      ucstr += ustring(g, p, r, c);
      i += 3;
    } 
    else if (uc_str[i] <= 0xF7) {
      // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      // (21 useful bits)
      unsigned char octets[4];
      octets[0] = uc_str[i] & 0x07;
      fill_continuing_octets(3, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
      unsigned char g = 0;
      unsigned char p = octets[0] << 2 | octets[1] >> 4;
      unsigned char r = octets[1] << 4 | octets[2] >> 2;
      unsigned char c = octets[2] << 6 | octets[3];
      if (p == 0x00) {
        ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
              "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
              static_cast<int>(i), r, c);
        goto dec_error;
      }
      ucstr += ustring(g, p, r, c);
      i += 4;
    }
    else if (uc_str[i] <= 0xFB) {
      // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
      // 10xxxxxx (26 useful bits)
      unsigned char octets[5];
      octets[0] = uc_str[i] & 0x03;
      fill_continuing_octets(4, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
      unsigned char g = octets[0];
      unsigned char p = octets[1] << 2 | octets[2] >> 4;
      unsigned char r = octets[2] << 4 | octets[3] >> 2;
      unsigned char c = octets[3] << 6 | octets[4];
      if (g == 0x00 && p < 0x20) {
        ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
              "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr.size()),
              static_cast<int>(i), p, r, c);
        goto dec_error;
      }
      ucstr += ustring(g, p, r, c);
      i += 5;
    }
    else if (uc_str[i] <= 0xFD) {
      // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
      // 10xxxxxx 10xxxxxx (31 useful bits)
      unsigned char octets[6];
      octets[0] = uc_str[i] & 0x01;
      fill_continuing_octets(5, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
      unsigned char g = octets[0] << 6 | octets[1];
      unsigned char p = octets[2] << 2 | octets[3] >> 4;
      unsigned char r = octets[3] << 4 | octets[4] >> 2;
      unsigned char c = octets[4] << 6 | octets[5];
      if (g < 0x04) {
        ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
              "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr.size()),
                    static_cast<int>(i), g, p, r, c);
        goto dec_error;
      }
      ucstr += ustring(g, p, r, c);
      i += 6;
    }
    else {
      // not used code points: FE and FF => malformed
      ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
            "unused/reserved octet %02X.", static_cast<int>(ucstr.size()),
            static_cast<int>(i), uc_str[i]);
      goto dec_error;
    }
  }

  dec_error:
  delete[] uc_str;
  return ucstr;
}

}