Diff of /mcutfsig.c [000000] .. [b7a978]  Maximize  Restore

Switch to unified view

a b/mcutfsig.c
1
/*
2
 * mcutfsig.c
3
 *
4
 * $Id$
5
 *
6
 * Copyright (C) 2007, Keith Marshall
7
 *
8
 * This file implements the `mc_utf_signature' function, which is used
9
 * by `gencat', to identify message definition source files which appear
10
 * to exhibit any recognisable standard of Unicode encoding.
11
 *
12
 * Written by Keith Marshall  <keithmarshall@users.sourceforge.net>
13
 * Last Revision: 22-May-2007
14
 *
15
 *
16
 * This is free software.  It is provided AS IS, in the hope that it may
17
 * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
18
 * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
19
 *
20
 * Permission is granted to redistribute this software, either "as is" or
21
 * in modified form, under the terms of the GNU General Public License, as
22
 * published by the Free Software Foundation; either version 2, or (at your
23
 * option) any later version.
24
 *
25
 * You should have received a copy of the GNU General Public License
26
 * along with this software; see the file COPYING.  If not, write to the
27
 * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
28
 * MA 02110-1301, USA.
29
 *
30
 */
31
#include <mcutfsig.h>
32
33
unsigned short mc_utf_signature( unsigned char *stream )
34
{
35
  /* Inspect the first few bytes of the specified data stream;
36
   * attempt to identify a potential Unicode encoding signature,
37
   * defaulting to non-specific single byte encoding units.
38
   */
39
  unsigned short signature = 1;
40
  /*
41
   * The first character in the input stream must not be NUL,
42
   * and must be a member of the POSIX Portable Character Set;
43
   * if it isn't, then it may indicate a Unicode stream.
44
   */
45
  if( *stream == 0 )
46
  {
47
    /* An initial NUL byte anticipates a big-endian Unicode stream;
48
     * one such byte implies UTF-16, without a Byte Order Mark, while
49
     * two such followed by the big-endian form of the BOM, or three
50
     * without a BOM, indicates UTF-32.
51
     */
52
    int count = 4;
53
    while( count-- && (*stream++ == '\0') )
54
      ++signature;
55
    signature += UTF_BIG_ENDIAN;
56
  }
57
  if( (*stream & 0xfe) == 0xfe )
58
  {
59
    /* This looks like it might be a Unicode Byte Order Mark;
60
     * identify the UTF encoding standard, if any, which it represents.
61
     */
62
    unsigned bom = *stream++ << 8; bom |= *stream++;
63
    switch( bom )
64
    {
65
      case 0xfffe:
66
  /*
67
   * This is the BOM signature for a little-endian Unicode stream;
68
   * the first byte has already been included in the initial size
69
   * assigned for the encoding unit; adjust this to accommodate the
70
   * second byte, and incorporate the little-endian flag.
71
   */
72
  signature += UTF_WITH_BYTE_ORDER_MARK + UTF_LITTLE_ENDIAN + 1;
73
  if( *stream == '\0' )
74
  {
75
    int count = 4;
76
    while( count-- && (*stream++ == '\0') )
77
      ++signature;
78
  }
79
  break;
80
81
      case 0xfeff:
82
  /*
83
   * This is the BOM signature for a big-endian Unicode stream;
84
   * if preceded by two NULs, (already counted), then it is UTF-32,
85
   * else it is UTF-16.  In either case, adding an additional one
86
   * to the accumulated size of the encoding unit yields the
87
   * desired result, since the first byte of the BOM, and
88
   * any leading NULs, have already been counted.
89
   */
90
  signature += UTF_WITH_BYTE_ORDER_MARK + UTF_BIG_ENDIAN + 1;
91
  break;
92
93
      case 0xffbb:
94
  /*
95
   * Provided it's followed by one further `0xbf' byte, this is the
96
   * BOM used as a signature for a UTF-8 encoded stream; it becomes
97
   * invalid, if there were any leading NUL bytes.
98
   */
99
  if( (signature == 1) && (*stream++ == (unsigned char)('\xbf')) )
100
    signature |= UTF_WITH_BYTE_ORDER_MARK;
101
    }
102
  }
103
  else if( (signature == 1) && (*++stream == 0) )
104
  {
105
    /* NUL as the second byte in the input stream indicates a probable
106
     * little-endian Unicode input stream, although this is not indicated
107
     * by a Byte Order Mark; count the trailing NULs, to determine if we
108
     * should interpret it as UTF-16LE, or as UTF-32LE.
109
     */
110
    int count = 4;
111
    while( count-- && (*stream++ == '\0') )
112
      ++signature;
113
    signature += UTF_LITTLE_ENDIAN;
114
  }
115
  return signature;
116
}
117
118
/* $RCSfile$Revision$: end of file */

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks