DAViCal
check_UTF8.php
1 <?php
2 /* ***** BEGIN LICENSE BLOCK *****
3  * Version: NPL 1.1/GPL 2.0/LGPL 2.1
4  *
5  * The contents of this file are subject to the Netscape Public License
6  * Version 1.1 (the "License"); you may not use this file except in
7  * compliance with the License. You may obtain a copy of the License at
8  * http://www.mozilla.org/NPL/
9  *
10  * Software distributed under the License is distributed on an "AS IS" basis,
11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12  * for the specific language governing rights and limitations under the
13  * License.
14  *
15  * The Original Code is Mozilla Communicator client code.
16  *
17  * The Initial Developer of the Original Code is
18  * Netscape Communications Corporation.
19  * Portions created by the Initial Developer are Copyright (C) 1998
20  * the Initial Developer. All Rights Reserved.
21  *
22  * Contributor(s):
23  * Henri Sivonen, hsivonen@iki.fi
24  *
25  *
26  * Alternatively, the contents of this file may be used under the terms of
27  * either the GNU General Public License Version 2 or later (the "GPL"), or
28  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29  * in which case the provisions of the GPL or the LGPL are applicable instead
30  * of those above. If you wish to allow use of your version of this file only
31  * under the terms of either the GPL or the LGPL, and not to allow others to
32  * use your version of this file under the terms of the NPL, indicate your
33  * decision by deleting the provisions above and replace them with the notice
34  * and other provisions required by the GPL or the LGPL. If you do not delete
35  * the provisions above, a recipient may use your version of this file under
36  * the terms of any one of the NPL, the GPL or the LGPL.
37  *
38  * ***** END LICENSE BLOCK ***** */
39 
40 /*
41  * For the original C++ code, see
42  * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
43  * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
44  *
45  * The latest version of this file can be obtained from
46  * http://iki.fi/hsivonen/php-utf8/
47  *
48  * Version 1.0, 2003-05-30
49  */
50 
/**
 * Decodes a UTF-8 byte string into an array of Unicode code points.
 *
 * The input is validated while it is decoded. Decoding fails on:
 *  - an octet that cannot start a sequence (stray continuation, 0xFE, 0xFF),
 *  - a missing or malformed continuation octet,
 *  - a non-shortest-form (overlong) encoding,
 *  - 5 and 6 octet sequences,
 *  - surrogate code points (U+D800..U+DFFF),
 *  - code points above U+10FFFF,
 *  - a string that ends in the middle of a multi-octet sequence.
 *
 * U+FEFF (byte order mark) is accepted but never copied to the output,
 * wherever it occurs in the string.
 *
 * @param string $str UTF-8 encoded bytes. Taken by reference, but only read.
 * @return array|false Integer code points in input order, or false when
 *                     $str is not well-formed UTF-8.
 */
function utf8ToUnicode(&$str)
{
    $mState = 0; // continuation octets still expected for the current sequence
    $mUcs4 = 0;  // code point accumulated so far
    $mBytes = 1; // total number of octets in the current sequence

    $out = array();

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $in = ord($str[$i]);

        if (0 == $mState) {
            // Between sequences: expect US-ASCII or the first octet of a
            // multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;
            } elseif (0xC0 == (0xE0 & $in)) {
                // First octet of a 2 octet sequence.
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 == (0xF0 & $in)) {
                // First octet of a 3 octet sequence.
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 == (0xF8 & $in)) {
                // First octet of a 4 octet sequence.
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 == (0xFC & $in)) {
                // First octet of a 5 octet sequence. Always illegal (either
                // overlong or beyond U+10FFFF), but rather than trying to
                // resynchronize here the sequence is consumed and rejected
                // by the checks that run once it is complete.
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC == (0xFE & $in)) {
                // First octet of a 6 octet sequence; handled like 5 octets.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Neither US-ASCII nor a legal first octet.
                return false;
            }
            continue;
        }

        // Inside a sequence: only a continuation octet (10xxxxxx) is legal.
        if (0x80 != (0xC0 & $in)) {
            return false;
        }

        // Merge the 6 payload bits at the position given by the number of
        // continuation octets that are still to come after this one.
        $mUcs4 |= ($in & 0x3F) << (($mState - 1) * 6);

        if (0 != --$mState) {
            continue;
        }

        // End of the sequence: $mUcs4 holds the decoded code point.
        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
            // Non-shortest forms (above) and 5/6 octet sequences are illegal.
            (4 < $mBytes) ||
            // Surrogate code points are illegal in UTF-8.
            ($mUcs4 >= 0xD800 && $mUcs4 <= 0xDFFF) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)) {
            return false;
        }

        if (0xFEFF != $mUcs4) {
            // BOM is legal but we don't want to output it.
            $out[] = $mUcs4;
        }

        // Reset the decoder for the next sequence.
        $mState = 0;
        $mUcs4 = 0;
        $mBytes = 1;
    }

    if (0 != $mState) {
        // Input ended before the last multi-octet sequence was complete.
        return false;
    }

    return $out;
}
169 
/**
 * Encodes an array of Unicode code points as a UTF-8 byte string.
 *
 * U+FEFF (byte order mark) entries are skipped and produce no output.
 * Encoding is abandoned as soon as an entry is negative, is a surrogate
 * (U+D800..U+DFFF) or lies above U+10FFFF.
 *
 * @param array $arr Code points to encode. Taken by reference, but only read.
 * @return string|false The UTF-8 bytes, or false if an entry cannot be encoded.
 */
function unicodeToUtf8(&$arr)
{
    $encoded = '';

    foreach ($arr as $codePoint) {
        if ($codePoint < 0) {
            return false;
        }

        if ($codePoint <= 0x007f) {
            // One octet: 0xxxxxxx
            $encoded .= chr($codePoint);
            continue;
        }

        if ($codePoint <= 0x07ff) {
            // Two octets: 110xxxxx 10xxxxxx
            $encoded .= chr(0xc0 | ($codePoint >> 6))
                . chr(0x80 | ($codePoint & 0x003f));
            continue;
        }

        if ($codePoint == 0xFEFF) {
            // Byte order mark: deliberately emit nothing.
            continue;
        }

        if ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) {
            // Surrogates have no UTF-8 encoding.
            return false;
        }

        if ($codePoint <= 0xffff) {
            // Three octets: 1110xxxx 10xxxxxx 10xxxxxx
            $encoded .= chr(0xe0 | ($codePoint >> 12))
                . chr(0x80 | (($codePoint >> 6) & 0x003f))
                . chr(0x80 | ($codePoint & 0x003f));
            continue;
        }

        if ($codePoint <= 0x10ffff) {
            // Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            $encoded .= chr(0xf0 | ($codePoint >> 18))
                . chr(0x80 | (($codePoint >> 12) & 0x3f))
                . chr(0x80 | (($codePoint >> 6) & 0x3f))
                . chr(0x80 | ($codePoint & 0x3f));
            continue;
        }

        // Beyond the Unicode range.
        return false;
    }

    return $encoded;
}
/**
 * Checks that a text is well-formed UTF-8, line by line.
 *
 * The text is split on "\n" and each line is passed to utf8ToUnicode().
 * Checking stops at the first line that fails to decode: that line's
 * 1-based number and content are reported through dbg_error_log() (a
 * logger that is not defined in this file) and false is returned.
 *
 * @param string $ics Text to validate; the parameter name suggests
 *                    iCalendar data, but nothing here depends on that.
 * @return bool True when every line decodes, false otherwise.
 */
function check_string($ics){
    $ics_file = explode("\n", $ics);

    foreach ($ics_file as $line => $str) {
        if (false === utf8ToUnicode($str)) {
            // "%d" replaces an invalid "% i" conversion that kept the line
            // number out of the message (assuming dbg_error_log() formats
            // its arguments sprintf-style, as the "%s" suggests).
            dbg_error_log( "LOG check_string","error on lines %d invalid character in string %s" , ($line + 1), $str );
            return false;
        }
    }

//    dbg_error_log( "LOG check_string","the string is UTF8 compliant");
    return true;
}
228 ?>