encodedstream.h
1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
16 #define RAPIDJSON_ENCODEDSTREAM_H_
17 
18 #include "stream.h"
19 #include "memorystream.h"
20 
21 #ifdef __GNUC__
22 RAPIDJSON_DIAG_PUSH
23 RAPIDJSON_DIAG_OFF(effc++)
24 #endif
25 
26 #ifdef __clang__
27 RAPIDJSON_DIAG_PUSH
28 RAPIDJSON_DIAG_OFF(padded)
29 #endif
30 
31 RAPIDJSON_NAMESPACE_BEGIN
32 
33 //! Input byte stream wrapper with a statically bound encoding.
34 /*!
35  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
36  \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
37 */
38 template <typename Encoding, typename InputByteStream>
40  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
41 public:
42  typedef typename Encoding::Ch Ch;
43 
44  EncodedInputStream(InputByteStream& is) : is_(is) {
45  current_ = Encoding::TakeBOM(is_);
46  }
47 
48  Ch Peek() const { return current_; }
49  Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
50  size_t Tell() const { return is_.Tell(); }
51 
52  // Not implemented
53  void Put(Ch) { RAPIDJSON_ASSERT(false); }
54  void Flush() { RAPIDJSON_ASSERT(false); }
55  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
56  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
57 
58 private:
60  EncodedInputStream& operator=(const EncodedInputStream&);
61 
62  InputByteStream& is_;
63  Ch current_;
64 };
65 
66 //! Specialized for UTF8 MemoryStream.
67 template <>
69 public:
70  typedef UTF8<>::Ch Ch;
71 
72  EncodedInputStream(MemoryStream& is) : is_(is) {
73  if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
74  if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
75  if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
76  }
77  Ch Peek() const { return is_.Peek(); }
78  Ch Take() { return is_.Take(); }
79  size_t Tell() const { return is_.Tell(); }
80 
81  // Not implemented
82  void Put(Ch) {}
83  void Flush() {}
84  Ch* PutBegin() { return 0; }
85  size_t PutEnd(Ch*) { return 0; }
86 
87  MemoryStream& is_;
88 
89 private:
91  EncodedInputStream& operator=(const EncodedInputStream&);
92 };
93 
94 //! Output byte stream wrapper with statically bound encoding.
95 /*!
96  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
97  \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream.
98 */
99 template <typename Encoding, typename OutputByteStream>
101  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
102 public:
103  typedef typename Encoding::Ch Ch;
104 
105  EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
106  if (putBOM)
107  Encoding::PutBOM(os_);
108  }
109 
110  void Put(Ch c) { Encoding::Put(os_, c); }
111  void Flush() { os_.Flush(); }
112 
113  // Not implemented
114  Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
115  Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
116  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
117  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
118  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
119 
120 private:
122  EncodedOutputStream& operator=(const EncodedOutputStream&);
123 
124  OutputByteStream& os_;
125 };
126 
127 #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
128 
129 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
130 /*!
131  \tparam CharType Type of character for reading.
132  \tparam InputByteStream type of input byte stream to be wrapped.
133 */
134 template <typename CharType, typename InputByteStream>
136  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
137 public:
138  typedef CharType Ch;
139 
140  //! Constructor.
141  /*!
142  \param is input stream to be wrapped.
143  \param type UTF encoding type if it is not detected from the stream.
144  */
145  AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
146  RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
147  DetectType();
148  static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
149  takeFunc_ = f[type_];
150  current_ = takeFunc_(*is_);
151  }
152 
153  UTFType GetType() const { return type_; }
154  bool HasBOM() const { return hasBOM_; }
155 
156  Ch Peek() const { return current_; }
157  Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
158  size_t Tell() const { return is_->Tell(); }
159 
160  // Not implemented
161  void Put(Ch) { RAPIDJSON_ASSERT(false); }
162  void Flush() { RAPIDJSON_ASSERT(false); }
163  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
164  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
165 
166 private:
167  AutoUTFInputStream(const AutoUTFInputStream&);
168  AutoUTFInputStream& operator=(const AutoUTFInputStream&);
169 
170  // Detect encoding type with BOM or RFC 4627
171  void DetectType() {
172  // BOM (Byte Order Mark):
173  // 00 00 FE FF UTF-32BE
174  // FF FE 00 00 UTF-32LE
175  // FE FF UTF-16BE
176  // FF FE UTF-16LE
177  // EF BB BF UTF-8
178 
179  const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4());
180  if (!c)
181  return;
182 
183  unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
184  hasBOM_ = false;
185  if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
186  else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
187  else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); }
188  else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); }
189  else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); }
190 
191  // RFC 4627: Section 3
192  // "Since the first two characters of a JSON text will always be ASCII
193  // characters [RFC0020], it is possible to determine whether an octet
194  // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
195  // at the pattern of nulls in the first four octets."
196  // 00 00 00 xx UTF-32BE
197  // 00 xx 00 xx UTF-16BE
198  // xx 00 00 00 UTF-32LE
199  // xx 00 xx 00 UTF-16LE
200  // xx xx xx xx UTF-8
201 
202  if (!hasBOM_) {
203  int pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
204  switch (pattern) {
205  case 0x08: type_ = kUTF32BE; break;
206  case 0x0A: type_ = kUTF16BE; break;
207  case 0x01: type_ = kUTF32LE; break;
208  case 0x05: type_ = kUTF16LE; break;
209  case 0x0F: type_ = kUTF8; break;
210  default: break; // Use type defined by user.
211  }
212  }
213 
214  // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
215  if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
216  if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
217  }
218 
219  typedef Ch (*TakeFunc)(InputByteStream& is);
220  InputByteStream* is_;
221  UTFType type_;
222  Ch current_;
223  TakeFunc takeFunc_;
224  bool hasBOM_;
225 };
226 
227 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
228 /*!
229  \tparam CharType Type of character for writing.
230  \tparam OutputByteStream type of output byte stream to be wrapped.
231 */
232 template <typename CharType, typename OutputByteStream>
234  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
235 public:
236  typedef CharType Ch;
237 
238  //! Constructor.
239  /*!
240  \param os output stream to be wrapped.
241  \param type UTF encoding type.
242  \param putBOM Whether to write BOM at the beginning of the stream.
243  */
244  AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
245  RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
246 
247  // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
248  if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
249  if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
250 
251  static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
252  putFunc_ = f[type_];
253 
254  if (putBOM)
255  PutBOM();
256  }
257 
258  UTFType GetType() const { return type_; }
259 
260  void Put(Ch c) { putFunc_(*os_, c); }
261  void Flush() { os_->Flush(); }
262 
263  // Not implemented
264  Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
265  Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
266  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
267  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
268  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
269 
270 private:
271  AutoUTFOutputStream(const AutoUTFOutputStream&);
272  AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
273 
274  void PutBOM() {
275  typedef void (*PutBOMFunc)(OutputByteStream&);
276  static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
277  f[type_](*os_);
278  }
279 
280  typedef void (*PutFunc)(OutputByteStream&, Ch);
281 
282  OutputByteStream* os_;
283  UTFType type_;
284  PutFunc putFunc_;
285 };
286 
287 #undef RAPIDJSON_ENCODINGS_FUNC
288 
289 RAPIDJSON_NAMESPACE_END
290 
291 #ifdef __clang__
292 RAPIDJSON_DIAG_POP
293 #endif
294 
295 #ifdef __GNUC__
296 RAPIDJSON_DIAG_POP
297 #endif
298 
299 #endif // RAPIDJSON_ENCODEDSTREAM_H_
rapidjson::kUTF16LE
UTF-16 little endian.
Definition: encodings.h:605
rapidjson::EncodedInputStream
Input byte stream wrapper with a statically bound encoding.
Definition: encodedstream.h:39
rapidjson::MemoryStream
Represents an in-memory input byte stream.
Definition: memorystream.h:40
rapidjson::kUTF32LE
UTF-32 little endian.
Definition: encodings.h:607
RAPIDJSON_ASSERT
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:437
rapidjson::AutoUTFInputStream::AutoUTFInputStream
AutoUTFInputStream(InputByteStream &is, UTFType type=kUTF8)
Constructor.
Definition: encodedstream.h:145
rapidjson::kUTF16BE
UTF-16 big endian.
Definition: encodings.h:606
rapidjson::AutoUTFOutputStream::AutoUTFOutputStream
AutoUTFOutputStream(OutputByteStream &os, UTFType type, bool putBOM)
Constructor.
Definition: encodedstream.h:244
rapidjson::UTFType
UTFType
Runtime-specified UTF encoding type of a stream.
Definition: encodings.h:603
rapidjson::EncodedOutputStream
Output byte stream wrapper with statically bound encoding.
Definition: encodedstream.h:100
rapidjson::kUTF8
UTF-8.
Definition: encodings.h:604
rapidjson::UTF8
UTF-8 encoding.
Definition: encodings.h:96
rapidjson::AutoUTFOutputStream
Output stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:233
rapidjson::AutoUTFInputStream
Input stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:135
rapidjson::kUTF32BE
UTF-32 big endian.
Definition: encodings.h:608
RAPIDJSON_STATIC_ASSERT
#define RAPIDJSON_STATIC_ASSERT(x)
(Internal) macro to check for conditions at compile-time
Definition: rapidjson.h:476