1 /*
2 * Copyright 2013 The Netty Project
3 *
4 * The Netty Project licenses this file to you under the Apache License,
5 * version 2.0 (the "License"); you may not use this file except in compliance
6 * with the License. You may obtain a copy of the License at:
7 *
8 * https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations
14 * under the License.
15 */
16 package io.netty.handler.codec.xml;
17
18 import static io.netty.util.internal.ObjectUtil.checkPositive;
19
20 import io.netty.buffer.ByteBuf;
21 import io.netty.channel.ChannelHandlerContext;
22 import io.netty.handler.codec.ByteToMessageDecoder;
23 import io.netty.handler.codec.CorruptedFrameException;
24 import io.netty.handler.codec.TooLongFrameException;
25
26 import java.util.List;
27
28 /**
29 * A frame decoder for single separate XML based message streams.
30 * <p/>
31 * A couple examples will better help illustrate
32 * what this decoder actually does.
33 * <p/>
34 * Given an input array of bytes split over 3 frames like this:
35 * <pre>
36 * +-----+-----+-----------+
37 * | <an | Xml | Element/> |
38 * +-----+-----+-----------+
39 * </pre>
40 * <p/>
41 * this decoder would output a single frame:
42 * <p/>
43 * <pre>
44 * +-----------------+
45 * | <anXmlElement/> |
46 * +-----------------+
47 * </pre>
48 *
49 * Given an input array of bytes split over 5 frames like this:
50 * <pre>
51 * +-----+-----+-----------+-----+----------------------------------+
52 * | <an | Xml | Element/> | <ro | ot><child>content</child></root> |
53 * +-----+-----+-----------+-----+----------------------------------+
54 * </pre>
55 * <p/>
56 * this decoder would output two frames:
57 * <p/>
58 * <pre>
59 * +-----------------+-------------------------------------+
60 * | <anXmlElement/> | <root><child>content</child></root> |
61 * +-----------------+-------------------------------------+
62 * </pre>
63 *
64 * <p/>
65 * The byte stream is expected to be in UTF-8 character encoding or ASCII. The current implementation
66 * uses direct {@code byte} to {@code char} cast and then compares that {@code char} to a few low range
67 * ASCII characters like {@code '<'}, {@code '>'} or {@code '/'}. UTF-8 is not using low range [0..0x7F]
68 * byte values for multibyte codepoint representations therefore fully supported by this implementation.
69 * <p/>
70 * Please note that this decoder is not suitable for
71 * xml streaming protocols such as
72 * <a href="https://xmpp.org/rfcs/rfc6120.html">XMPP</a>,
73 * where an initial xml element opens the stream and only
74 * gets closed at the end of the session, although this class
75 * could probably allow for such type of message flow with
76 * minor modifications.
77 */
78 public class XmlFrameDecoder extends ByteToMessageDecoder {
79
80 private final int maxFrameLength;
81
82 public XmlFrameDecoder(int maxFrameLength) {
83 this.maxFrameLength = checkPositive(maxFrameLength, "maxFrameLength");
84 }
85
86 @Override
87 protected void decode(ChannelHandlerContext ctx, ByteBuf in, List<Object> out) throws Exception {
88 boolean openingBracketFound = false;
89 boolean atLeastOneXmlElementFound = false;
90 boolean inCDATASection = false;
91 long openBracketsCount = 0;
92 int length = 0;
93 int leadingWhiteSpaceCount = 0;
94 final int bufferLength = in.writerIndex();
95
96 if (bufferLength > maxFrameLength) {
97 // bufferLength exceeded maxFrameLength; dropping frame
98 in.skipBytes(in.readableBytes());
99 fail(bufferLength);
100 return;
101 }
102
103 for (int i = in.readerIndex(); i < bufferLength; i++) {
104 final byte readByte = in.getByte(i);
105 if (!openingBracketFound && Character.isWhitespace(readByte)) {
106 // xml has not started and whitespace char found
107 leadingWhiteSpaceCount++;
108 } else if (!openingBracketFound && readByte != '<') {
109 // garbage found before xml start
110 fail(ctx);
111 in.skipBytes(in.readableBytes());
112 return;
113 } else if (!inCDATASection && readByte == '<') {
114 openingBracketFound = true;
115
116 if (i < bufferLength - 1) {
117 final byte peekAheadByte = in.getByte(i + 1);
118 if (peekAheadByte == '/') {
119 // found </, we must check if it is enclosed
120 int peekFurtherAheadIndex = i + 2;
121 while (peekFurtherAheadIndex <= bufferLength - 1) {
122 //if we have </ and enclosing > we can decrement openBracketsCount
123 if (in.getByte(peekFurtherAheadIndex) == '>') {
124 openBracketsCount--;
125 break;
126 }
127 peekFurtherAheadIndex++;
128 }
129 } else if (isValidStartCharForXmlElement(peekAheadByte)) {
130 atLeastOneXmlElementFound = true;
131 // char after < is a valid xml element start char,
132 // incrementing openBracketsCount
133 openBracketsCount++;
134 } else if (peekAheadByte == '!') {
135 if (isCommentBlockStart(in, i)) {
136 // <!-- comment --> start found
137 openBracketsCount++;
138 } else if (isCDATABlockStart(in, i)) {
139 // <![CDATA[ start found
140 openBracketsCount++;
141 inCDATASection = true;
142 }
143 } else if (peekAheadByte == '?') {
144 // <?xml ?> start found
145 openBracketsCount++;
146 }
147 }
148 } else if (!inCDATASection && readByte == '/') {
149 if (i < bufferLength - 1 && in.getByte(i + 1) == '>') {
150 // found />, decrementing openBracketsCount
151 openBracketsCount--;
152 }
153 } else if (readByte == '>') {
154 length = i + 1;
155
156 if (i - 1 > -1) {
157 final byte peekBehindByte = in.getByte(i - 1);
158
159 if (!inCDATASection) {
160 if (peekBehindByte == '?') {
161 // an <?xml ?> tag was closed
162 openBracketsCount--;
163 } else if (peekBehindByte == '-' && i - 2 > -1 && in.getByte(i - 2) == '-') {
164 // a <!-- comment --> was closed
165 openBracketsCount--;
166 }
167 } else if (peekBehindByte == ']' && i - 2 > -1 && in.getByte(i - 2) == ']') {
168 // a <![CDATA[...]]> block was closed
169 openBracketsCount--;
170 inCDATASection = false;
171 }
172 }
173
174 if (atLeastOneXmlElementFound && openBracketsCount == 0) {
175 // xml is balanced, bailing out
176 break;
177 }
178 }
179 }
180
181 final int readerIndex = in.readerIndex();
182 int xmlElementLength = length - readerIndex;
183
184 if (openBracketsCount == 0 && xmlElementLength > 0) {
185 if (readerIndex + xmlElementLength >= bufferLength) {
186 xmlElementLength = in.readableBytes();
187 }
188 final ByteBuf frame =
189 extractFrame(in, readerIndex + leadingWhiteSpaceCount, xmlElementLength - leadingWhiteSpaceCount);
190 in.skipBytes(xmlElementLength);
191 out.add(frame);
192 }
193 }
194
195 private void fail(long frameLength) {
196 if (frameLength > 0) {
197 throw new TooLongFrameException(
198 "frame length exceeds " + maxFrameLength + ": " + frameLength + " - discarded");
199 } else {
200 throw new TooLongFrameException(
201 "frame length exceeds " + maxFrameLength + " - discarding");
202 }
203 }
204
205 private static void fail(ChannelHandlerContext ctx) {
206 ctx.fireExceptionCaught(new CorruptedFrameException("frame contains content before the xml starts"));
207 }
208
209 private static ByteBuf extractFrame(ByteBuf buffer, int index, int length) {
210 return buffer.copy(index, length);
211 }
212
213 /**
214 * Asks whether the given byte is a valid
215 * start char for an xml element name.
216 * <p/>
217 * Please refer to the
218 * <a href="https://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar">NameStartChar</a>
219 * formal definition in the W3C XML spec for further info.
220 *
221 * @param b the input char
222 * @return true if the char is a valid start char
223 */
224 private static boolean isValidStartCharForXmlElement(final byte b) {
225 return b >= 'a' && b <= 'z' || b >= 'A' && b <= 'Z' || b == ':' || b == '_';
226 }
227
228 private static boolean isCommentBlockStart(final ByteBuf in, final int i) {
229 return i < in.writerIndex() - 3
230 && in.getByte(i + 2) == '-'
231 && in.getByte(i + 3) == '-';
232 }
233
234 private static boolean isCDATABlockStart(final ByteBuf in, final int i) {
235 return i < in.writerIndex() - 8
236 && in.getByte(i + 2) == '['
237 && in.getByte(i + 3) == 'C'
238 && in.getByte(i + 4) == 'D'
239 && in.getByte(i + 5) == 'A'
240 && in.getByte(i + 6) == 'T'
241 && in.getByte(i + 7) == 'A'
242 && in.getByte(i + 8) == '[';
243 }
244
245 }