View Javadoc
1   /*
2    * Copyright 2014 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.channel.epoll;
17  
18  import io.netty.channel.EventLoop;
19  import io.netty.channel.EventLoopGroup;
20  import io.netty.channel.SelectStrategy;
21  import io.netty.channel.SingleThreadEventLoop;
22  import io.netty.channel.epoll.AbstractEpollChannel.AbstractEpollUnsafe;
23  import io.netty.channel.unix.FileDescriptor;
24  import io.netty.channel.unix.IovArray;
25  import io.netty.util.IntSupplier;
26  import io.netty.util.collection.IntObjectHashMap;
27  import io.netty.util.collection.IntObjectMap;
28  import io.netty.util.concurrent.RejectedExecutionHandler;
29  import io.netty.util.internal.ObjectUtil;
30  import io.netty.util.internal.PlatformDependent;
31  import io.netty.util.internal.logging.InternalLogger;
32  import io.netty.util.internal.logging.InternalLoggerFactory;
33  
34  import java.io.IOException;
35  import java.util.Queue;
36  import java.util.concurrent.Executor;
37  import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
38  
39  import static java.lang.Math.min;
40  
41  /**
42   * {@link EventLoop} which uses epoll under the covers. Only works on Linux!
43   */
44  class EpollEventLoop extends SingleThreadEventLoop {
45      private static final InternalLogger logger = InternalLoggerFactory.getInstance(EpollEventLoop.class);
46      private static final AtomicIntegerFieldUpdater<EpollEventLoop> WAKEN_UP_UPDATER =
47              AtomicIntegerFieldUpdater.newUpdater(EpollEventLoop.class, "wakenUp");
48  
49      static {
50          // Ensure JNI is initialized by the time this class is loaded by this time!
51          // We use unix-common methods in this class which are backed by JNI methods.
52          Epoll.ensureAvailability();
53      }
54  
55      // Pick a number that no task could have previously used.
56      private long prevDeadlineNanos = nanoTime() - 1;
57      private final FileDescriptor epollFd;
58      private final FileDescriptor eventFd;
59      private final FileDescriptor timerFd;
60      private final IntObjectMap<AbstractEpollChannel> channels = new IntObjectHashMap<AbstractEpollChannel>(4096);
61      private final boolean allowGrowing;
62      private final EpollEventArray events;
63  
64      // These are initialized on first use
65      private IovArray iovArray;
66      private NativeDatagramPacketArray datagramPacketArray;
67  
68      private final SelectStrategy selectStrategy;
69      private final IntSupplier selectNowSupplier = new IntSupplier() {
70          @Override
71          public int get() throws Exception {
72              return epollWaitNow();
73          }
74      };
75      @SuppressWarnings("unused") // AtomicIntegerFieldUpdater
76      private volatile int wakenUp;
77      private volatile int ioRatio = 50;
78  
79      // See http://man7.org/linux/man-pages/man2/timerfd_create.2.html.
80      private static final long MAX_SCHEDULED_TIMERFD_NS = 999999999;
81  
82      EpollEventLoop(EventLoopGroup parent, Executor executor, int maxEvents,
83                     SelectStrategy strategy, RejectedExecutionHandler rejectedExecutionHandler) {
84          super(parent, executor, false, DEFAULT_MAX_PENDING_TASKS, rejectedExecutionHandler);
85          selectStrategy = ObjectUtil.checkNotNull(strategy, "strategy");
86          if (maxEvents == 0) {
87              allowGrowing = true;
88              events = new EpollEventArray(4096);
89          } else {
90              allowGrowing = false;
91              events = new EpollEventArray(maxEvents);
92          }
93          boolean success = false;
94          FileDescriptor epollFd = null;
95          FileDescriptor eventFd = null;
96          FileDescriptor timerFd = null;
97          try {
98              this.epollFd = epollFd = Native.newEpollCreate();
99              this.eventFd = eventFd = Native.newEventFd();
100             try {
101                 Native.epollCtlAdd(epollFd.intValue(), eventFd.intValue(), Native.EPOLLIN);
102             } catch (IOException e) {
103                 throw new IllegalStateException("Unable to add eventFd filedescriptor to epoll", e);
104             }
105             this.timerFd = timerFd = Native.newTimerFd();
106             try {
107                 Native.epollCtlAdd(epollFd.intValue(), timerFd.intValue(), Native.EPOLLIN | Native.EPOLLET);
108             } catch (IOException e) {
109                 throw new IllegalStateException("Unable to add timerFd filedescriptor to epoll", e);
110             }
111             success = true;
112         } finally {
113             if (!success) {
114                 if (epollFd != null) {
115                     try {
116                         epollFd.close();
117                     } catch (Exception e) {
118                         // ignore
119                     }
120                 }
121                 if (eventFd != null) {
122                     try {
123                         eventFd.close();
124                     } catch (Exception e) {
125                         // ignore
126                     }
127                 }
128                 if (timerFd != null) {
129                     try {
130                         timerFd.close();
131                     } catch (Exception e) {
132                         // ignore
133                     }
134                 }
135             }
136         }
137     }
138 
139     /**
140      * Return a cleared {@link IovArray} that can be used for writes in this {@link EventLoop}.
141      */
142     IovArray cleanIovArray() {
143         if (iovArray == null) {
144             iovArray = new IovArray();
145         } else {
146             iovArray.clear();
147         }
148         return iovArray;
149     }
150 
151     /**
152      * Return a cleared {@link NativeDatagramPacketArray} that can be used for writes in this {@link EventLoop}.
153      */
154     NativeDatagramPacketArray cleanDatagramPacketArray() {
155         if (datagramPacketArray == null) {
156             datagramPacketArray = new NativeDatagramPacketArray();
157         } else {
158             datagramPacketArray.clear();
159         }
160         return datagramPacketArray;
161     }
162 
163     @Override
164     protected void wakeup(boolean inEventLoop) {
165         if (!inEventLoop && WAKEN_UP_UPDATER.compareAndSet(this, 0, 1)) {
166             // write to the evfd which will then wake-up epoll_wait(...)
167             Native.eventFdWrite(eventFd.intValue(), 1L);
168         }
169     }
170 
171     /**
172      * Register the given epoll with this {@link EventLoop}.
173      */
174     void add(AbstractEpollChannel ch) throws IOException {
175         assert inEventLoop();
176         int fd = ch.socket.intValue();
177         Native.epollCtlAdd(epollFd.intValue(), fd, ch.flags);
178         channels.put(fd, ch);
179     }
180 
181     /**
182      * The flags of the given epoll was modified so update the registration
183      */
184     void modify(AbstractEpollChannel ch) throws IOException {
185         assert inEventLoop();
186         Native.epollCtlMod(epollFd.intValue(), ch.socket.intValue(), ch.flags);
187     }
188 
189     /**
190      * Deregister the given epoll from this {@link EventLoop}.
191      */
192     void remove(AbstractEpollChannel ch) throws IOException {
193         assert inEventLoop();
194 
195         if (ch.isOpen()) {
196             int fd = ch.socket.intValue();
197             if (channels.remove(fd) != null) {
198                 // Remove the epoll. This is only needed if it's still open as otherwise it will be automatically
199                 // removed once the file-descriptor is closed.
200                 Native.epollCtlDel(epollFd.intValue(), ch.fd().intValue());
201             }
202         }
203     }
204 
205     @Override
206     protected Queue<Runnable> newTaskQueue(int maxPendingTasks) {
207         // This event loop never calls takeTask()
208         return maxPendingTasks == Integer.MAX_VALUE ? PlatformDependent.<Runnable>newMpscQueue()
209                                                     : PlatformDependent.<Runnable>newMpscQueue(maxPendingTasks);
210     }
211 
212     /**
213      * Returns the percentage of the desired amount of time spent for I/O in the event loop.
214      */
215     public int getIoRatio() {
216         return ioRatio;
217     }
218 
219     /**
220      * Sets the percentage of the desired amount of time spent for I/O in the event loop.  The default value is
221      * {@code 50}, which means the event loop will try to spend the same amount of time for I/O as for non-I/O tasks.
222      */
223     public void setIoRatio(int ioRatio) {
224         if (ioRatio <= 0 || ioRatio > 100) {
225             throw new IllegalArgumentException("ioRatio: " + ioRatio + " (expected: 0 < ioRatio <= 100)");
226         }
227         this.ioRatio = ioRatio;
228     }
229 
230     @Override
231     public int registeredChannels() {
232         return channels.size();
233     }
234 
235     private int epollWait(boolean oldWakeup) throws IOException {
236         // If a task was submitted when wakenUp value was 1, the task didn't get a chance to produce wakeup event.
237         // So we need to check task queue again before calling epoll_wait. If we don't, the task might be pended
238         // until epoll_wait was timed out. It might be pended until idle timeout if IdleStateHandler existed
239         // in pipeline.
240         if (oldWakeup && hasTasks()) {
241             return epollWaitNow();
242         }
243 
244         int delaySeconds;
245         int delayNanos;
246         long curDeadlineNanos = deadlineNanos();
247         if (curDeadlineNanos == prevDeadlineNanos) {
248             delaySeconds = -1;
249             delayNanos = -1;
250         } else {
251             long totalDelay = delayNanos(System.nanoTime());
252             prevDeadlineNanos = curDeadlineNanos;
253             delaySeconds = (int) min(totalDelay / 1000000000L, Integer.MAX_VALUE);
254             delayNanos = (int) min(totalDelay - delaySeconds * 1000000000L, MAX_SCHEDULED_TIMERFD_NS);
255         }
256         return Native.epollWait(epollFd, events, timerFd, delaySeconds, delayNanos);
257     }
258 
259     private int epollWaitNow() throws IOException {
260         return Native.epollWait(epollFd, events, timerFd, 0, 0);
261     }
262 
263     private int epollBusyWait() throws IOException {
264         return Native.epollBusyWait(epollFd, events);
265     }
266 
267     @Override
268     protected void run() {
269         for (;;) {
270             try {
271                 int strategy = selectStrategy.calculateStrategy(selectNowSupplier, hasTasks());
272                 switch (strategy) {
273                     case SelectStrategy.CONTINUE:
274                         continue;
275 
276                     case SelectStrategy.BUSY_WAIT:
277                         strategy = epollBusyWait();
278                         break;
279 
280                     case SelectStrategy.SELECT:
281                         strategy = epollWait(WAKEN_UP_UPDATER.getAndSet(this, 0) == 1);
282 
283                         // 'wakenUp.compareAndSet(false, true)' is always evaluated
284                         // before calling 'selector.wakeup()' to reduce the wake-up
285                         // overhead. (Selector.wakeup() is an expensive operation.)
286                         //
287                         // However, there is a race condition in this approach.
288                         // The race condition is triggered when 'wakenUp' is set to
289                         // true too early.
290                         //
291                         // 'wakenUp' is set to true too early if:
292                         // 1) Selector is waken up between 'wakenUp.set(false)' and
293                         //    'selector.select(...)'. (BAD)
294                         // 2) Selector is waken up between 'selector.select(...)' and
295                         //    'if (wakenUp.get()) { ... }'. (OK)
296                         //
297                         // In the first case, 'wakenUp' is set to true and the
298                         // following 'selector.select(...)' will wake up immediately.
299                         // Until 'wakenUp' is set to false again in the next round,
300                         // 'wakenUp.compareAndSet(false, true)' will fail, and therefore
301                         // any attempt to wake up the Selector will fail, too, causing
302                         // the following 'selector.select(...)' call to block
303                         // unnecessarily.
304                         //
305                         // To fix this problem, we wake up the selector again if wakenUp
306                         // is true immediately after selector.select(...).
307                         // It is inefficient in that it wakes up the selector for both
308                         // the first case (BAD - wake-up required) and the second case
309                         // (OK - no wake-up required).
310 
311                         if (wakenUp == 1) {
312                             Native.eventFdWrite(eventFd.intValue(), 1L);
313                         }
314                         // fallthrough
315                     default:
316                 }
317 
318                 final int ioRatio = this.ioRatio;
319                 if (ioRatio == 100) {
320                     try {
321                         if (strategy > 0) {
322                             processReady(events, strategy);
323                         }
324                     } finally {
325                         // Ensure we always run tasks.
326                         runAllTasks();
327                     }
328                 } else {
329                     final long ioStartTime = System.nanoTime();
330 
331                     try {
332                         if (strategy > 0) {
333                             processReady(events, strategy);
334                         }
335                     } finally {
336                         // Ensure we always run tasks.
337                         final long ioTime = System.nanoTime() - ioStartTime;
338                         runAllTasks(ioTime * (100 - ioRatio) / ioRatio);
339                     }
340                 }
341                 if (allowGrowing && strategy == events.length()) {
342                     //increase the size of the array as we needed the whole space for the events
343                     events.increase();
344                 }
345             } catch (Throwable t) {
346                 handleLoopException(t);
347             }
348             // Always handle shutdown even if the loop processing threw an exception.
349             try {
350                 if (isShuttingDown()) {
351                     closeAll();
352                     if (confirmShutdown()) {
353                         break;
354                     }
355                 }
356             } catch (Throwable t) {
357                 handleLoopException(t);
358             }
359         }
360     }
361 
362     /**
363      * Visible only for testing!
364      */
365     void handleLoopException(Throwable t) {
366         logger.warn("Unexpected exception in the selector loop.", t);
367 
368         // Prevent possible consecutive immediate failures that lead to
369         // excessive CPU consumption.
370         try {
371             Thread.sleep(1000);
372         } catch (InterruptedException e) {
373             // Ignore.
374         }
375     }
376 
377     private void closeAll() {
378         try {
379             epollWaitNow();
380         } catch (IOException ignore) {
381             // ignore on close
382         }
383         // Using the intermediate collection to prevent ConcurrentModificationException.
384         // In the `close()` method, the channel is deleted from `channels` map.
385         AbstractEpollChannel[] localChannels = channels.values().toArray(new AbstractEpollChannel[0]);
386 
387         for (AbstractEpollChannel ch : localChannels) {
388             ch.unsafe().close(ch.unsafe().voidPromise());
389         }
390     }
391 
392     private void processReady(EpollEventArray events, int ready) {
393         for (int i = 0; i < ready; i ++) {
394             final int fd = events.fd(i);
395             if (fd == eventFd.intValue()) {
396                 // consume wakeup event.
397                 Native.eventFdRead(fd);
398             } else if (fd == timerFd.intValue()) {
399                 // consume wakeup event, necessary because the timer is added with ET mode.
400                 Native.timerFdRead(fd);
401             } else {
402                 final long ev = events.events(i);
403 
404                 AbstractEpollChannel ch = channels.get(fd);
405                 if (ch != null) {
406                     // Don't change the ordering of processing EPOLLOUT | EPOLLRDHUP / EPOLLIN if you're not 100%
407                     // sure about it!
408                     // Re-ordering can easily introduce bugs and bad side-effects, as we found out painfully in the
409                     // past.
410                     AbstractEpollUnsafe unsafe = (AbstractEpollUnsafe) ch.unsafe();
411 
412                     // First check for EPOLLOUT as we may need to fail the connect ChannelPromise before try
413                     // to read from the file descriptor.
414                     // See https://github.com/netty/netty/issues/3785
415                     //
416                     // It is possible for an EPOLLOUT or EPOLLERR to be generated when a connection is refused.
417                     // In either case epollOutReady() will do the correct thing (finish connecting, or fail
418                     // the connection).
419                     // See https://github.com/netty/netty/issues/3848
420                     if ((ev & (Native.EPOLLERR | Native.EPOLLOUT)) != 0) {
421                         // Force flush of data as the epoll is writable again
422                         unsafe.epollOutReady();
423                     }
424 
425                     // Check EPOLLIN before EPOLLRDHUP to ensure all data is read before shutting down the input.
426                     // See https://github.com/netty/netty/issues/4317.
427                     //
428                     // If EPOLLIN or EPOLLERR was received and the channel is still open call epollInReady(). This will
429                     // try to read from the underlying file descriptor and so notify the user about the error.
430                     if ((ev & (Native.EPOLLERR | Native.EPOLLIN)) != 0) {
431                         // The Channel is still open and there is something to read. Do it now.
432                         unsafe.epollInReady();
433                     }
434 
435                     // Check if EPOLLRDHUP was set, this will notify us for connection-reset in which case
436                     // we may close the channel directly or try to read more data depending on the state of the
437                     // Channel and als depending on the AbstractEpollChannel subtype.
438                     if ((ev & Native.EPOLLRDHUP) != 0) {
439                         unsafe.epollRdHupReady();
440                     }
441                 } else {
442                     // We received an event for an fd which we not use anymore. Remove it from the epoll_event set.
443                     try {
444                         Native.epollCtlDel(epollFd.intValue(), fd);
445                     } catch (IOException ignore) {
446                         // This can happen but is nothing we need to worry about as we only try to delete
447                         // the fd from the epoll set as we not found it in our mappings. So this call to
448                         // epollCtlDel(...) is just to ensure we cleanup stuff and so may fail if it was
449                         // deleted before or the file descriptor was closed before.
450                     }
451                 }
452             }
453         }
454     }
455 
456     @Override
457     protected void cleanup() {
458         try {
459             try {
460                 epollFd.close();
461             } catch (IOException e) {
462                 logger.warn("Failed to close the epoll fd.", e);
463             }
464             try {
465                 eventFd.close();
466             } catch (IOException e) {
467                 logger.warn("Failed to close the event fd.", e);
468             }
469             try {
470                 timerFd.close();
471             } catch (IOException e) {
472                 logger.warn("Failed to close the timer fd.", e);
473             }
474         } finally {
475             // release native memory
476             if (iovArray != null) {
477                 iovArray.release();
478                 iovArray = null;
479             }
480             if (datagramPacketArray != null) {
481                 datagramPacketArray.release();
482                 datagramPacketArray = null;
483             }
484             events.free();
485         }
486     }
487 }