Diffstat (limited to 'src/unix/linux-core.c')
-rw-r--r--   src/unix/linux-core.c   816
1 file changed, 816 insertions, 0 deletions
diff --git a/src/unix/linux-core.c b/src/unix/linux-core.c
new file mode 100644
index 0000000000..f71dd2d6b9
--- /dev/null
+++ b/src/unix/linux-core.c
@@ -0,0 +1,816 @@
+/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "uv.h"
+#include "internal.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <net/if.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+#include <sys/sysinfo.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <time.h>
+
+#ifndef __ANDROID__
+#define HAVE_IFADDRS_H 1
+#endif
+
+#ifdef __UCLIBC__
+# if __UCLIBC_MAJOR__ < 0 || __UCLIBC_MINOR__ < 9 || __UCLIBC_SUBLEVEL__ < 32
+#  undef HAVE_IFADDRS_H
+# endif
+#endif
+#ifdef HAVE_IFADDRS_H
+# include <ifaddrs.h>
+# include <sys/socket.h>
+# include <net/ethernet.h>
+# include <linux/if_packet.h>
+#endif
+
+/* Available from 2.6.32 onwards. */
+#ifndef CLOCK_MONOTONIC_COARSE
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+/* This is rather annoying: CLOCK_BOOTTIME lives in <linux/time.h> but we can't
+ * include that file because it conflicts with <time.h>. We'll just have to
+ * define it ourselves.
+ */
+#ifndef CLOCK_BOOTTIME
+# define CLOCK_BOOTTIME 7
+#endif
+
+static int read_models(unsigned int numcpus, uv_cpu_info_t* ci);
+static int read_times(unsigned int numcpus, uv_cpu_info_t* ci);
+static void read_speeds(unsigned int numcpus, uv_cpu_info_t* ci);
+static unsigned long read_cpufreq(unsigned int cpunum);
+
+
+int uv__platform_loop_init(uv_loop_t* loop, int default_loop) {
+  int fd;
+
+  fd = uv__epoll_create1(UV__EPOLL_CLOEXEC);
+
+  /* epoll_create1() can fail either because it's not implemented (old kernel)
+   * or because it doesn't understand the EPOLL_CLOEXEC flag.
+   */
+  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
+    fd = uv__epoll_create(256);
+
+    if (fd != -1)
+      uv__cloexec(fd, 1);
+  }
+
+  loop->backend_fd = fd;
+  loop->inotify_fd = -1;
+  loop->inotify_watchers = NULL;
+
+  if (fd == -1)
+    return -errno;
+
+  return 0;
+}
+
+
+void uv__platform_loop_delete(uv_loop_t* loop) {
+  if (loop->inotify_fd == -1) return;
+  uv__io_stop(loop, &loop->inotify_read_watcher, UV__POLLIN);
+  uv__close(loop->inotify_fd);
+  loop->inotify_fd = -1;
+}
+
+
+void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
+  struct uv__epoll_event* events;
+  uintptr_t i;
+  uintptr_t nfds;
+
+  assert(loop->watchers != NULL);
+
+  events = (struct uv__epoll_event*) loop->watchers[loop->nwatchers];
+  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
+  if (events == NULL)
+    return;
+
+  /* Invalidate events with same file descriptor */
+  for (i = 0; i < nfds; i++)
+    if ((int) events[i].data == fd)
+      events[i].data = -1;
+}
+
+
+void uv__io_poll(uv_loop_t* loop, int timeout) {
+  struct uv__epoll_event events[1024];
+  struct uv__epoll_event* pe;
+  struct uv__epoll_event e;
+  QUEUE* q;
+  uv__io_t* w;
+  uint64_t base;
+  uint64_t diff;
+  int nevents;
+  int count;
+  int nfds;
+  int fd;
+  int op;
+  int i;
+
+  if (loop->nfds == 0) {
+    assert(QUEUE_EMPTY(&loop->watcher_queue));
+    return;
+  }
+
+  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
+    q = QUEUE_HEAD(&loop->watcher_queue);
+    QUEUE_REMOVE(q);
+    QUEUE_INIT(q);
+
+    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
+    assert(w->pevents != 0);
+    assert(w->fd >= 0);
+    assert(w->fd < (int) loop->nwatchers);
+
+    e.events = w->pevents;
+    e.data = w->fd;
+
+    if (w->events == 0)
+      op = UV__EPOLL_CTL_ADD;
+    else
+      op = UV__EPOLL_CTL_MOD;
+
+    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
+     * events, skip the syscall and squelch the events after epoll_wait().
+     */
+    if (uv__epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
+      if (errno != EEXIST)
+        abort();
+
+      assert(op == UV__EPOLL_CTL_ADD);
+
+      /* We've reactivated a file descriptor that's been watched before. */
+      if (uv__epoll_ctl(loop->backend_fd, UV__EPOLL_CTL_MOD, w->fd, &e))
+        abort();
+    }
+
+    w->events = w->pevents;
+  }
+
+  assert(timeout >= -1);
+  base = loop->time;
+  count = 48; /* Benchmarks suggest this gives the best throughput. */
+
+  for (;;) {
+    nfds = uv__epoll_wait(loop->backend_fd,
+                          events,
+                          ARRAY_SIZE(events),
+                          timeout);
+
+    /* Update loop->time unconditionally. It's tempting to skip the update when
+     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
+     * operating system didn't reschedule our process while in the syscall.
+     */
+    SAVE_ERRNO(uv__update_time(loop));
+
+    if (nfds == 0) {
+      assert(timeout != -1);
+      return;
+    }
+
+    if (nfds == -1) {
+      if (errno != EINTR)
+        abort();
+
+      if (timeout == -1)
+        continue;
+
+      if (timeout == 0)
+        return;
+
+      /* Interrupted by a signal. Update timeout and poll again. */
+      goto update_timeout;
+    }
+
+    nevents = 0;
+
+    assert(loop->watchers != NULL);
+    loop->watchers[loop->nwatchers] = (void*) events;
+    loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
+    for (i = 0; i < nfds; i++) {
+      pe = events + i;
+      fd = pe->data;
+
+      /* Skip invalidated events, see uv__platform_invalidate_fd */
+      if (fd == -1)
+        continue;
+
+      assert(fd >= 0);
+      assert((unsigned) fd < loop->nwatchers);
+
+      w = loop->watchers[fd];
+
+      if (w == NULL) {
+        /* File descriptor that we've stopped watching, disarm it.
+         *
+         * Ignore all errors because we may be racing with another thread
+         * when the file descriptor is closed.
+         */
+        uv__epoll_ctl(loop->backend_fd, UV__EPOLL_CTL_DEL, fd, pe);
+        continue;
+      }
+
+      /* Give users only events they're interested in. Prevents spurious
+       * callbacks when previous callback invocation in this loop has stopped
+       * the current watcher. Also, filters out events that users has not
+       * requested us to watch.
+       */
+      pe->events &= w->pevents | UV__POLLERR | UV__POLLHUP;
+
+      /* Work around an epoll quirk where it sometimes reports just the
+       * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
+       * move forward, we merge in the read/write events that the watcher
+       * is interested in; uv__read() and uv__write() will then deal with
+       * the error or hangup in the usual fashion.
+       *
+       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
+       * reads the available data, calls uv_read_stop(), then sometime later
+       * calls uv_read_start() again.  By then, libuv has forgotten about the
+       * hangup and the kernel won't report EPOLLIN again because there's
+       * nothing left to read.  If anything, libuv is to blame here.  The
+       * current hack is just a quick bandaid; to properly fix it, libuv
+       * needs to remember the error/hangup event.  We should get that for
+       * free when we switch over to edge-triggered I/O.
+       */
+      if (pe->events == UV__EPOLLERR || pe->events == UV__EPOLLHUP)
+        pe->events |= w->pevents & (UV__EPOLLIN | UV__EPOLLOUT);
+
+      if (pe->events != 0) {
+        w->cb(loop, w, pe->events);
+        nevents++;
+      }
+    }
+    loop->watchers[loop->nwatchers] = NULL;
+    loop->watchers[loop->nwatchers + 1] = NULL;
+
+    if (nevents != 0) {
+      if (nfds == ARRAY_SIZE(events) && --count != 0) {
+        /* Poll for more events but don't block this time. */
+        timeout = 0;
+        continue;
+      }
+      return;
+    }
+
+    if (timeout == 0)
+      return;
+
+    if (timeout == -1)
+      continue;
+
+update_timeout:
+    assert(timeout > 0);
+
+    diff = loop->time - base;
+    if (diff >= (uint64_t) timeout)
+      return;
+
+    timeout -= diff;
+  }
+}
+
+
+uint64_t uv__hrtime(uv_clocktype_t type) {
+  static clock_t fast_clock_id = -1;
+  struct timespec t;
+  clock_t clock_id;
+
+  /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
+   * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
+   * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
+   * decide to make a costly system call.
+   */
+  /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
+   * when it has microsecond granularity or better (unlikely).
+   */
+  if (type == UV_CLOCK_FAST && fast_clock_id == -1) {
+    if (clock_getres(CLOCK_MONOTONIC_COARSE, &t) == 0 &&
+        t.tv_nsec <= 1 * 1000 * 1000) {
+      fast_clock_id = CLOCK_MONOTONIC_COARSE;
+    } else {
+      fast_clock_id = CLOCK_MONOTONIC;
+    }
+  }
+
+  clock_id = CLOCK_MONOTONIC;
+  if (type == UV_CLOCK_FAST)
+    clock_id = fast_clock_id;
+
+  if (clock_gettime(clock_id, &t))
+    return 0;  /* Not really possible. */
+
+  return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
+}
+
+
+void uv_loadavg(double avg[3]) {
+  struct sysinfo info;
+
+  if (sysinfo(&info) < 0) return;
+
+  avg[0] = (double) info.loads[0] / 65536.0;
+  avg[1] = (double) info.loads[1] / 65536.0;
+  avg[2] = (double) info.loads[2] / 65536.0;
+}
+
+
+int uv_exepath(char* buffer, size_t* size) {
+  ssize_t n;
+
+  if (buffer == NULL || size == NULL)
+    return -EINVAL;
+
+  n = readlink("/proc/self/exe", buffer, *size - 1);
+  if (n == -1)
+    return -errno;
+
+  buffer[n] = '\0';
+  *size = n;
+
+  return 0;
+}
+
+
+uint64_t uv_get_free_memory(void) {
+  return (uint64_t) sysconf(_SC_PAGESIZE) * sysconf(_SC_AVPHYS_PAGES);
+}
+
+
+uint64_t uv_get_total_memory(void) {
+  return (uint64_t) sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
+}
+
+
+int uv_resident_set_memory(size_t* rss) {
+  char buf[1024];
+  const char* s;
+  ssize_t n;
+  long val;
+  int fd;
+  int i;
+
+  do
+    fd = open("/proc/self/stat", O_RDONLY);
+  while (fd == -1 && errno == EINTR);
+
+  if (fd == -1)
+    return -errno;
+
+  do
+    n = read(fd, buf, sizeof(buf) - 1);
+  while (n == -1 && errno == EINTR);
+
+  uv__close(fd);
+  if (n == -1)
+    return -errno;
+  buf[n] = '\0';
+
+  s = strchr(buf, ' ');
+  if (s == NULL)
+    goto err;
+
+  s += 1;
+  if (*s != '(')
+    goto err;
+
+  s = strchr(s, ')');
+  if (s == NULL)
+    goto err;
+
+  for (i = 1; i <= 22; i++) {
+    s = strchr(s + 1, ' ');
+    if (s == NULL)
+      goto err;
+  }
+
+  errno = 0;
+  val = strtol(s, NULL, 10);
+  if (errno != 0)
+    goto err;
+  if (val < 0)
+    goto err;
+
+  *rss = val * getpagesize();
+  return 0;
+
+err:
+  return -EINVAL;
+}
+
+
+int uv_uptime(double* uptime) {
+  static volatile int no_clock_boottime;
+  struct timespec now;
+  int r;
+
+  /* Try CLOCK_BOOTTIME first, fall back to CLOCK_MONOTONIC if not available
+   * (pre-2.6.39 kernels). CLOCK_MONOTONIC doesn't increase when the system
+   * is suspended.
+   */
+  if (no_clock_boottime) {
+    retry: r = clock_gettime(CLOCK_MONOTONIC, &now);
+  }
+  else if ((r = clock_gettime(CLOCK_BOOTTIME, &now)) && errno == EINVAL) {
+    no_clock_boottime = 1;
+    goto retry;
+  }
+
+  if (r)
+    return -errno;
+
+  *uptime = now.tv_sec;
+  return 0;
+}
+
+
+int uv_cpu_info(uv_cpu_info_t** cpu_infos, int* count) {
+  unsigned int numcpus;
+  uv_cpu_info_t* ci;
+  int err;
+
+  *cpu_infos = NULL;
+  *count = 0;
+
+  numcpus = sysconf(_SC_NPROCESSORS_ONLN);
+  assert(numcpus != (unsigned int) -1);
+  assert(numcpus != 0);
+
+  ci = calloc(numcpus, sizeof(*ci));
+  if (ci == NULL)
+    return -ENOMEM;
+
+  err = read_models(numcpus, ci);
+  if (err == 0)
+    err = read_times(numcpus, ci);
+
+  if (err) {
+    uv_free_cpu_info(ci, numcpus);
+    return err;
+  }
+
+  /* read_models() on x86 also reads the CPU speed from /proc/cpuinfo.
+   * We don't check for errors here. Worst case, the field is left zero.
+   */
+  if (ci[0].speed == 0)
+    read_speeds(numcpus, ci);
+
+  *cpu_infos = ci;
+  *count = numcpus;
+
+  return 0;
+}
+
+
+static void read_speeds(unsigned int numcpus, uv_cpu_info_t* ci) {
+  unsigned int num;
+
+  for (num = 0; num < numcpus; num++)
+    ci[num].speed = read_cpufreq(num) / 1000;
+}
+
+
+/* Also reads the CPU frequency on x86. The other architectures only have
+ * a BogoMIPS field, which may not be very accurate.
+ *
+ * Note: Simply returns on error, uv_cpu_info() takes care of the cleanup.
+ */
+static int read_models(unsigned int numcpus, uv_cpu_info_t* ci) {
+  static const char model_marker[] = "model name\t: ";
+  static const char speed_marker[] = "cpu MHz\t\t: ";
+  const char* inferred_model;
+  unsigned int model_idx;
+  unsigned int speed_idx;
+  char buf[1024];
+  char* model;
+  FILE* fp;
+
+  /* Most are unused on non-ARM, non-MIPS and non-x86 architectures. */
+  (void) &model_marker;
+  (void) &speed_marker;
+  (void) &speed_idx;
+  (void) &model;
+  (void) &buf;
+  (void) &fp;
+
+  model_idx = 0;
+  speed_idx = 0;
+
+#if defined(__arm__) || \
+    defined(__i386__) || \
+    defined(__mips__) || \
+    defined(__x86_64__)
+  fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL)
+    return -errno;
+
+  while (fgets(buf, sizeof(buf), fp)) {
+    if (model_idx < numcpus) {
+      if (strncmp(buf, model_marker, sizeof(model_marker) - 1) == 0) {
+        model = buf + sizeof(model_marker) - 1;
+        model = strndup(model, strlen(model) - 1);  /* Strip newline. */
+        if (model == NULL) {
+          fclose(fp);
+          return -ENOMEM;
+        }
+        ci[model_idx++].model = model;
+        continue;
+      }
+    }
+#if defined(__arm__) || defined(__mips__)
+    if (model_idx < numcpus) {
+#if defined(__arm__)
+      /* Fallback for pre-3.8 kernels. */
+      static const char model_marker[] = "Processor\t: ";
+#else  /* defined(__mips__) */
+      static const char model_marker[] = "cpu model\t\t: ";
+#endif
+      if (strncmp(buf, model_marker, sizeof(model_marker) - 1) == 0) {
+        model = buf + sizeof(model_marker) - 1;
+        model = strndup(model, strlen(model) - 1);  /* Strip newline. */
+        if (model == NULL) {
+          fclose(fp);
+          return -ENOMEM;
+        }
+        ci[model_idx++].model = model;
+        continue;
+      }
+    }
+#else  /* !__arm__ && !__mips__ */
+    if (speed_idx < numcpus) {
+      if (strncmp(buf, speed_marker, sizeof(speed_marker) - 1) == 0) {
+        ci[speed_idx++].speed = atoi(buf + sizeof(speed_marker) - 1);
+        continue;
+      }
+    }
+#endif  /* __arm__ || __mips__ */
+  }
+
+  fclose(fp);
+#endif  /* __arm__ || __i386__ || __mips__ || __x86_64__ */
+
+  /* Now we want to make sure that all the models contain *something* because
+   * it's not safe to leave them as null. Copy the last entry unless there
+   * isn't one, in that case we simply put "unknown" into everything.
+   */
+  inferred_model = "unknown";
+  if (model_idx > 0)
+    inferred_model = ci[model_idx - 1].model;
+
+  while (model_idx < numcpus) {
+    model = strndup(inferred_model, strlen(inferred_model));
+    if (model == NULL)
+      return -ENOMEM;
+    ci[model_idx++].model = model;
+  }
+
+  return 0;
+}
+
+
+static int read_times(unsigned int numcpus, uv_cpu_info_t* ci) {
+  unsigned long clock_ticks;
+  struct uv_cpu_times_s ts;
+  unsigned long user;
+  unsigned long nice;
+  unsigned long sys;
+  unsigned long idle;
+  unsigned long dummy;
+  unsigned long irq;
+  unsigned int num;
+  unsigned int len;
+  char buf[1024];
+  FILE* fp;
+
+  clock_ticks = sysconf(_SC_CLK_TCK);
+  assert(clock_ticks != (unsigned long) -1);
+  assert(clock_ticks != 0);
+
+  fp = fopen("/proc/stat", "r");
+  if (fp == NULL)
+    return -errno;
+
+  if (!fgets(buf, sizeof(buf), fp))
+    abort();
+
+  num = 0;
+
+  while (fgets(buf, sizeof(buf), fp)) {
+    if (num >= numcpus)
+      break;
+
+    if (strncmp(buf, "cpu", 3))
+      break;
+
+    /* skip "cpu<num> " marker */
+    {
+      unsigned int n;
+      int r = sscanf(buf, "cpu%u ", &n);
+      assert(r == 1);
+      (void) r;  /* silence build warning */
+      for (len = sizeof("cpu0"); n /= 10; len++);
+    }
+
+    /* Line contains user, nice, system, idle, iowait, irq, softirq, steal,
+     * guest, guest_nice but we're only interested in the first four + irq.
+     *
+     * Don't use %*s to skip fields or %ll to read straight into the uint64_t
+     * fields, they're not allowed in C89 mode.
+     */
+    if (6 != sscanf(buf + len,
+                    "%lu %lu %lu %lu %lu %lu",
+                    &user,
+                    &nice,
+                    &sys,
+                    &idle,
+                    &dummy,
+                    &irq))
+      abort();
+
+    ts.user = clock_ticks * user;
+    ts.nice = clock_ticks * nice;
+    ts.sys  = clock_ticks * sys;
+    ts.idle = clock_ticks * idle;
+    ts.irq  = clock_ticks * irq;
+    ci[num++].cpu_times = ts;
+  }
+  fclose(fp);
+  assert(num == numcpus);
+
+  return 0;
+}
+
+
+static unsigned long read_cpufreq(unsigned int cpunum) {
+  unsigned long val;
+  char buf[1024];
+  FILE* fp;
+
+  snprintf(buf,
+           sizeof(buf),
+           "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq",
+           cpunum);
+
+  fp = fopen(buf, "r");
+  if (fp == NULL)
+    return 0;
+
+  if (fscanf(fp, "%lu", &val) != 1)
+    val = 0;
+
+  fclose(fp);
+
+  return val;
+}
+
+
+void uv_free_cpu_info(uv_cpu_info_t* cpu_infos, int count) {
+  int i;
+
+  for (i = 0; i < count; i++) {
+    free(cpu_infos[i].model);
+  }
+
+  free(cpu_infos);
+}
+
+
+int uv_interface_addresses(uv_interface_address_t** addresses,
+  int* count) {
+#ifndef HAVE_IFADDRS_H
+  return -ENOSYS;
+#else
+  struct ifaddrs *addrs, *ent;
+  uv_interface_address_t* address;
+  int i;
+  struct sockaddr_ll *sll;
+
+  if (getifaddrs(&addrs))
+    return -errno;
+
+  *count = 0;
+
+  /* Count the number of interfaces */
+  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
+    if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)) ||
+        (ent->ifa_addr == NULL) ||
+        (ent->ifa_addr->sa_family == PF_PACKET)) {
+      continue;
+    }
+
+    (*count)++;
+  }
+
+  *addresses = malloc(*count * sizeof(**addresses));
+  if (!(*addresses))
+    return -ENOMEM;
+
+  address = *addresses;
+
+  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
+    if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
+      continue;
+
+    if (ent->ifa_addr == NULL)
+      continue;
+
+    /*
+     * On Linux getifaddrs returns information related to the raw underlying
+     * devices. We're not interested in this information yet.
+     */
+    if (ent->ifa_addr->sa_family == PF_PACKET)
+      continue;
+
+    address->name = strdup(ent->ifa_name);
+
+    if (ent->ifa_addr->sa_family == AF_INET6) {
+      address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
+    } else {
+      address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
+    }
+
+    if (ent->ifa_netmask->sa_family == AF_INET6) {
+      address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
+    } else {
+      address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
+    }
+
+    address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
+
+    address++;
+  }
+
+  /* Fill in physical addresses for each interface */
+  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
+    if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)) ||
+        (ent->ifa_addr == NULL) ||
+        (ent->ifa_addr->sa_family != PF_PACKET)) {
+      continue;
+    }
+
+    address = *addresses;
+
+    for (i = 0; i < (*count); i++) {
+      if (strcmp(address->name, ent->ifa_name) == 0) {
+        sll = (struct sockaddr_ll*)ent->ifa_addr;
+        memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
+      }
+      address++;
+    }
+  }
+
+  freeifaddrs(addrs);
+
+  return 0;
+#endif
+}
+
+
+void uv_free_interface_addresses(uv_interface_address_t* addresses,
+  int count) {
+  int i;
+
+  for (i = 0; i < count; i++) {
+    free(addresses[i].name);
+  }
+
+  free(addresses);
+}
+
+
+void uv__set_process_title(const char* title) {
+#if defined(PR_SET_NAME)
+  prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
+#endif
+}
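
A minimal caller-side sketch (not part of the patch) of the public uv_cpu_info()/uv_free_cpu_info() pair implemented above. It assumes the uv_cpu_info_t fields this file populates (model, speed, cpu_times) and the negative-errno return convention the file uses; exact field types come from uv.h at this revision, so the casts below are defensive.

#include <stdio.h>

#include "uv.h"

/* Sketch: enumerate CPUs through uv_cpu_info() as implemented above.
 * Assumes uv_cpu_info_t exposes model, speed and cpu_times as filled in
 * by read_models(), read_times() and read_speeds(). */
int main(void) {
  uv_cpu_info_t* cpus;
  int count;
  int i;
  int err;

  err = uv_cpu_info(&cpus, &count);
  if (err != 0) {
    /* At this revision the function returns a negative errno value. */
    fprintf(stderr, "uv_cpu_info: %d\n", err);
    return 1;
  }

  for (i = 0; i < count; i++)
    printf("cpu%d: %s @ %ld MHz, idle=%llu\n",
           i,
           cpus[i].model,
           (long) cpus[i].speed,
           (unsigned long long) cpus[i].cpu_times.idle);

  uv_free_cpu_info(cpus, count);  /* Frees each model string and the array. */
  return 0;
}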
