Merge remote-tracking branch 'origin/master' into f/centos8

Signed-off-by: Charles Short <charles.short@windriver.com>
Change-Id: Ic300a310de3ac942dbe6e2d8aa8852308d5123b5
This commit is contained in:
Charles Short 2021-05-31 05:50:57 -04:00
commit a13966754d
639 changed files with 334877 additions and 141 deletions

View File

@ -0,0 +1,49 @@
From 54d85d8a0378a6610012adeae7abaefaf01ea9a1 Mon Sep 17 00:00:00 2001
From: Zhixiong Chi <zhixiong.chi@windriver.com>
Date: Tue, 9 Feb 2021 18:30:14 -0800
Subject: [PATCH] WRS: dhcp: set the prefixlen to 64
Drop the patch dhcp-dhclient_ipv6_prefix.patch to keep the default
value of the prefixlen at 64, since we don't need this patch to set
the default value to 128. Otherwise all hosts (controller|compute
node) go offline after booting off controller-0, and other usage
scenarios break as well.
A prefixlen of 128 is what the specifications call for on a host
address, and it doesn't include any on-link information.
By contrast, 64 indicates a subnet, and this value is the one
commonly used. So we keep using the previous value 64.
This also means we no longer need to modify every place in the
application code that would otherwise need changes for compatibility.
Signed-off-by: Zhixiong Chi <zhixiong.chi@windriver.com>
---
SPECS/dhcp.spec | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/SPECS/dhcp.spec b/SPECS/dhcp.spec
index c893632..f7add1a 100644
--- a/SPECS/dhcp.spec
+++ b/SPECS/dhcp.spec
@@ -111,7 +111,8 @@ Patch70: dhcp-4.2.5-reference_count_overflow.patch
Patch71: dhcp-replay_file_limit.patch
Patch72: dhcp-4.2.5-expiry_before_renewal_v2.patch
Patch73: dhcp-4.2.5-bind-config.patch
-Patch74: dhcp-dhclient_ipv6_prefix.patch
+#Drop dhcp-dhclient_ipv6_prefix.patch not to set the default prefixlen 128
+#Patch74: dhcp-dhclient_ipv6_prefix.patch
# Support build with bind 9.11.3+
Patch75: dhcp-4.2.5-isc-util.patch
@@ -452,7 +453,8 @@ rm -rf includes/isc-dhcp
# https://bugzilla.redhat.com/show_bug.cgi?id=1647784
%patch72 -p1 -b .t2-expirity
-%patch74 -p1 -b .ipv6-prefix
+# Drop this patch not to set the default prefixlen 128
+#%patch74 -p1 -b .ipv6-prefix
# Support for BIND 9.11
%patch73 -p1 -b .bind-config
--
2.17.0

View File

@ -0,0 +1,6 @@
TAR_NAME=inih
GIT_SHA=b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69
VERSION=44
COPY_LIST="$STX_BASE/downloads/$TAR_NAME-$GIT_SHA.tar.gz"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -0,0 +1,45 @@
# Spec for packaging inih as a static library together with its public
# header and license text.
# git_sha is the upstream commit id that is embedded in the Source0 tarball name.
%global git_sha b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69
Summary: inih
Name: inih
Version: 44
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: New BSD
Group: base
Packager: Wind River <info@windriver.com>
URL: https://github.com/benhoyt/inih/releases/tag/r44
Source0: %{name}-%{git_sha}.tar.gz
BuildRequires: gcc
BuildRequires: gcc-c++
BuildRequires: ncurses-static
BuildRequires: perl-version
# Setting debug_package to nil turns off generation of the debuginfo subpackage.
%define debug_package %{nil}
%description
Simple .INI file parser written in C
%prep
%setup
%build
# Build with Makefile.static from the extra/ directory of the source tree;
# the install section below picks up extra/libinih.a from there.
pushd extra > /dev/null
make -f Makefile.static
popd > /dev/null
%install
# Create the target directories, then install the static archive, the
# header and the license file into them.
install -d -m 755 %{buildroot}%{_libdir}
install -d -m 755 %{buildroot}%{_includedir}
install -d -m 755 %{buildroot}%{_datadir}/inih/
install -p -D -m 755 extra/libinih.a %{buildroot}%{_libdir}/libinih.a
install -p -D -m 644 ini.h %{buildroot}%{_includedir}/ini.h
install -p -D -m 644 LICENSE.txt %{buildroot}%{_datadir}/inih/LICENSE.txt
%clean
rm -rf $RPM_BUILD_ROOT
%files
# Everything installed under the library, include and data directories is packaged.
%defattr(-,root,root,-)
%{_libdir}/*
%{_includedir}/*
%{_datadir}/*

View File

@ -1,23 +0,0 @@
Metadata-Version: 1.1
Name: net-snmp
Version: 5.8
Summary: A collection of SNMP protocol tools and libraries
Home-page:
Author:
Author-email:
License: BSD
Description:
SNMP (Simple Network Management Protocol) is a protocol used for
network management. The NET-SNMP project includes various SNMP tools:
an extensible agent, an SNMP library, tools for requesting or setting
information from SNMP agents, tools for generating and handling SNMP
traps, a version of the netstat command which uses SNMP, and a Tk/Perl
mib browser. This package contains the snmpd and snmptrapd daemons,
documentation, etc.
You will probably also want to install the net-snmp-utils package,
which contains NET-SNMP utilities.
Platform: UNKNOWN

View File

@ -1,26 +0,0 @@
From c25a30b4a0c7347234c2af4afab099b5735bbf71 Mon Sep 17 00:00:00 2001
From: Scott Little <scott.little@windriver.com>
Date: Mon, 2 Oct 2017 17:05:59 -0400
Subject: [PATCH] 0001-Update-package-versioning-for-TIS-format.patch
Signed-off-by: Dongqi Chen <chen.dq@neusoft.com>
---
SPECS/net-snmp.spec | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/SPECS/net-snmp.spec b/SPECS/net-snmp.spec
index a59db4f..8a24ba1 100644
--- a/SPECS/net-snmp.spec
+++ b/SPECS/net-snmp.spec
@@ -10,7 +10,7 @@
Summary: A collection of SNMP protocol tools and libraries
Name: net-snmp
Version: 5.8
-Release: 7%{?dist}.2
+Release: 7.el8_0.2%{?_tis_dist}.%{tis_patch_ver}
Epoch: 1
License: BSD
--
2.7.4

View File

@ -1,2 +0,0 @@
0001-Update-package-versioning-for-TIS-format.patch
spec-build-configure-changes.patch

View File

@ -1,44 +0,0 @@
Signed-off-by: Dongqi Chen <chen.dq@neusoft.com>
---
SPECS/net-snmp.spec | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/SPECS/net-snmp.spec b/SPECS/net-snmp.spec
index 8a24ba1..40af31f 100644
--- a/SPECS/net-snmp.spec
+++ b/SPECS/net-snmp.spec
@@ -1,3 +1,6 @@
+# Disable check
+%global netsnmp_check 0
+
# use nestnmp_check 0 to speed up packaging by disabling 'make test'
%{!?netsnmp_check: %global netsnmp_check 1}
@@ -198,7 +201,7 @@ rm testing/fulltests/default/T200*
# Autoreconf to get autoconf 2.69 for ARM (#926223)
autoreconf
-MIBS="host agentx smux \
+MIBS="agentx smux \
ucd-snmp/diskio tcp-mib udp-mib mibII/mta_sendmail \
ip-mib/ipv4InterfaceTable ip-mib/ipv6InterfaceTable \
ip-mib/ipAddressPrefixTable/ipAddressPrefixTable \
@@ -224,6 +227,7 @@ MIBS="$MIBS ucd-snmp/lmsensorsMib"
--with-ldflags="-Wl,-z,relro -Wl,-z,now" \
--with-logfile="/var/log/snmpd.log" \
--with-mib-modules="$MIBS" \
+ --with-out-mib-modules=host \
--with-mysql \
--with-openssl \
--with-persistent-directory="/var/lib/net-snmp" \
@@ -234,6 +238,7 @@ MIBS="$MIBS ucd-snmp/lmsensorsMib"
--with-systemd \
--with-temp-file-pattern=/var/run/net-snmp/snmp-tmp-XXXXXX \
--with-transports="DTLSUDP TLSTCP" \
+ --without-nl \
--with-sys-contact="root@localhost" <<EOF
EOF
--
2.7.4

View File

@ -1 +0,0 @@
mirror:Source/net-snmp-5.8-7.el8_0.2.src.rpm

View File

@ -0,0 +1,8 @@
TAR_NAME=pf-bb-config
GIT_SHA=945712e8876be2003f2f31de70353c48501519fa
COPY_LIST=" \
$STX_BASE/downloads/$TAR_NAME-$GIT_SHA.tar.gz \
$PKG_BASE/files/* \
"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -0,0 +1,44 @@
# Spec for packaging the pf_bb_config tool together with the ACC100 VF
# configuration file and the README.
# git_sha is the upstream commit id that is embedded in the Source0 tarball name.
%global git_sha 945712e8876be2003f2f31de70353c48501519fa
Summary: PF BBDEV (baseband device) Configuration Application.
Name: pf-bb-config
Version: 21.3
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: Apache-2.0
Group: base
Packager: Wind River <info@windriver.com>
URL: https://github.com/intel/pf-bb-config/releases/tag/v21.3
Source0: %{name}-%{git_sha}.tar.gz
# Local patch carried on top of the upstream source; applied in the prep section.
Patch0: Reject-device-configuration-if-not-enabled.patch
BuildRequires: gcc
BuildRequires: inih
# Setting debug_package to nil turns off generation of the debuginfo subpackage.
%define debug_package %{nil}
%description
The PF BBDEV (baseband device) Configuration Application "pf_bb_config" provides a means to
configure the baseband device at the host-level. The program accesses the configuration
space and sets the various parameters through memory-mapped IO read/writes.
%prep
%setup
%patch0 -p1
%build
# Plain make, run from the top of the unpacked source tree.
make
%install
# The binary and the ACC100 configuration file are installed with mode 700
# (owner access only); the README is installed world-readable.
install -d -m 755 %{buildroot}%{_bindir}
install -d -m 755 %{buildroot}%{_datadir}/pf-bb-config/acc100
install -p -D -m 700 pf_bb_config %{buildroot}%{_bindir}/pf_bb_config
install -p -D -m 700 acc100/acc100_config_vf_5g.cfg %{buildroot}%{_datadir}/pf-bb-config/acc100/acc100_config_vf_5g.cfg
install -p -D -m 644 README.md %{buildroot}%{_datadir}/pf-bb-config/README.md
%clean
rm -rf $RPM_BUILD_ROOT
%files
# Everything installed under the binary and data directories is packaged.
%defattr(-,root,root,-)
%{_bindir}/*
%{_datadir}/*

View File

@ -0,0 +1,48 @@
From 8ac364315c153e546fbae9dd63c562b9a1e42d82 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <Babak.SarAshki@windriver.com>
Date: Sun, 24 Jan 2021 13:46:20 -0500
Subject: [PATCH] Reject device configuration if not enabled
Signed-off-by: Babak Sarashki <Babak.SarAshki@windriver.com>
---
config_app.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/config_app.c b/config_app.c
index fdad259..f1aa52b 100644
--- a/config_app.c
+++ b/config_app.c
@@ -114,6 +114,7 @@ static bool
get_device_id(hw_device *device, const char *location)
{
unsigned long vendor_id = -1, device_id = -1;
+ unsigned int device_enabled = 0;
struct dirent *dirent;
DIR *dir;
char pci_path[PATH_MAX];
@@ -139,6 +140,12 @@ get_device_id(hw_device *device, const char *location)
snprintf(file_path, sizeof(file_path), "%s/%s",
pci_path, dirent->d_name);
+ /* Is device enabled? */
+ if (strncmp(dirent->d_name, "enable",
+ strlen(dirent->d_name)) == 0 &&
+ dirent->d_type == DT_REG)
+ device_enabled = get_file_val(file_path);
+
/* Get Device ID */
if (strncmp(dirent->d_name, DEVICE_FILE,
strlen(dirent->d_name)) == 0 &&
@@ -154,7 +161,8 @@ get_device_id(hw_device *device, const char *location)
closedir(dir);
/* Check if device is found */
- return (vendor_id == device->vendor_id &&
+ return (device_enabled &&
+ vendor_id == device->vendor_id &&
device_id == device->device_id);
}
--
2.29.2

View File

@ -0,0 +1,80 @@
From 8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7 Mon Sep 17 00:00:00 2001
Message-Id: <8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7.1574264572.git.Jim.Somerville@windriver.com>
From: systemd team <systemd-maint@redhat.com>
Date: Tue, 8 Nov 2016 17:06:01 -0500
Subject: [PATCH 1/3] inject millisec in syslog date
Signed-off-by: Jim Somerville <Jim.Somerville@windriver.com>
Signed-off-by: Long Li <lilong-neu@neusoft.com>
---
src/journal/journald-syslog.c | 47 +++++++++++++++++++++++++++++++++++++------
1 file changed, 41 insertions(+), 6 deletions(-)
diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c
index 97711ac..2149b20 100644
--- a/src/journal/journald-syslog.c
+++ b/src/journal/journald-syslog.c
@@ -25,6 +25,44 @@
/* Warn once every 30s if we missed syslog message */
#define WARN_FORWARD_SYSLOG_MISSED_USEC (30 * USEC_PER_SEC)
+/* Build a syslog-style timestamp with millisecond precision from tv and
+ * store it, followed by one separator space, in buffer
+ * (e.g. "Nov  7 16:28:38.109 "). If tv is NULL the realtime clock is used.
+ * Returns the snprintf() result, i.e. the length of the formatted string,
+ * or 0 if the time could not be converted or formatted. */
+static int formatSyslogDate(char * buffer, int bufLen, const struct timeval *tv) {
+        struct timeval tv_tmp;
+        long int millisec;
+        char tmpbuf[64];
+        struct tm *tm;
+        time_t t;
+
+        if (!tv) {
+                // No timeval supplied: sample the realtime clock. Divide before
+                // narrowing to time_t so the microsecond count is never truncated.
+                usec_t now_usec = now(CLOCK_REALTIME);
+
+                tv_tmp.tv_sec = (time_t) (now_usec / USEC_PER_SEC);
+                tv_tmp.tv_usec = now_usec % USEC_PER_SEC;
+                tv = &tv_tmp;
+        }
+
+        t = tv->tv_sec;
+        tm = localtime(&t);
+        if (!tm)
+                return 0;
+
+        // format time to the second granularity - ie Nov  7 16:28:38
+        if (strftime(tmpbuf, sizeof(tmpbuf), "%h %e %T", tm) <= 0)
+                return 0;
+
+        millisec = tv->tv_usec / 1000;
+        // Append the milliseconds (%ld matches the signed long) plus the space
+        // that separates the timestamp from the identifier that follows it.
+        return snprintf(buffer, bufLen, "%s.%03ld ", tmpbuf, millisec);
+}
+
+
static void forward_syslog_iovec(Server *s, const struct iovec *iovec, unsigned n_iovec, const struct ucred *ucred, const struct timeval *tv) {
static const union sockaddr_union sa = {
@@ -133,12 +171,9 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons
iovec[n++] = IOVEC_MAKE_STRING(header_priority);
/* Second: timestamp */
- t = tv ? tv->tv_sec : ((time_t) (now(CLOCK_REALTIME) / USEC_PER_SEC));
- tm = localtime(&t);
- if (!tm)
- return;
- if (strftime(header_time, sizeof(header_time), "%h %e %T ", tm) <= 0)
- return;
+ if (formatSyslogDate(header_time, sizeof(header_time), tv) <=0 )
+ return;
+
iovec[n++] = IOVEC_MAKE_STRING(header_time);
/* Third: identifier and PID */
--
2.7.4

View File

@ -0,0 +1,31 @@
From 7cc3363381f83bb060e8e686eb64b5425f2d4409 Mon Sep 17 00:00:00 2001
Message-Id: <7cc3363381f83bb060e8e686eb64b5425f2d4409.1574264572.git.Jim.Somerville@windriver.com>
In-Reply-To: <8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7.1574264572.git.Jim.Somerville@windriver.com>
References: <8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7.1574264572.git.Jim.Somerville@windriver.com>
From: slin14 <shuicheng.lin@intel.com>
Date: Thu, 9 Aug 2018 18:38:18 +0800
Subject: [PATCH 2/3] fix build error for unused variable
Signed-off-by: slin14 <shuicheng.lin@intel.com>
Signed-off-by: Jim Somerville <Jim.Somerville@windriver.com>
Signed-off-by: Long Li <lilong-neu@neusoft.com>
---
src/journal/journald-syslog.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c
index 2149b20..23e66b2 100644
--- a/src/journal/journald-syslog.c
+++ b/src/journal/journald-syslog.c
@@ -154,8 +154,6 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons
char header_priority[DECIMAL_STR_MAX(priority) + 3], header_time[64],
header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1];
int n = 0;
- time_t t;
- struct tm *tm;
_cleanup_free_ char *ident_buf = NULL;
assert(s);
--
2.7.4

View File

@ -0,0 +1,80 @@
From 339ea8b005c037eaad217dfd3cc10b2b110bdd28 Mon Sep 17 00:00:00 2001
Message-Id: <339ea8b005c037eaad217dfd3cc10b2b110bdd28.1574264572.git.Jim.Somerville@windriver.com>
In-Reply-To: <8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7.1574264572.git.Jim.Somerville@windriver.com>
References: <8b63ddb68a39d48ebb621d76a2b1f07f5ff67ac7.1574264572.git.Jim.Somerville@windriver.com>
From: Shuicheng Lin <shuicheng.lin@intel.com>
Date: Tue, 2 Apr 2019 16:43:03 +0000
Subject: [PATCH 3/3] Fix compile failure due to deprecated value
The issue occurs after upgrading the build tool chain. Fix it per the
tool chain's suggestion.
Error message is like below:
"
Value MHD_HTTP_REQUEST_ENTITY_TOO_LARGE is deprecated,
use MHD_HTTP_PAYLOAD_TOO_LARGE [-Werror]
Value MHD_HTTP_METHOD_NOT_ACCEPTABLE is deprecated,
use MHD_HTTP_NOT_ACCEPTABLE [-Werror]
"
Signed-off-by: Shuicheng Lin <shuicheng.lin@intel.com>
Signed-off-by: Mawrer Ramirez <mawrer.a.ramirez.martinez@intel.com>
Signed-off-by: Jim Somerville <Jim.Somerville@windriver.com>
---
src/journal-remote/journal-gatewayd.c | 4 ++--
src/journal-remote/journal-remote.c | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/journal-remote/journal-gatewayd.c b/src/journal-remote/journal-gatewayd.c
index d1f0ce3..8364044 100644
--- a/src/journal-remote/journal-gatewayd.c
+++ b/src/journal-remote/journal-gatewayd.c
@@ -684,7 +684,7 @@ static int request_handler_file(
if (fstat(fd, &st) < 0)
return mhd_respondf(connection, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to stat file: %m\n");
- response = MHD_create_response_from_fd_at_offset(st.st_size, fd, 0);
+ response = MHD_create_response_from_fd_at_offset64(st.st_size, fd, 0);
if (!response)
return respond_oom(connection);
@@ -824,7 +824,7 @@ static int request_handler(
assert(method);
if (!streq(method, "GET"))
- return mhd_respond(connection, MHD_HTTP_METHOD_NOT_ACCEPTABLE,
+ return mhd_respond(connection, MHD_HTTP_NOT_ACCEPTABLE,
"Unsupported method.\n");
diff --git a/src/journal-remote/journal-remote.c b/src/journal-remote/journal-remote.c
index 431e283..476c826 100644
--- a/src/journal-remote/journal-remote.c
+++ b/src/journal-remote/journal-remote.c
@@ -526,13 +526,13 @@ static int process_http_upload(
log_warning("Failed to process data for connection %p", connection);
if (r == -ENOBUFS)
return mhd_respondf(connection,
- MHD_HTTP_REQUEST_ENTITY_TOO_LARGE,
+ MHD_HTTP_PAYLOAD_TOO_LARGE,
"Entry is too large, maximum is %u bytes.\n",
DATA_SIZE_MAX);
else if (r == -E2BIG)
return mhd_respondf(connection,
- MHD_HTTP_REQUEST_ENTITY_TOO_LARGE,
+ MHD_HTTP_PAYLOAD_TOO_LARGE,
"Entry with more fields than the maximum of %u\n",
ENTRY_FIELD_COUNT_MAX);
@@ -586,7 +586,7 @@ static int request_handler(
*connection_cls);
if (!streq(method, "POST"))
- return mhd_respond(connection, MHD_HTTP_METHOD_NOT_ACCEPTABLE,
+ return mhd_respond(connection, MHD_HTTP_NOT_ACCEPTABLE,
"Unsupported method.\n");
if (!streq(url, "/upload"))
--
1.8.3.1

View File

@ -0,0 +1,54 @@
From 5de71cb7d887a569bfb987efdceda493338990bf Mon Sep 17 00:00:00 2001
From: Tom Gundersen <teg@jklm.no>
Date: Thu, 4 Jun 2015 16:54:45 +0200
Subject: [PATCH 01/20] sd-event: don't touch fd's accross forks
We protect most of the API from use accross forks, but we still allow both
sd_event and sd_event_source objects to be unref'ed. This would cause
problems as it would unregister sources from the underlying eventfd, hence
also affecting the original instance in the parent process.
This fixes the issue by not touching the fds on unref when done accross a fork,
but still free the memory.
This fixes a regression introduced by
"udevd: move main-loop to sd-event": 693d371d30fee
where the worker processes were disabling the inotify event source in the
main daemon.
[commit f68067348f58cd08d8f4f5325ce22f9a9d2c2140 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 9d48e5a..a84bfbb 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -474,6 +474,9 @@ static int source_io_unregister(sd_event_source *s) {
assert(s);
assert(s->type == SOURCE_IO);
+ if (event_pid_changed(s->event))
+ return 0;
+
if (!s->io.registered)
return 0;
@@ -604,6 +607,9 @@ static int event_update_signal_fd(sd_event *e) {
assert(e);
+ if (event_pid_changed(e))
+ return 0;
+
add_to_epoll = e->signal_fd < 0;
r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
--
2.17.1

View File

@ -0,0 +1,815 @@
From 2976f3b959bef0e6f0a1f4d55d998c5d60e56b0d Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 3 Sep 2015 20:13:09 +0200
Subject: [PATCH 02/20] sd-event: make sure RT signals are not dropped
RT signals operate in a queue, and we should be careful to never merge
two queued signals into one. Hence, makes sure we only ever dequeue a
single signal at a time and leave the remaining ones queued in the
signalfd. In order to implement correct priorities for the signals
introduce one signalfd per priority, so that we only process the highest
priority signal at a time.
[commit 9da4cb2be260ed123f2676cb85cb350c527b1492 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 430 ++++++++++++++++++---------
src/libsystemd/sd-event/test-event.c | 66 +++-
2 files changed, 357 insertions(+), 139 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index a84bfbb..26ef3ea 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -56,9 +56,22 @@ typedef enum EventSourceType {
_SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
+/* All objects we use in epoll events start with this value, so that
+ * we know how to dispatch it */
+typedef enum WakeupType {
+ WAKEUP_NONE,
+ WAKEUP_EVENT_SOURCE,
+ WAKEUP_CLOCK_DATA,
+ WAKEUP_SIGNAL_DATA,
+ _WAKEUP_TYPE_MAX,
+ _WAKEUP_TYPE_INVALID = -1,
+} WakeupType;
+
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
struct sd_event_source {
+ WakeupType wakeup;
+
unsigned n_ref;
sd_event *event;
@@ -120,6 +133,7 @@ struct sd_event_source {
};
struct clock_data {
+ WakeupType wakeup;
int fd;
/* For all clocks we maintain two priority queues each, one
@@ -136,11 +150,23 @@ struct clock_data {
bool needs_rearm:1;
};
+struct signal_data {
+ WakeupType wakeup;
+
+ /* For each priority we maintain one signal fd, so that we
+ * only have to dequeue a single event per priority at a
+ * time. */
+
+ int fd;
+ int64_t priority;
+ sigset_t sigset;
+ sd_event_source *current;
+};
+
struct sd_event {
unsigned n_ref;
int epoll_fd;
- int signal_fd;
int watchdog_fd;
Prioq *pending;
@@ -157,8 +183,8 @@ struct sd_event {
usec_t perturb;
- sigset_t sigset;
- sd_event_source **signal_sources;
+ sd_event_source **signal_sources; /* indexed by signal number */
+ Hashmap *signal_data; /* indexed by priority */
Hashmap *child_sources;
unsigned n_enabled_child_sources;
@@ -355,6 +381,7 @@ static int exit_prioq_compare(const void *a, const void *b) {
static void free_clock_data(struct clock_data *d) {
assert(d);
+ assert(d->wakeup == WAKEUP_CLOCK_DATA);
safe_close(d->fd);
prioq_free(d->earliest);
@@ -378,7 +405,6 @@ static void event_free(sd_event *e) {
*(e->default_event_ptr) = NULL;
safe_close(e->epoll_fd);
- safe_close(e->signal_fd);
safe_close(e->watchdog_fd);
free_clock_data(&e->realtime);
@@ -392,6 +418,7 @@ static void event_free(sd_event *e) {
prioq_free(e->exit);
free(e->signal_sources);
+ hashmap_free(e->signal_data);
hashmap_free(e->child_sources);
set_free(e->post_sources);
@@ -409,13 +436,12 @@ _public_ int sd_event_new(sd_event** ret) {
return -ENOMEM;
e->n_ref = 1;
- e->signal_fd = e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
+ e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
+ e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
e->original_pid = getpid();
e->perturb = USEC_INFINITY;
- assert_se(sigemptyset(&e->sigset) == 0);
-
e->pending = prioq_new(pending_prioq_compare);
if (!e->pending) {
r = -ENOMEM;
@@ -510,7 +536,6 @@ static int source_io_register(
r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
else
r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
-
if (r < 0)
return -errno;
@@ -592,45 +617,171 @@ static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
}
}
-static bool need_signal(sd_event *e, int signal) {
- return (e->signal_sources && e->signal_sources[signal] &&
- e->signal_sources[signal]->enabled != SD_EVENT_OFF)
- ||
- (signal == SIGCHLD &&
- e->n_enabled_child_sources > 0);
-}
+static int event_make_signal_data(
+ sd_event *e,
+ int sig,
+ struct signal_data **ret) {
-static int event_update_signal_fd(sd_event *e) {
struct epoll_event ev = {};
- bool add_to_epoll;
+ struct signal_data *d;
+ bool added = false;
+ sigset_t ss_copy;
+ int64_t priority;
int r;
assert(e);
if (event_pid_changed(e))
- return 0;
+ return -ECHILD;
- add_to_epoll = e->signal_fd < 0;
+ if (e->signal_sources && e->signal_sources[sig])
+ priority = e->signal_sources[sig]->priority;
+ else
+ priority = 0;
- r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
- if (r < 0)
- return -errno;
+ d = hashmap_get(e->signal_data, &priority);
+ if (d) {
+ if (sigismember(&d->sigset, sig) > 0) {
+ if (ret)
+ *ret = d;
+ return 0;
+ }
+ } else {
+ r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
+ if (r < 0)
+ return r;
+
+ d = new0(struct signal_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ d->wakeup = WAKEUP_SIGNAL_DATA;
+ d->fd = -1;
+ d->priority = priority;
+
+ r = hashmap_put(e->signal_data, &d->priority, d);
+ if (r < 0)
+ return r;
- e->signal_fd = r;
+ added = true;
+ }
+
+ ss_copy = d->sigset;
+ assert_se(sigaddset(&ss_copy, sig) >= 0);
+
+ r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ d->sigset = ss_copy;
- if (!add_to_epoll)
+ if (d->fd >= 0) {
+ if (ret)
+ *ret = d;
return 0;
+ }
+
+ d->fd = r;
ev.events = EPOLLIN;
- ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
+ ev.data.ptr = d;
- r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
- if (r < 0) {
- e->signal_fd = safe_close(e->signal_fd);
- return -errno;
+ r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
}
+ if (ret)
+ *ret = d;
+
return 0;
+
+fail:
+ if (added) {
+ d->fd = safe_close(d->fd);
+ hashmap_remove(e->signal_data, &d->priority);
+ free(d);
+ }
+
+ return r;
+}
+
+static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
+ assert(e);
+ assert(d);
+
+ /* Turns off the specified signal in the signal data
+ * object. If the signal mask of the object becomes empty that
+ * way removes it. */
+
+ if (sigismember(&d->sigset, sig) == 0)
+ return;
+
+ assert_se(sigdelset(&d->sigset, sig) >= 0);
+
+ if (sigisemptyset(&d->sigset)) {
+
+ /* If all the mask is all-zero we can get rid of the structure */
+ hashmap_remove(e->signal_data, &d->priority);
+ assert(!d->current);
+ safe_close(d->fd);
+ free(d);
+ return;
+ }
+
+ assert(d->fd >= 0);
+
+ if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
+ log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
+}
+
+static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
+ struct signal_data *d;
+ static const int64_t zero_priority = 0;
+
+ assert(e);
+
+ /* Rechecks if the specified signal is still something we are
+ * interested in. If not, we'll unmask it, and possibly drop
+ * the signalfd for it. */
+
+ if (sig == SIGCHLD &&
+ e->n_enabled_child_sources > 0)
+ return;
+
+ if (e->signal_sources &&
+ e->signal_sources[sig] &&
+ e->signal_sources[sig]->enabled != SD_EVENT_OFF)
+ return;
+
+ /*
+ * The specified signal might be enabled in three different queues:
+ *
+ * 1) the one that belongs to the priority passed (if it is non-NULL)
+ * 2) the one that belongs to the priority of the event source of the signal (if there is one)
+ * 3) the 0 priority (to cover the SIGCHLD case)
+ *
+ * Hence, let's remove it from all three here.
+ */
+
+ if (priority) {
+ d = hashmap_get(e->signal_data, priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
+ }
+
+ if (e->signal_sources && e->signal_sources[sig]) {
+ d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
+ }
+
+ d = hashmap_get(e->signal_data, &zero_priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
}
static void source_disconnect(sd_event_source *s) {
@@ -669,17 +820,11 @@ static void source_disconnect(sd_event_source *s) {
case SOURCE_SIGNAL:
if (s->signal.sig > 0) {
+
if (s->event->signal_sources)
s->event->signal_sources[s->signal.sig] = NULL;
- /* If the signal was on and now it is off... */
- if (s->enabled != SD_EVENT_OFF && !need_signal(s->event, s->signal.sig)) {
- assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
-
- (void) event_update_signal_fd(s->event);
- /* If disabling failed, we might get a spurious event,
- * but otherwise nothing bad should happen. */
- }
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
}
break;
@@ -689,18 +834,10 @@ static void source_disconnect(sd_event_source *s) {
if (s->enabled != SD_EVENT_OFF) {
assert(s->event->n_enabled_child_sources > 0);
s->event->n_enabled_child_sources--;
-
- /* We know the signal was on, if it is off now... */
- if (!need_signal(s->event, SIGCHLD)) {
- assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
-
- (void) event_update_signal_fd(s->event);
- /* If disabling failed, we might get a spurious event,
- * but otherwise nothing bad should happen. */
- }
}
- hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
+ (void) hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
}
break;
@@ -779,6 +916,14 @@ static int source_set_pending(sd_event_source *s, bool b) {
d->needs_rearm = true;
}
+ if (s->type == SOURCE_SIGNAL && !b) {
+ struct signal_data *d;
+
+ d = hashmap_get(s->event->signal_data, &s->priority);
+ if (d && d->current == s)
+ d->current = NULL;
+ }
+
return 0;
}
@@ -828,6 +973,7 @@ _public_ int sd_event_add_io(
if (!s)
return -ENOMEM;
+ s->wakeup = WAKEUP_EVENT_SOURCE;
s->io.fd = fd;
s->io.events = events;
s->io.callback = callback;
@@ -884,7 +1030,7 @@ static int event_setup_timer_fd(
return -errno;
ev.events = EPOLLIN;
- ev.data.ptr = INT_TO_PTR(clock_to_event_source_type(clock));
+ ev.data.ptr = d;
r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
if (r < 0) {
@@ -994,9 +1140,9 @@ _public_ int sd_event_add_signal(
void *userdata) {
sd_event_source *s;
+ struct signal_data *d;
sigset_t ss;
int r;
- bool previous;
assert_return(e, -EINVAL);
assert_return(sig > 0, -EINVAL);
@@ -1021,8 +1167,6 @@ _public_ int sd_event_add_signal(
} else if (e->signal_sources[sig])
return -EBUSY;
- previous = need_signal(e, sig);
-
s = source_new(e, !ret, SOURCE_SIGNAL);
if (!s)
return -ENOMEM;
@@ -1034,14 +1178,10 @@ _public_ int sd_event_add_signal(
e->signal_sources[sig] = s;
- if (!previous) {
- assert_se(sigaddset(&e->sigset, sig) == 0);
-
- r = event_update_signal_fd(e);
- if (r < 0) {
- source_free(s);
- return r;
- }
+ r = event_make_signal_data(e, sig, &d);
+ if (r < 0) {
+ source_free(s);
+ return r;
}
/* Use the signal name as description for the event source by default */
@@ -1063,7 +1203,6 @@ _public_ int sd_event_add_child(
sd_event_source *s;
int r;
- bool previous;
assert_return(e, -EINVAL);
assert_return(pid > 1, -EINVAL);
@@ -1080,8 +1219,6 @@ _public_ int sd_event_add_child(
if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
return -EBUSY;
- previous = need_signal(e, SIGCHLD);
-
s = source_new(e, !ret, SOURCE_CHILD);
if (!s)
return -ENOMEM;
@@ -1100,14 +1237,11 @@ _public_ int sd_event_add_child(
e->n_enabled_child_sources ++;
- if (!previous) {
- assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
-
- r = event_update_signal_fd(e);
- if (r < 0) {
- source_free(s);
- return r;
- }
+ r = event_make_signal_data(e, SIGCHLD, NULL);
+ if (r < 0) {
+ e->n_enabled_child_sources--;
+ source_free(s);
+ return r;
}
e->need_process_child = true;
@@ -1407,6 +1541,8 @@ _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority)
}
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
+ int r;
+
assert_return(s, -EINVAL);
assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(s->event), -ECHILD);
@@ -1414,7 +1550,25 @@ _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority)
if (s->priority == priority)
return 0;
- s->priority = priority;
+ if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
+ struct signal_data *old, *d;
+
+ /* Move us from the signalfd belonging to the old
+ * priority to the signalfd of the new priority */
+
+ assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
+
+ s->priority = priority;
+
+ r = event_make_signal_data(s->event, s->signal.sig, &d);
+ if (r < 0) {
+ s->priority = old->priority;
+ return r;
+ }
+
+ event_unmask_signal_data(s->event, old, s->signal.sig);
+ } else
+ s->priority = priority;
if (s->pending)
prioq_reshuffle(s->event->pending, s, &s->pending_index);
@@ -1482,34 +1636,18 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
}
case SOURCE_SIGNAL:
- assert(need_signal(s->event, s->signal.sig));
-
s->enabled = m;
- if (!need_signal(s->event, s->signal.sig)) {
- assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
-
- (void) event_update_signal_fd(s->event);
- /* If disabling failed, we might get a spurious event,
- * but otherwise nothing bad should happen. */
- }
-
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
break;
case SOURCE_CHILD:
- assert(need_signal(s->event, SIGCHLD));
-
s->enabled = m;
assert(s->event->n_enabled_child_sources > 0);
s->event->n_enabled_child_sources--;
- if (!need_signal(s->event, SIGCHLD)) {
- assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
-
- (void) event_update_signal_fd(s->event);
- }
-
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
break;
case SOURCE_EXIT:
@@ -1555,37 +1693,33 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
}
case SOURCE_SIGNAL:
- /* Check status before enabling. */
- if (!need_signal(s->event, s->signal.sig)) {
- assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
-
- r = event_update_signal_fd(s->event);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
- return r;
- }
- }
s->enabled = m;
+
+ r = event_make_signal_data(s->event, s->signal.sig, NULL);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+ return r;
+ }
+
break;
case SOURCE_CHILD:
- /* Check status before enabling. */
- if (s->enabled == SD_EVENT_OFF) {
- if (!need_signal(s->event, SIGCHLD)) {
- assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
-
- r = event_update_signal_fd(s->event);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
- return r;
- }
- }
+ if (s->enabled == SD_EVENT_OFF)
s->event->n_enabled_child_sources++;
- }
s->enabled = m;
+
+ r = event_make_signal_data(s->event, s->signal.sig, SIGCHLD);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ s->event->n_enabled_child_sources--;
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ return r;
+ }
+
break;
case SOURCE_EXIT:
@@ -2029,20 +2163,35 @@ static int process_child(sd_event *e) {
return 0;
}
-static int process_signal(sd_event *e, uint32_t events) {
+static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
bool read_one = false;
int r;
assert(e);
-
assert_return(events == EPOLLIN, -EIO);
+ /* If there's a signal queued on this priority and SIGCHLD is
+ on this priority too, then make sure to recheck the
+ children we watch. This is because we only ever dequeue
+ the first signal per priority, and if we dequeue one, and
+ SIGCHLD might be enqueued later we wouldn't know, but we
+ might have higher priority children we care about hence we
+ need to check that explicitly. */
+
+ if (sigismember(&d->sigset, SIGCHLD))
+ e->need_process_child = true;
+
+ /* If there's already an event source pending for this
+ * priority we don't read another */
+ if (d->current)
+ return 0;
+
for (;;) {
struct signalfd_siginfo si;
ssize_t n;
sd_event_source *s = NULL;
- n = read(e->signal_fd, &si, sizeof(si));
+ n = read(d->fd, &si, sizeof(si));
if (n < 0) {
if (errno == EAGAIN || errno == EINTR)
return read_one;
@@ -2057,24 +2206,21 @@ static int process_signal(sd_event *e, uint32_t events) {
read_one = true;
- if (si.ssi_signo == SIGCHLD) {
- r = process_child(e);
- if (r < 0)
- return r;
- if (r > 0)
- continue;
- }
-
if (e->signal_sources)
s = e->signal_sources[si.ssi_signo];
-
if (!s)
continue;
+ if (s->pending)
+ continue;
s->signal.siginfo = si;
+ d->current = s;
+
r = source_set_pending(s, true);
if (r < 0)
return r;
+
+ return 1;
}
}
@@ -2393,23 +2539,31 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
for (i = 0; i < m; i++) {
- if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_TIME_REALTIME))
- r = flush_timer(e, e->realtime.fd, ev_queue[i].events, &e->realtime.next);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_TIME_BOOTTIME))
- r = flush_timer(e, e->boottime.fd, ev_queue[i].events, &e->boottime.next);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_TIME_MONOTONIC))
- r = flush_timer(e, e->monotonic.fd, ev_queue[i].events, &e->monotonic.next);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_TIME_REALTIME_ALARM))
- r = flush_timer(e, e->realtime_alarm.fd, ev_queue[i].events, &e->realtime_alarm.next);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_TIME_BOOTTIME_ALARM))
- r = flush_timer(e, e->boottime_alarm.fd, ev_queue[i].events, &e->boottime_alarm.next);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
- r = process_signal(e, ev_queue[i].events);
- else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
+ if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
- else
- r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
+ else {
+ WakeupType *t = ev_queue[i].data.ptr;
+
+ switch (*t) {
+
+ case WAKEUP_EVENT_SOURCE:
+ r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
+ break;
+ case WAKEUP_CLOCK_DATA: {
+ struct clock_data *d = ev_queue[i].data.ptr;
+ r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
+ break;
+ }
+
+ case WAKEUP_SIGNAL_DATA:
+ r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
+ break;
+
+ default:
+ assert_not_reached("Invalid wake-up pointer");
+ }
+ }
if (r < 0)
goto finish;
}
diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c
index 721700b..6bb1420 100644
--- a/src/libsystemd/sd-event/test-event.c
+++ b/src/libsystemd/sd-event/test-event.c
@@ -160,7 +160,7 @@ static int exit_handler(sd_event_source *s, void *userdata) {
return 3;
}
-int main(int argc, char *argv[]) {
+static void test_basic(void) {
sd_event *e = NULL;
sd_event_source *w = NULL, *x = NULL, *y = NULL, *z = NULL, *q = NULL, *t = NULL;
static const char ch = 'x';
@@ -248,6 +248,70 @@ int main(int argc, char *argv[]) {
safe_close_pair(b);
safe_close_pair(d);
safe_close_pair(k);
+}
+
+static int last_rtqueue_sigval = 0;
+static int n_rtqueue = 0;
+
+static int rtqueue_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+ last_rtqueue_sigval = si->ssi_int;
+ n_rtqueue ++;
+ return 0;
+}
+
+static void test_rtqueue(void) {
+ sd_event_source *u = NULL, *v = NULL, *s = NULL;
+ sd_event *e = NULL;
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN+2, SIGRTMIN+3, SIGUSR2, -1) >= 0);
+ assert_se(sd_event_add_signal(e, &u, SIGRTMIN+2, rtqueue_handler, NULL) >= 0);
+ assert_se(sd_event_add_signal(e, &v, SIGRTMIN+3, rtqueue_handler, NULL) >= 0);
+ assert_se(sd_event_add_signal(e, &s, SIGUSR2, rtqueue_handler, NULL) >= 0);
+
+ assert_se(sd_event_source_set_priority(v, -10) >= 0);
+
+ assert(sigqueue(getpid(), SIGRTMIN+2, (union sigval) { .sival_int = 1 }) >= 0);
+ assert(sigqueue(getpid(), SIGRTMIN+3, (union sigval) { .sival_int = 2 }) >= 0);
+ assert(sigqueue(getpid(), SIGUSR2, (union sigval) { .sival_int = 3 }) >= 0);
+ assert(sigqueue(getpid(), SIGRTMIN+3, (union sigval) { .sival_int = 4 }) >= 0);
+ assert(sigqueue(getpid(), SIGUSR2, (union sigval) { .sival_int = 5 }) >= 0);
+
+ assert_se(n_rtqueue == 0);
+ assert_se(last_rtqueue_sigval == 0);
+
+ assert_se(sd_event_run(e, (uint64_t) -1) >= 1);
+ assert_se(n_rtqueue == 1);
+ assert_se(last_rtqueue_sigval == 2); /* first SIGRTMIN+3 */
+
+ assert_se(sd_event_run(e, (uint64_t) -1) >= 1);
+ assert_se(n_rtqueue == 2);
+ assert_se(last_rtqueue_sigval == 4); /* second SIGRTMIN+3 */
+
+ assert_se(sd_event_run(e, (uint64_t) -1) >= 1);
+ assert_se(n_rtqueue == 3);
+ assert_se(last_rtqueue_sigval == 3); /* first SIGUSR2 */
+
+ assert_se(sd_event_run(e, (uint64_t) -1) >= 1);
+ assert_se(n_rtqueue == 4);
+ assert_se(last_rtqueue_sigval == 1); /* SIGRTMIN+2 */
+
+ assert_se(sd_event_run(e, 0) == 0); /* the other SIGUSR2 is dropped, because the first one was still queued */
+ assert_se(n_rtqueue == 4);
+ assert_se(last_rtqueue_sigval == 1);
+
+ sd_event_source_unref(u);
+ sd_event_source_unref(v);
+ sd_event_source_unref(s);
+
+ sd_event_unref(e);
+}
+
+int main(int argc, char *argv[]) {
+
+ test_basic();
+ test_rtqueue();
return 0;
}
--
2.17.1

View File

@ -0,0 +1,216 @@
From ea762f1c0206c99d2ba4d3cba41cadf70311a3cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Sekleta=CC=81r?= <msekleta@redhat.com>
Date: Fri, 23 Oct 2020 18:29:27 +0200
Subject: [PATCH 03/20] sd-event: split out helper functions for reshuffling
prioqs
We typically don't just reshuffle a single prioq at once, but always
two. Let's add two helper functions that do this, and reuse them
everywhere.
(Note that this drops one minor optimization:
sd_event_source_set_time_accuracy() previously only reshuffled the
"latest" prioq, since changing the accuracy has no effect on the
earliest time of an event source, just the latest time an event source
can run. This optimization is removed to simplify things, given that
it's not really worth the effort as prioq_reshuffle() on properly
ordered prioqs has practically zero cost O(1)).
(Slightly generalized, commented and split out of #17284 by Lennart)
(cherry picked from commit e1951c16a8fbe5b0b9ecc08f4f835a806059d28f)
Related: #1819868
[commit 4ce10f8e41a85a56ad9b805442eb1149ece7c82a from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 96 ++++++++++++------------------
1 file changed, 38 insertions(+), 58 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 26ef3ea..eb3182f 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -784,6 +784,33 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig)
event_unmask_signal_data(e, d, sig);
}
+static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
+ assert(s);
+
+ /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
+ * they are enabled/disabled or marked pending and such. */
+
+ if (s->pending)
+ prioq_reshuffle(s->event->pending, s, &s->pending_index);
+
+ if (s->prepare)
+ prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+}
+
+static void event_source_time_prioq_reshuffle(sd_event_source *s) {
+ struct clock_data *d;
+
+ assert(s);
+ assert(EVENT_SOURCE_IS_TIME(s->type));
+
+ /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
+ * pending, enable state. Makes sure the two prioq's are ordered properly again. */
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
+ prioq_reshuffle(d->latest, s, &s->time.latest_index);
+ d->needs_rearm = true;
+}
+
static void source_disconnect(sd_event_source *s) {
sd_event *event;
@@ -905,16 +932,8 @@ static int source_set_pending(sd_event_source *s, bool b) {
} else
assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
- if (EVENT_SOURCE_IS_TIME(s->type)) {
- struct clock_data *d;
-
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
- }
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_reshuffle(s);
if (s->type == SOURCE_SIGNAL && !b) {
struct signal_data *d;
@@ -1570,11 +1589,7 @@ _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority)
} else
s->priority = priority;
- if (s->pending)
- prioq_reshuffle(s->event->pending, s, &s->pending_index);
-
- if (s->prepare)
- prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+ event_source_pp_prioq_reshuffle(s);
if (s->type == SOURCE_EXIT)
prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
@@ -1622,18 +1637,10 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
case SOURCE_TIME_BOOTTIME:
case SOURCE_TIME_MONOTONIC:
case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM: {
- struct clock_data *d;
-
+ case SOURCE_TIME_BOOTTIME_ALARM:
s->enabled = m;
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
+ event_source_time_prioq_reshuffle(s);
break;
- }
case SOURCE_SIGNAL:
s->enabled = m;
@@ -1679,18 +1686,10 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
case SOURCE_TIME_BOOTTIME:
case SOURCE_TIME_MONOTONIC:
case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM: {
- struct clock_data *d;
-
+ case SOURCE_TIME_BOOTTIME_ALARM:
s->enabled = m;
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
+ event_source_time_prioq_reshuffle(s);
break;
- }
case SOURCE_SIGNAL:
@@ -1737,11 +1736,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
}
}
- if (s->pending)
- prioq_reshuffle(s->event->pending, s, &s->pending_index);
-
- if (s->prepare)
- prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+ event_source_pp_prioq_reshuffle(s);
return 0;
}
@@ -1757,7 +1752,6 @@ _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
}
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
- struct clock_data *d;
assert_return(s, -EINVAL);
assert_return(usec != (uint64_t) -1, -EINVAL);
@@ -1769,13 +1763,7 @@ _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
source_set_pending(s, false);
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
-
+ event_source_time_prioq_reshuffle(s);
return 0;
}
@@ -1790,7 +1778,6 @@ _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *use
}
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
- struct clock_data *d;
assert_return(s, -EINVAL);
assert_return(usec != (uint64_t) -1, -EINVAL);
@@ -1805,12 +1792,7 @@ _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec
source_set_pending(s, false);
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
-
+ event_source_time_prioq_reshuffle(s);
return 0;
}
@@ -2088,9 +2070,7 @@ static int process_timer(
if (r < 0)
return r;
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
+ event_source_time_prioq_reshuffle(s);
}
return 0;
--
2.17.1

View File

@ -0,0 +1,50 @@
From 76969d09522ca2ab58bc157eb9ce357af5677f3a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 25 May 2018 17:06:39 +0200
Subject: [PATCH 04/20] sd-event: drop pending events when we turn off/on an
event source
[commit ac989a783a31df95e6c0ce2a90a8d2e1abe73592 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index eb3182f..6e93059 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1623,6 +1623,13 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
if (m == SD_EVENT_OFF) {
+ /* Unset the pending flag when this event source is disabled */
+ if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
+
switch (s->type) {
case SOURCE_IO:
@@ -1672,6 +1679,14 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
}
} else {
+
+ /* Unset the pending flag when this event source is enabled */
+ if (s->enabled == SD_EVENT_OFF && !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
+
switch (s->type) {
case SOURCE_IO:
--
2.17.1

View File

@ -0,0 +1,31 @@
From 7380d2cca8bda0f8c821645f8a5ddb8ac47aec46 Mon Sep 17 00:00:00 2001
From: Thomas Hindoe Paaboel Andersen <phomes@gmail.com>
Date: Sun, 6 Sep 2015 22:06:45 +0200
Subject: [PATCH 05/20] sd-event: fix call to event_make_signal_data
This looks like a typo from commit 9da4cb2b where it was added.
[commit b8a50a99a6e158a5b3ceacf0764dbe9f42558f3e from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 6e93059..7c33dcd 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1726,7 +1726,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
s->enabled = m;
- r = event_make_signal_data(s->event, s->signal.sig, SIGCHLD);
+ r = event_make_signal_data(s->event, s->signal.sig, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
s->event->n_enabled_child_sources--;
--
2.17.1

View File

@ -0,0 +1,37 @@
From 0a2519a5ab04e775115c90039d30bdc576a79c06 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 7 Sep 2015 00:31:24 +0200
Subject: [PATCH 06/20] sd-event: make sure to create a signal queue for the
right signal
We should never access the "signal" part of the event source unless the
event source is actually for a signal. In this case it's a child pid
handler however, hence make sure to use the right signal.
This is a fix for PR #1177, which in turn was a fix for
9da4cb2be260ed123f2676cb85cb350c527b1492.
[commit 10edebf6cd69cfbe0d38dbaf5478264fbb60a51e from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 7c33dcd..2f5ff23 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1726,7 +1726,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
s->enabled = m;
- r = event_make_signal_data(s->event, s->signal.sig, NULL);
+ r = event_make_signal_data(s->event, SIGCHLD, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
s->event->n_enabled_child_sources--;
--
2.17.1

View File

@ -0,0 +1,315 @@
From 477bbfd4f5012613144c5ba5517aa8de1f300da6 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 23 Oct 2020 21:21:58 +0200
Subject: [PATCH 07/20] sd-event: split out enable and disable codepaths from
sd_event_source_set_enabled()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
So far half of sd_event_source_set_enabled() was doing enabling, the
other half was doing disabling. Let's split that into two separate
calls.
(This also adds a new shortcut to sd_event_source_set_enabled(): if the
caller toggles between "ON" and "ONESHOT" we'll now shortcut this, since
the event source is already enabled in that case and shall remain
enabled.)
This heavily borrows from and is inspired by Michal Sekletár's #17284
refactoring.
(cherry picked from commit ddfde737b546c17e54182028153aa7f7e78804e3)
Related: #1819868
[commit d7ad6ad123200f562081ff09f7bed3c6d969ac0a from
https://github.com/systemd-rhel/rhel-8/
LZ: Dropped SOURCE_INOTIFY related parts because it hasn't been added
in this systemd version.]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 224 +++++++++++++++--------------
1 file changed, 118 insertions(+), 106 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 2f5ff23..2e07478 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1606,153 +1606,165 @@ _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
return 0;
}
-_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
+static int event_source_disable(sd_event_source *s) {
int r;
- assert_return(s, -EINVAL);
- assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
- assert_return(!event_pid_changed(s->event), -ECHILD);
+ assert(s);
+ assert(s->enabled != SD_EVENT_OFF);
- /* If we are dead anyway, we are fine with turning off
- * sources, but everything else needs to fail. */
- if (s->event->state == SD_EVENT_FINISHED)
- return m == SD_EVENT_OFF ? 0 : -ESTALE;
+ /* Unset the pending flag when this event source is disabled */
+ if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
- if (s->enabled == m)
- return 0;
+ s->enabled = SD_EVENT_OFF;
- if (m == SD_EVENT_OFF) {
+ switch (s->type) {
- /* Unset the pending flag when this event source is disabled */
- if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
- r = source_set_pending(s, false);
- if (r < 0)
- return r;
- }
+ case SOURCE_IO:
+ source_io_unregister(s);
+ break;
- switch (s->type) {
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ event_source_time_prioq_reshuffle(s);
+ break;
- case SOURCE_IO:
- r = source_io_unregister(s);
- if (r < 0)
- return r;
+ case SOURCE_SIGNAL:
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+ break;
- s->enabled = m;
- break;
+ case SOURCE_CHILD:
+ assert(s->event->n_enabled_child_sources > 0);
+ s->event->n_enabled_child_sources--;
- case SOURCE_TIME_REALTIME:
- case SOURCE_TIME_BOOTTIME:
- case SOURCE_TIME_MONOTONIC:
- case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM:
- s->enabled = m;
- event_source_time_prioq_reshuffle(s);
- break;
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ break;
- case SOURCE_SIGNAL:
- s->enabled = m;
+ case SOURCE_EXIT:
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+ break;
- event_gc_signal_data(s->event, &s->priority, s->signal.sig);
- break;
+ case SOURCE_DEFER:
+ case SOURCE_POST:
+ break;
- case SOURCE_CHILD:
- s->enabled = m;
+ default:
+ assert_not_reached("Wut? I shouldn't exist.");
+ }
- assert(s->event->n_enabled_child_sources > 0);
- s->event->n_enabled_child_sources--;
+ return 0;
+}
- event_gc_signal_data(s->event, &s->priority, SIGCHLD);
- break;
+static int event_source_enable(sd_event_source *s, int m) {
+ int r;
- case SOURCE_EXIT:
- s->enabled = m;
- prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
- break;
+ assert(s);
+ assert(IN_SET(m, SD_EVENT_ON, SD_EVENT_ONESHOT));
+ assert(s->enabled == SD_EVENT_OFF);
- case SOURCE_DEFER:
- case SOURCE_POST:
- s->enabled = m;
- break;
+ /* Unset the pending flag when this event source is enabled */
+ if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
- default:
- assert_not_reached("Wut? I shouldn't exist.");
- }
+ s->enabled = m;
- } else {
+ switch (s->type) {
- /* Unset the pending flag when this event source is enabled */
- if (s->enabled == SD_EVENT_OFF && !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
- r = source_set_pending(s, false);
- if (r < 0)
- return r;
+ case SOURCE_IO:
+ r = source_io_register(s, m, s->io.events);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ return r;
}
- switch (s->type) {
+ break;
- case SOURCE_IO:
- r = source_io_register(s, m, s->io.events);
- if (r < 0)
- return r;
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ event_source_time_prioq_reshuffle(s);
+ break;
- s->enabled = m;
- break;
+ case SOURCE_SIGNAL:
+ r = event_make_signal_data(s->event, s->signal.sig, NULL);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+ return r;
+ }
- case SOURCE_TIME_REALTIME:
- case SOURCE_TIME_BOOTTIME:
- case SOURCE_TIME_MONOTONIC:
- case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM:
- s->enabled = m;
- event_source_time_prioq_reshuffle(s);
- break;
+ break;
- case SOURCE_SIGNAL:
+ case SOURCE_CHILD:
+ s->event->n_enabled_child_sources++;
- s->enabled = m;
+ r = event_make_signal_data(s->event, SIGCHLD, NULL);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ s->event->n_enabled_child_sources--;
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ return r;
+ }
- r = event_make_signal_data(s->event, s->signal.sig, NULL);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
- event_gc_signal_data(s->event, &s->priority, s->signal.sig);
- return r;
- }
- break;
+ break;
- case SOURCE_CHILD:
+ case SOURCE_EXIT:
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+ break;
- if (s->enabled == SD_EVENT_OFF)
- s->event->n_enabled_child_sources++;
+ case SOURCE_DEFER:
+ case SOURCE_POST:
+ break;
- s->enabled = m;
+ default:
+ assert_not_reached("Wut? I shouldn't exist.");
+ }
- r = event_make_signal_data(s->event, SIGCHLD, NULL);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
- s->event->n_enabled_child_sources--;
- event_gc_signal_data(s->event, &s->priority, SIGCHLD);
- return r;
- }
+ return 0;
+}
- break;
+_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
+ int r;
- case SOURCE_EXIT:
- s->enabled = m;
- prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
- break;
+ assert_return(s, -EINVAL);
+ assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
- case SOURCE_DEFER:
- case SOURCE_POST:
- s->enabled = m;
- break;
+ /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
+ if (s->event->state == SD_EVENT_FINISHED)
+ return m == SD_EVENT_OFF ? 0 : -ESTALE;
- default:
- assert_not_reached("Wut? I shouldn't exist.");
+ if (s->enabled == m) /* No change? */
+ return 0;
+
+ if (m == SD_EVENT_OFF)
+ r = event_source_disable(s);
+ else {
+ if (s->enabled != SD_EVENT_OFF) {
+ /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
+ * event source is already enabled after all. */
+ s->enabled = m;
+ return 0;
}
+
+ r = event_source_enable(s, m);
}
+ if (r < 0)
+ return r;
event_source_pp_prioq_reshuffle(s);
-
return 0;
}
--
2.17.1

View File

@ -0,0 +1,73 @@
From 5e365321f3006d44f57bb27ff9de96ca01c1104a Mon Sep 17 00:00:00 2001
From: Evgeny Vereshchagin <evvers@ya.ru>
Date: Sun, 22 Nov 2015 06:41:31 +0000
Subject: [PATCH 08/20] sd-event: use prioq_ensure_allocated where possible
[commit c983e776c4e7e2ea6e1990123d215e639deb353b from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 30 +++++++++++-------------------
1 file changed, 11 insertions(+), 19 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 2e07478..7074520 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -442,11 +442,9 @@ _public_ int sd_event_new(sd_event** ret) {
e->original_pid = getpid();
e->perturb = USEC_INFINITY;
- e->pending = prioq_new(pending_prioq_compare);
- if (!e->pending) {
- r = -ENOMEM;
+ r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
+ if (r < 0)
goto fail;
- }
e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (e->epoll_fd < 0) {
@@ -1096,17 +1094,13 @@ _public_ int sd_event_add_time(
d = event_get_clock_data(e, type);
assert(d);
- if (!d->earliest) {
- d->earliest = prioq_new(earliest_time_prioq_compare);
- if (!d->earliest)
- return -ENOMEM;
- }
+ r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
+ if (r < 0)
+ return r;
- if (!d->latest) {
- d->latest = prioq_new(latest_time_prioq_compare);
- if (!d->latest)
- return -ENOMEM;
- }
+ r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
+ if (r < 0)
+ return r;
if (d->fd < 0) {
r = event_setup_timer_fd(e, d, clock);
@@ -1357,11 +1351,9 @@ _public_ int sd_event_add_exit(
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(e), -ECHILD);
- if (!e->exit) {
- e->exit = prioq_new(exit_prioq_compare);
- if (!e->exit)
- return -ENOMEM;
- }
+ r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
+ if (r < 0)
+ return r;
s = source_new(e, !ret, SOURCE_EXIT);
if (!s)
--
2.17.1

View File

@ -0,0 +1,79 @@
From 77b772bce846db28dc447420fd380a51eadcde15 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 23 Nov 2020 11:40:24 +0100
Subject: [PATCH 09/20] sd-event: split clock data allocation out of
sd_event_add_time()
Just some simple refactoring, that will make things easier for us later.
But it looks better this way even without the later function reuse.
(cherry picked from commit 41c63f36c3352af8bebf03b6181f5d866431d0af)
Related: #1819868
[commit 6cc0022115afbac9ac66c456b140601d90271687 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 34 ++++++++++++++++++++----------
1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 7074520..8e6536f 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1065,6 +1065,28 @@ static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata)
return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
+static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
+ int r;
+
+ assert(d);
+
+ if (d->fd < 0) {
+ r = event_setup_timer_fd(e, d, clock);
+ if (r < 0)
+ return r;
+ }
+
+ r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
+ if (r < 0)
+ return r;
+
+ r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
_public_ int sd_event_add_time(
sd_event *e,
sd_event_source **ret,
@@ -1094,20 +1116,10 @@ _public_ int sd_event_add_time(
d = event_get_clock_data(e, type);
assert(d);
- r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
- if (r < 0)
- return r;
-
- r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
+ r = setup_clock_data(e, d, clock);
if (r < 0)
return r;
- if (d->fd < 0) {
- r = event_setup_timer_fd(e, d, clock);
- if (r < 0)
- return r;
- }
-
s = source_new(e, !ret, type);
if (!s)
return -ENOMEM;
--
2.17.1

View File

@ -0,0 +1,120 @@
From dad1d000b493f98f4f5eaf4bfa34c8617f41970f Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 23 Nov 2020 15:25:35 +0100
Subject: [PATCH 10/20] sd-event: split out code to add/remove timer event
sources to earliest/latest prioq
Just some refactoring that makes code prettier, and will come in handy
later, because we can reuse these functions at more places.
(cherry picked from commit 1e45e3fecc303e7ae9946220c742f69675e99c34)
Related: #1819868
[commit 88b2618e4de850060a1c5c22b049e6de0578fbb5 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 57 +++++++++++++++++++++---------
1 file changed, 41 insertions(+), 16 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 8e6536f..e0e0eaa 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -809,6 +809,19 @@ static void event_source_time_prioq_reshuffle(sd_event_source *s) {
d->needs_rearm = true;
}
+static void event_source_time_prioq_remove(
+ sd_event_source *s,
+ struct clock_data *d) {
+
+ assert(s);
+ assert(d);
+
+ prioq_remove(d->earliest, s, &s->time.earliest_index);
+ prioq_remove(d->latest, s, &s->time.latest_index);
+ s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
+ d->needs_rearm = true;
+}
+
static void source_disconnect(sd_event_source *s) {
sd_event *event;
@@ -833,13 +846,8 @@ static void source_disconnect(sd_event_source *s) {
case SOURCE_TIME_REALTIME_ALARM:
case SOURCE_TIME_BOOTTIME_ALARM: {
struct clock_data *d;
-
- d = event_get_clock_data(s->event, s->type);
- assert(d);
-
- prioq_remove(d->earliest, s, &s->time.earliest_index);
- prioq_remove(d->latest, s, &s->time.latest_index);
- d->needs_rearm = true;
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ event_source_time_prioq_remove(s, d);
break;
}
@@ -1087,6 +1095,30 @@ static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock)
return 0;
}
+static int event_source_time_prioq_put(
+ sd_event_source *s,
+ struct clock_data *d) {
+
+ int r;
+
+ assert(s);
+ assert(d);
+
+ r = prioq_put(d->earliest, s, &s->time.earliest_index);
+ if (r < 0)
+ return r;
+
+ r = prioq_put(d->latest, s, &s->time.latest_index);
+ if (r < 0) {
+ assert_se(prioq_remove(d->earliest, s, &s->time.earliest_index) > 0);
+ s->time.earliest_index = PRIOQ_IDX_NULL;
+ return r;
+ }
+
+ d->needs_rearm = true;
+ return 0;
+}
+
_public_ int sd_event_add_time(
sd_event *e,
sd_event_source **ret,
@@ -1113,8 +1145,7 @@ _public_ int sd_event_add_time(
type = clock_to_event_source_type(clock);
assert_return(type >= 0, -ENOTSUP);
- d = event_get_clock_data(e, type);
- assert(d);
+ assert_se(d = event_get_clock_data(e, type));
r = setup_clock_data(e, d, clock);
if (r < 0)
@@ -1131,13 +1162,7 @@ _public_ int sd_event_add_time(
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
- d->needs_rearm = true;
-
- r = prioq_put(d->earliest, s, &s->time.earliest_index);
- if (r < 0)
- goto fail;
-
- r = prioq_put(d->latest, s, &s->time.latest_index);
+ r = event_source_time_prioq_put(s, d);
if (r < 0)
goto fail;
--
2.17.1

View File

@ -0,0 +1,126 @@
From 6dc0338be9020eebcbfafe078a46bc7be8e4a2ff Mon Sep 17 00:00:00 2001
From: Tom Gundersen <teg@jklm.no>
Date: Sat, 14 Mar 2015 11:47:35 +0100
Subject: [PATCH 11/20] sd-event: rename PASSIVE/PREPARED to INITIAL/ARMED
[commit 2b0c9ef7352dae53ee746c32033999c1346633b3 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 22 +++++++++++-----------
src/systemd/sd-event.h | 4 ++--
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index e0e0eaa..299312a 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -2423,7 +2423,7 @@ static int dispatch_exit(sd_event *e) {
r = source_dispatch(p);
- e->state = SD_EVENT_PASSIVE;
+ e->state = SD_EVENT_INITIAL;
sd_event_unref(e);
return r;
@@ -2492,7 +2492,7 @@ _public_ int sd_event_prepare(sd_event *e) {
assert_return(e, -EINVAL);
assert_return(!event_pid_changed(e), -ECHILD);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
- assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
if (e->exit_requested)
goto pending;
@@ -2526,15 +2526,15 @@ _public_ int sd_event_prepare(sd_event *e) {
if (event_next_pending(e) || e->need_process_child)
goto pending;
- e->state = SD_EVENT_PREPARED;
+ e->state = SD_EVENT_ARMED;
return 0;
pending:
- e->state = SD_EVENT_PREPARED;
+ e->state = SD_EVENT_ARMED;
r = sd_event_wait(e, 0);
if (r == 0)
- e->state = SD_EVENT_PREPARED;
+ e->state = SD_EVENT_ARMED;
return r;
}
@@ -2547,7 +2547,7 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
assert_return(e, -EINVAL);
assert_return(!event_pid_changed(e), -ECHILD);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
- assert_return(e->state == SD_EVENT_PREPARED, -EBUSY);
+ assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
if (e->exit_requested) {
e->state = SD_EVENT_PENDING;
@@ -2643,7 +2643,7 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
r = 0;
finish:
- e->state = SD_EVENT_PASSIVE;
+ e->state = SD_EVENT_INITIAL;
return r;
}
@@ -2666,14 +2666,14 @@ _public_ int sd_event_dispatch(sd_event *e) {
e->state = SD_EVENT_RUNNING;
r = source_dispatch(p);
- e->state = SD_EVENT_PASSIVE;
+ e->state = SD_EVENT_INITIAL;
sd_event_unref(e);
return r;
}
- e->state = SD_EVENT_PASSIVE;
+ e->state = SD_EVENT_INITIAL;
return 1;
}
@@ -2684,7 +2684,7 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
assert_return(e, -EINVAL);
assert_return(!event_pid_changed(e), -ECHILD);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
- assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
r = sd_event_prepare(e);
if (r > 0)
@@ -2704,7 +2704,7 @@ _public_ int sd_event_loop(sd_event *e) {
assert_return(e, -EINVAL);
assert_return(!event_pid_changed(e), -ECHILD);
- assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
sd_event_ref(e);
diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h
index 4957f3a..ffde7c8 100644
--- a/src/systemd/sd-event.h
+++ b/src/systemd/sd-event.h
@@ -51,8 +51,8 @@ enum {
};
enum {
- SD_EVENT_PASSIVE,
- SD_EVENT_PREPARED,
+ SD_EVENT_INITIAL,
+ SD_EVENT_ARMED,
SD_EVENT_PENDING,
SD_EVENT_RUNNING,
SD_EVENT_EXITING,
--
2.17.1

View File

@ -0,0 +1,39 @@
From 01c94571660c44c415ba8bcba62176f45bf84be5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 30 Oct 2019 20:26:50 +0100
Subject: [PATCH 12/20] sd-event: refuse running default event loops in any
other thread than the one they are default for
(cherry picked from commit e544601536ac13a288d7476f4400c7b0f22b7ea1)
Related: #1819868
[commit 4c5fdbde7e745126f31542a70b45cc4faec094d2 from
https://github.com/systemd-rhel/rhel-8/
LZ: Dropped the part that won't affect code to simplify the merging.]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 299312a..a2f7868 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -2494,6 +2494,11 @@ _public_ int sd_event_prepare(sd_event *e) {
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
+ /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
+ * this check here once, since gettid() is typically not cached, and thus want to minimize
+ * syscalls */
+ assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
+
if (e->exit_requested)
goto pending;
--
2.17.1

View File

@ -0,0 +1,106 @@
From f72ca8a711fc406dc52f18c7dbc3bfc5397b26ea Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 23 Nov 2020 17:49:27 +0100
Subject: [PATCH 13/20] sd-event: remove earliest_index/latest_index into
common part of event source objects
So far we used these fields to organize the earliest/latest timer event
priority queue. In a follow-up commit we want to introduce ratelimiting
to event sources, at which point we want any kind of event source to be
able to trigger time wakeups, and hence they all need to be included in
the earliest/latest prioqs. Thus, in preparation let's make this
generic.
No change in behaviour, just some shifting around of struct members from
the type-specific to the generic part.
(cherry picked from commit f41315fceb5208c496145cda2d6c865a5458ce44)
Related: #1819868
[commit 97f599bf57fdaee688ae5750e9b2b2587e2b597a from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index a2f7868..82cb9ad 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -94,6 +94,9 @@ struct sd_event_source {
LIST_FIELDS(sd_event_source, sources);
+ unsigned earliest_index;
+ unsigned latest_index;
+
union {
struct {
sd_event_io_handler_t callback;
@@ -105,8 +108,6 @@ struct sd_event_source {
struct {
sd_event_time_handler_t callback;
usec_t next, accuracy;
- unsigned earliest_index;
- unsigned latest_index;
} time;
struct {
sd_event_signal_handler_t callback;
@@ -804,8 +805,8 @@ static void event_source_time_prioq_reshuffle(sd_event_source *s) {
/* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
* pending, enable state. Makes sure the two prioq's are ordered properly again. */
assert_se(d = event_get_clock_data(s->event, s->type));
- prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
- prioq_reshuffle(d->latest, s, &s->time.latest_index);
+ prioq_reshuffle(d->earliest, s, &s->earliest_index);
+ prioq_reshuffle(d->latest, s, &s->latest_index);
d->needs_rearm = true;
}
@@ -816,9 +817,9 @@ static void event_source_time_prioq_remove(
assert(s);
assert(d);
- prioq_remove(d->earliest, s, &s->time.earliest_index);
- prioq_remove(d->latest, s, &s->time.latest_index);
- s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
+ prioq_remove(d->earliest, s, &s->earliest_index);
+ prioq_remove(d->latest, s, &s->latest_index);
+ s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
d->needs_rearm = true;
}
@@ -1104,14 +1105,14 @@ static int event_source_time_prioq_put(
assert(s);
assert(d);
- r = prioq_put(d->earliest, s, &s->time.earliest_index);
+ r = prioq_put(d->earliest, s, &s->earliest_index);
if (r < 0)
return r;
- r = prioq_put(d->latest, s, &s->time.latest_index);
+ r = prioq_put(d->latest, s, &s->latest_index);
if (r < 0) {
- assert_se(prioq_remove(d->earliest, s, &s->time.earliest_index) > 0);
- s->time.earliest_index = PRIOQ_IDX_NULL;
+ assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
+ s->earliest_index = PRIOQ_IDX_NULL;
return r;
}
@@ -1158,7 +1159,7 @@ _public_ int sd_event_add_time(
s->time.next = usec;
s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
s->time.callback = callback;
- s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
+ s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
--
2.17.1

View File

@ -0,0 +1,125 @@
From ad89da1e00919c510596dac78741c98052b1e2f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= <zbyszek@in.waw.pl>
Date: Tue, 10 Nov 2020 10:38:37 +0100
Subject: [PATCH 14/20] sd-event: update state at the end in
event_source_enable
Coverity in CID#1435966 was complaining that s->enabled is not "restored" in
all cases. But the code was actually correct, since it should only be
"restored" in the error paths. But let's still make this prettier by not setting
the state before all operations that may fail are done.
We need to set .enabled for the prioq reshuffling operations, so move those down.
No functional change intended.
(cherry picked from commit d2eafe61ca07f8300dc741a0491a914213fa2b6b)
Related: #1819868
[commit deb9e6ad3a1d7cfbc3b53d1e74cda6ae398a90fd from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 51 +++++++++++++++++-------------
1 file changed, 29 insertions(+), 22 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 82cb9ad..3ff15a2 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1691,11 +1691,11 @@ static int event_source_disable(sd_event_source *s) {
return 0;
}
-static int event_source_enable(sd_event_source *s, int m) {
+static int event_source_enable(sd_event_source *s, int enable) {
int r;
assert(s);
- assert(IN_SET(m, SD_EVENT_ON, SD_EVENT_ONESHOT));
+ assert(IN_SET(enable, SD_EVENT_ON, SD_EVENT_ONESHOT));
assert(s->enabled == SD_EVENT_OFF);
/* Unset the pending flag when this event source is enabled */
@@ -1705,31 +1705,16 @@ static int event_source_enable(sd_event_source *s, int m) {
return r;
}
- s->enabled = m;
-
switch (s->type) {
-
case SOURCE_IO:
- r = source_io_register(s, m, s->io.events);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
+ r = source_io_register(s, enable, s->io.events);
+ if (r < 0)
return r;
- }
-
- break;
-
- case SOURCE_TIME_REALTIME:
- case SOURCE_TIME_BOOTTIME:
- case SOURCE_TIME_MONOTONIC:
- case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM:
- event_source_time_prioq_reshuffle(s);
break;
case SOURCE_SIGNAL:
r = event_make_signal_data(s->event, s->signal.sig, NULL);
if (r < 0) {
- s->enabled = SD_EVENT_OFF;
event_gc_signal_data(s->event, &s->priority, s->signal.sig);
return r;
}
@@ -1750,10 +1735,12 @@ static int event_source_enable(sd_event_source *s, int m) {
break;
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
case SOURCE_EXIT:
- prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
- break;
-
case SOURCE_DEFER:
case SOURCE_POST:
break;
@@ -1762,6 +1749,26 @@ static int event_source_enable(sd_event_source *s, int m) {
assert_not_reached("Wut? I shouldn't exist.");
}
+ s->enabled = enable;
+
+ /* Non-failing operations below */
+ switch (s->type) {
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ event_source_time_prioq_reshuffle(s);
+ break;
+
+ case SOURCE_EXIT:
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+ break;
+
+ default:
+ break;
+ }
+
return 0;
}
--
2.17.1

View File

@ -0,0 +1,44 @@
From 04e2ffb437b301963804e6d199be1196d1b4307b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= <zbyszek@in.waw.pl>
Date: Tue, 10 Nov 2020 12:57:34 +0100
Subject: [PATCH 15/20] sd-event: increase n_enabled_child_sources just once
Neither source_child_pidfd_register() nor event_make_signal_data() look at
n_enabled_child_sources.
(cherry picked from commit ac9f2640cb9c107b43f47bba7e068d3b92b5337b)
Related: #1819868
[commit 188465c472996b426a1f22a9fc46d031b722c3b4 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 3ff15a2..e34fd0b 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1722,8 +1722,6 @@ static int event_source_enable(sd_event_source *s, int enable) {
break;
case SOURCE_CHILD:
- s->event->n_enabled_child_sources++;
-
r = event_make_signal_data(s->event, SIGCHLD, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
@@ -1732,6 +1730,7 @@ static int event_source_enable(sd_event_source *s, int enable) {
return r;
}
+ s->event->n_enabled_child_sources++;
break;
--
2.17.1

View File

@ -0,0 +1,97 @@
From 2d07173304abd3f1d3fae5e0f01bf5874b1f04db Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Tue, 29 Sep 2015 20:56:17 +0200
Subject: [PATCH 16/20] sd-event: don't provide priority stability
Currently, we guarantee that if two event-sources with the same priority
fire at the same time, they're always dispatched in the same order. While
this might sound nice in theory, there is little benefit in providing
stability on that level. We have no control over the order the events are
reported, hence, we cannot guarantee that we get notified about both at
the same time.
By dropping the stability guarantee, we lose roughly 10% of the heap swaps in
the prioq on a desktop cold-boot. Krzysztof Kotlenga even reported up to
20% on his tests. This sounds worth optimizing, so drop the stability
guarantee.
[commit 6fe869c251790a0e3cef5b243169dda363723f49 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 30 ------------------------------
1 file changed, 30 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index e34fd0b..6304991 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -243,12 +243,6 @@ static int pending_prioq_compare(const void *a, const void *b) {
if (x->pending_iteration > y->pending_iteration)
return 1;
- /* Stability for the rest */
- if (x < y)
- return -1;
- if (x > y)
- return 1;
-
return 0;
}
@@ -278,12 +272,6 @@ static int prepare_prioq_compare(const void *a, const void *b) {
if (x->priority > y->priority)
return 1;
- /* Stability for the rest */
- if (x < y)
- return -1;
- if (x > y)
- return 1;
-
return 0;
}
@@ -311,12 +299,6 @@ static int earliest_time_prioq_compare(const void *a, const void *b) {
if (x->time.next > y->time.next)
return 1;
- /* Stability for the rest */
- if (x < y)
- return -1;
- if (x > y)
- return 1;
-
return 0;
}
@@ -344,12 +326,6 @@ static int latest_time_prioq_compare(const void *a, const void *b) {
if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
return 1;
- /* Stability for the rest */
- if (x < y)
- return -1;
- if (x > y)
- return 1;
-
return 0;
}
@@ -371,12 +347,6 @@ static int exit_prioq_compare(const void *a, const void *b) {
if (x->priority > y->priority)
return 1;
- /* Stability for the rest */
- if (x < y)
- return -1;
- if (x > y)
- return 1;
-
return 0;
}
--
2.17.1

View File

@ -0,0 +1,53 @@
From cf0a396c411c78d0d477d2226f89884df207aec2 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 1 Feb 2016 00:19:14 +0100
Subject: [PATCH 17/20] sd-event: when determining the last allowed time a time
event may elapse, deal with overflows
[commit 1bce0ffa66f329bd50d8bfaa943a755caa65b269 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 6304991..63f77ac 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -302,6 +302,10 @@ static int earliest_time_prioq_compare(const void *a, const void *b) {
return 0;
}
+static usec_t time_event_source_latest(const sd_event_source *s) {
+ return usec_add(s->time.next, s->time.accuracy);
+}
+
static int latest_time_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
@@ -321,9 +325,9 @@ static int latest_time_prioq_compare(const void *a, const void *b) {
return 1;
/* Order by time */
- if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
+ if (time_event_source_latest(x) < time_event_source_latest(y))
return -1;
- if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
+ if (time_event_source_latest(x) > time_event_source_latest(y))
return 1;
return 0;
@@ -2014,7 +2018,7 @@ static int event_arm_timer(
b = prioq_peek(d->latest);
assert_se(b && b->enabled != SD_EVENT_OFF);
- t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
+ t = sleep_between(e, a->time.next, time_event_source_latest(b));
if (d->next == t)
return 0;
--
2.17.1

View File

@ -0,0 +1,60 @@
From c0521bcf58da1857a2077cd3b3abc330bab33598 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 1 Feb 2016 00:20:18 +0100
Subject: [PATCH 18/20] sd-event: permit a USEC_INFINITY timeout as an
alternative to a disabling an event source
This should simplify handling of time events in clients and is in-line with the USEC_INFINITY macro we already have.
This way setting a timeout to 0 indicates "elapse immediately", and a timeout of USEC_INFINITY "elapse never".
[commit 393003e1debf7c7f75beaacbd532b92c3e3dc729 from
https://github.com/systemd-rhel/rhel-8/
LZ: Dropped the part that won't affect code to simplify the merging.]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 63f77ac..69dd02b 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1109,7 +1109,6 @@ _public_ int sd_event_add_time(
int r;
assert_return(e, -EINVAL);
- assert_return(usec != (uint64_t) -1, -EINVAL);
assert_return(accuracy != (uint64_t) -1, -EINVAL);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(e), -ECHILD);
@@ -1791,7 +1790,6 @@ _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
assert_return(s, -EINVAL);
- assert_return(usec != (uint64_t) -1, -EINVAL);
assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(s->event), -ECHILD);
@@ -1909,6 +1907,8 @@ static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
if (a <= 0)
return 0;
+ if (a >= USEC_INFINITY)
+ return USEC_INFINITY;
if (b <= a + 1)
return a;
@@ -1998,7 +1998,7 @@ static int event_arm_timer(
d->needs_rearm = false;
a = prioq_peek(d->earliest);
- if (!a || a->enabled == SD_EVENT_OFF) {
+ if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
if (d->fd < 0)
return 0;
--
2.17.1

View File

@ -0,0 +1,841 @@
From 69266c451910d2b57313b2fe7561e07cd5400d27 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 23 Nov 2020 18:02:40 +0100
Subject: [PATCH 19/20] sd-event: add ability to ratelimit event sources
Let's add a concept of "rate limiting" to event sources: if specific event
sources fire too often in some time interval, temporarily take them
offline, and take them back online once the interval has passed.
This is a simple scheme of avoiding starvation of event sources if some
event source fires too often.
This introduces the new conceptual states of "offline" and "online" for
event sources: an event source is "online" only when enabled *and* not
ratelimited, and offline in all other cases. An event source that is
online hence has its fds registered in the epoll, its signals in the
signalfd and so on.
(cherry picked from commit b6d5481b3d9f7c9b1198ab54b54326ec73e855bf)
Related: #1819868
[commit 395eb7753a9772f505102fbbe3ba3261b57abbe9 from
https://github.com/systemd-rhel/rhel-8/
LZ: Moved the changes in libsystemd.sym to libsystemd.sym.m4 from the
file changing history; patch ratelimit.h in its old path; dropped
SOURCE_INOTIFY related parts in sd-event.c because it hasn't been
added in this systemd version.]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/libsystemd.sym.m4 | 7 +
src/libsystemd/sd-event/sd-event.c | 427 +++++++++++++++++++++++------
src/shared/ratelimit.h | 8 +
src/systemd/sd-event.h | 3 +
4 files changed, 365 insertions(+), 80 deletions(-)
diff --git a/src/libsystemd/libsystemd.sym.m4 b/src/libsystemd/libsystemd.sym.m4
index b1c2b43..ceb5d7f 100644
--- a/src/libsystemd/libsystemd.sym.m4
+++ b/src/libsystemd/libsystemd.sym.m4
@@ -169,6 +169,13 @@ global:
sd_journal_has_persistent_files;
} LIBSYSTEMD_219;
+LIBSYSTEMD_248 {
+global:
+ sd_event_source_set_ratelimit;
+ sd_event_source_get_ratelimit;
+ sd_event_source_is_ratelimited;
+} LIBSYSTEMD_229;
+
m4_ifdef(`ENABLE_KDBUS',
LIBSYSTEMD_FUTURE {
global:
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 69dd02b..a3ade40 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -32,6 +32,7 @@
#include "util.h"
#include "time-util.h"
#include "missing.h"
+#include "ratelimit.h"
#include "set.h"
#include "list.h"
@@ -67,7 +68,24 @@ typedef enum WakeupType {
_WAKEUP_TYPE_INVALID = -1,
} WakeupType;
-#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
+#define EVENT_SOURCE_IS_TIME(t) \
+ IN_SET((t), \
+ SOURCE_TIME_REALTIME, \
+ SOURCE_TIME_BOOTTIME, \
+ SOURCE_TIME_MONOTONIC, \
+ SOURCE_TIME_REALTIME_ALARM, \
+ SOURCE_TIME_BOOTTIME_ALARM)
+
+#define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
+ IN_SET((t), \
+ SOURCE_IO, \
+ SOURCE_TIME_REALTIME, \
+ SOURCE_TIME_BOOTTIME, \
+ SOURCE_TIME_MONOTONIC, \
+ SOURCE_TIME_REALTIME_ALARM, \
+ SOURCE_TIME_BOOTTIME_ALARM, \
+ SOURCE_SIGNAL, \
+ SOURCE_DEFER)
struct sd_event_source {
WakeupType wakeup;
@@ -85,6 +103,7 @@ struct sd_event_source {
bool pending:1;
bool dispatching:1;
bool floating:1;
+ bool ratelimited:1;
int64_t priority;
unsigned pending_index;
@@ -94,6 +113,10 @@ struct sd_event_source {
LIST_FIELDS(sd_event_source, sources);
+ RateLimit rate_limit;
+
+ /* These are primarily fields relevant for time event sources, but since any event source can
+ * effectively become one when rate-limited, this is part of the common fields. */
unsigned earliest_index;
unsigned latest_index;
@@ -188,7 +211,7 @@ struct sd_event {
Hashmap *signal_data; /* indexed by priority */
Hashmap *child_sources;
- unsigned n_enabled_child_sources;
+ unsigned n_online_child_sources;
Set *post_sources;
@@ -219,8 +242,19 @@ struct sd_event {
static void source_disconnect(sd_event_source *s);
+static bool event_source_is_online(sd_event_source *s) {
+ assert(s);
+ return s->enabled != SD_EVENT_OFF && !s->ratelimited;
+}
+
+static bool event_source_is_offline(sd_event_source *s) {
+ assert(s);
+ return s->enabled == SD_EVENT_OFF || s->ratelimited;
+}
+
static int pending_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
+ int r;
assert(x->pending);
assert(y->pending);
@@ -231,23 +265,23 @@ static int pending_prioq_compare(const void *a, const void *b) {
if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
return 1;
+ /* Non rate-limited ones first. */
+ r = CMP(!!x->ratelimited, !!y->ratelimited);
+ if (r != 0)
+ return r;
+
/* Lower priority values first */
- if (x->priority < y->priority)
- return -1;
- if (x->priority > y->priority)
- return 1;
+ r = CMP(x->priority, y->priority);
+ if (r != 0)
+ return r;
/* Older entries first */
- if (x->pending_iteration < y->pending_iteration)
- return -1;
- if (x->pending_iteration > y->pending_iteration)
- return 1;
-
- return 0;
+ return CMP(x->pending_iteration, y->pending_iteration);
}
static int prepare_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
+ int r;
assert(x->prepare);
assert(y->prepare);
@@ -258,29 +292,46 @@ static int prepare_prioq_compare(const void *a, const void *b) {
if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
return 1;
+ /* Non rate-limited ones first. */
+ r = CMP(!!x->ratelimited, !!y->ratelimited);
+ if (r != 0)
+ return r;
+
/* Move most recently prepared ones last, so that we can stop
* preparing as soon as we hit one that has already been
* prepared in the current iteration */
- if (x->prepare_iteration < y->prepare_iteration)
- return -1;
- if (x->prepare_iteration > y->prepare_iteration)
- return 1;
+ r = CMP(x->prepare_iteration, y->prepare_iteration);
+ if (r != 0)
+ return r;
/* Lower priority values first */
- if (x->priority < y->priority)
- return -1;
- if (x->priority > y->priority)
- return 1;
+ return CMP(x->priority, y->priority);
+}
- return 0;
+static usec_t time_event_source_next(const sd_event_source *s) {
+ assert(s);
+
+ /* We have two kinds of event sources that have elapsation times associated with them: the actual
+ * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
+ * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
+ * looking at here. */
+
+ if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
+ assert(s->rate_limit.begin != 0);
+ assert(s->rate_limit.interval != 0);
+ return usec_add(s->rate_limit.begin, s->rate_limit.interval);
+ }
+
+ /* Otherwise this must be a time event source, if not ratelimited */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ return s->time.next;
+
+ return USEC_INFINITY;
}
static int earliest_time_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
- assert(EVENT_SOURCE_IS_TIME(x->type));
- assert(x->type == y->type);
-
/* Enabled ones first */
if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
return -1;
@@ -294,24 +345,30 @@ static int earliest_time_prioq_compare(const void *a, const void *b) {
return 1;
/* Order by time */
- if (x->time.next < y->time.next)
- return -1;
- if (x->time.next > y->time.next)
- return 1;
-
- return 0;
+ return CMP(time_event_source_next(x), time_event_source_next(y));
}
static usec_t time_event_source_latest(const sd_event_source *s) {
- return usec_add(s->time.next, s->time.accuracy);
+ assert(s);
+
+ if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
+ * same, as we should avoid adding additional inaccuracy on an inaccuracy time
+ * window */
+ assert(s->rate_limit.begin != 0);
+ assert(s->rate_limit.interval != 0);
+ return usec_add(s->rate_limit.begin, s->rate_limit.interval);
+ }
+
+ /* Must be a time event source, if not ratelimited */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ return usec_add(s->time.next, s->time.accuracy);
+
+ return USEC_INFINITY;
}
static int latest_time_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
- assert(EVENT_SOURCE_IS_TIME(x->type));
- assert(x->type == y->type);
-
/* Enabled ones first */
if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
return -1;
@@ -722,12 +779,12 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig)
* the signalfd for it. */
if (sig == SIGCHLD &&
- e->n_enabled_child_sources > 0)
+ e->n_online_child_sources > 0)
return;
if (e->signal_sources &&
e->signal_sources[sig] &&
- e->signal_sources[sig]->enabled != SD_EVENT_OFF)
+ event_source_is_online(e->signal_sources[sig]))
return;
/*
@@ -774,11 +831,17 @@ static void event_source_time_prioq_reshuffle(sd_event_source *s) {
struct clock_data *d;
assert(s);
- assert(EVENT_SOURCE_IS_TIME(s->type));
/* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
* pending, enable state. Makes sure the two prioq's are ordered properly again. */
- assert_se(d = event_get_clock_data(s->event, s->type));
+
+ if (s->ratelimited)
+ d = &s->event->monotonic;
+ else {
+ assert(EVENT_SOURCE_IS_TIME(s->type));
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ }
+
prioq_reshuffle(d->earliest, s, &s->earliest_index);
prioq_reshuffle(d->latest, s, &s->latest_index);
d->needs_rearm = true;
@@ -819,12 +882,18 @@ static void source_disconnect(sd_event_source *s) {
case SOURCE_TIME_BOOTTIME:
case SOURCE_TIME_MONOTONIC:
case SOURCE_TIME_REALTIME_ALARM:
- case SOURCE_TIME_BOOTTIME_ALARM: {
- struct clock_data *d;
- assert_se(d = event_get_clock_data(s->event, s->type));
- event_source_time_prioq_remove(s, d);
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ /* Only remove this event source from the time event source here if it is not ratelimited. If
+ * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
+ * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
+
+ if (!s->ratelimited) {
+ struct clock_data *d;
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ event_source_time_prioq_remove(s, d);
+ }
+
break;
- }
case SOURCE_SIGNAL:
if (s->signal.sig > 0) {
@@ -839,9 +908,9 @@ static void source_disconnect(sd_event_source *s) {
case SOURCE_CHILD:
if (s->child.pid > 0) {
- if (s->enabled != SD_EVENT_OFF) {
- assert(s->event->n_enabled_child_sources > 0);
- s->event->n_enabled_child_sources--;
+ if (event_source_is_online(s)) {
+ assert(s->event->n_online_child_sources > 0);
+ s->event->n_online_child_sources--;
}
(void) hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
@@ -872,6 +941,9 @@ static void source_disconnect(sd_event_source *s) {
if (s->prepare)
prioq_remove(s->event->prepare, s, &s->prepare_index);
+ if (s->ratelimited)
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+
event = s->event;
s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
@@ -1259,11 +1331,11 @@ _public_ int sd_event_add_child(
return r;
}
- e->n_enabled_child_sources ++;
+ e->n_online_child_sources++;
r = event_make_signal_data(e, SIGCHLD, NULL);
if (r < 0) {
- e->n_enabled_child_sources--;
+ e->n_online_child_sources--;
source_free(s);
return r;
}
@@ -1476,7 +1548,7 @@ _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
if (s->io.fd == fd)
return 0;
- if (s->enabled == SD_EVENT_OFF) {
+ if (event_source_is_offline(s)) {
s->io.fd = fd;
s->io.registered = false;
} else {
@@ -1524,7 +1596,7 @@ _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events)
if (s->io.events == events && !(events & EPOLLET))
return 0;
- if (s->enabled != SD_EVENT_OFF) {
+ if (event_source_is_online(s)) {
r = source_io_register(s, s->enabled, events);
if (r < 0)
return r;
@@ -1572,7 +1644,7 @@ _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority)
if (s->priority == priority)
return 0;
- if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
+ if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
struct signal_data *old, *d;
/* Move us from the signalfd belonging to the old
@@ -1609,20 +1681,29 @@ _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
return 0;
}
-static int event_source_disable(sd_event_source *s) {
+static int event_source_offline(
+ sd_event_source *s,
+ int enabled,
+ bool ratelimited) {
+
+ bool was_offline;
int r;
assert(s);
- assert(s->enabled != SD_EVENT_OFF);
+ assert(enabled == SD_EVENT_OFF || ratelimited);
/* Unset the pending flag when this event source is disabled */
- if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ if (s->enabled != SD_EVENT_OFF &&
+ enabled == SD_EVENT_OFF &&
+ !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
r = source_set_pending(s, false);
if (r < 0)
return r;
}
- s->enabled = SD_EVENT_OFF;
+ was_offline = event_source_is_offline(s);
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
switch (s->type) {
@@ -1643,8 +1724,10 @@ static int event_source_disable(sd_event_source *s) {
break;
case SOURCE_CHILD:
- assert(s->event->n_enabled_child_sources > 0);
- s->event->n_enabled_child_sources--;
+ if (!was_offline) {
+ assert(s->event->n_online_child_sources > 0);
+ s->event->n_online_child_sources--;
+ }
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
break;
@@ -1661,26 +1744,42 @@ static int event_source_disable(sd_event_source *s) {
assert_not_reached("Wut? I shouldn't exist.");
}
- return 0;
+ return 1;
}
-static int event_source_enable(sd_event_source *s, int enable) {
+static int event_source_online(
+ sd_event_source *s,
+ int enabled,
+ bool ratelimited) {
+
+ bool was_online;
int r;
assert(s);
- assert(IN_SET(enable, SD_EVENT_ON, SD_EVENT_ONESHOT));
- assert(s->enabled == SD_EVENT_OFF);
+ assert(enabled != SD_EVENT_OFF || !ratelimited);
/* Unset the pending flag when this event source is enabled */
- if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ if (s->enabled == SD_EVENT_OFF &&
+ enabled != SD_EVENT_OFF &&
+ !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
r = source_set_pending(s, false);
if (r < 0)
return r;
}
+ /* Are we really ready for onlining? */
+ if (enabled == SD_EVENT_OFF || ratelimited) {
+ /* Nope, we are not ready for onlining, then just update the precise state and exit */
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
+ return 0;
+ }
+
+ was_online = event_source_is_online(s);
+
switch (s->type) {
case SOURCE_IO:
- r = source_io_register(s, enable, s->io.events);
+ r = source_io_register(s, enabled, s->io.events);
if (r < 0)
return r;
break;
@@ -1698,13 +1797,13 @@ static int event_source_enable(sd_event_source *s, int enable) {
r = event_make_signal_data(s->event, SIGCHLD, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
- s->event->n_enabled_child_sources--;
+ s->event->n_online_child_sources--;
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
return r;
}
- s->event->n_enabled_child_sources++;
-
+ if (!was_online)
+ s->event->n_online_child_sources++;
break;
case SOURCE_TIME_REALTIME:
@@ -1721,7 +1820,8 @@ static int event_source_enable(sd_event_source *s, int enable) {
assert_not_reached("Wut? I shouldn't exist.");
}
- s->enabled = enable;
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
/* Non-failing operations below */
switch (s->type) {
@@ -1741,7 +1841,7 @@ static int event_source_enable(sd_event_source *s, int enable) {
break;
}
- return 0;
+ return 1;
}
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
@@ -1759,7 +1859,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
return 0;
if (m == SD_EVENT_OFF)
- r = event_source_disable(s);
+ r = event_source_offline(s, m, s->ratelimited);
else {
if (s->enabled != SD_EVENT_OFF) {
/* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
@@ -1768,7 +1868,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
return 0;
}
- r = event_source_enable(s, m);
+ r = event_source_online(s, m, s->ratelimited);
}
if (r < 0)
return r;
@@ -1900,6 +2000,96 @@ _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata)
return ret;
}
+static int event_source_enter_ratelimited(sd_event_source *s) {
+ int r;
+
+ assert(s);
+
+ /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
+ * the end of the rate limit time window, much as if it was a timer event source. */
+
+ if (s->ratelimited)
+ return 0; /* Already ratelimited, this is a NOP hence */
+
+ /* Make sure we can install a CLOCK_MONOTONIC event further down. */
+ r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
+ if (r < 0)
+ return r;
+
+ /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
+ * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
+ * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
+
+ /* Now, let's add the event source to the monotonic clock instead */
+ r = event_source_time_prioq_put(s, &s->event->monotonic);
+ if (r < 0)
+ goto fail;
+
+ /* And let's take the event source officially offline */
+ r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
+ if (r < 0) {
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+ goto fail;
+ }
+
+ event_source_pp_prioq_reshuffle(s);
+
+ log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
+ return 0;
+
+fail:
+ /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
+ * space for it should already be allocated. */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
+
+ return r;
+}
+
+static int event_source_leave_ratelimit(sd_event_source *s) {
+ int r;
+
+ assert(s);
+
+ if (!s->ratelimited)
+ return 0;
+
+ /* Let's take the event source out of the monotonic prioq first. */
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+
+ /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
+ if (EVENT_SOURCE_IS_TIME(s->type)) {
+ r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
+ if (r < 0)
+ goto fail;
+ }
+
+ /* Let's try to take it online again. */
+ r = event_source_online(s, s->enabled, /* ratelimited= */ false);
+ if (r < 0) {
+ /* Do something roughly sensible when this failed: undo the two prioq ops above */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
+
+ goto fail;
+ }
+
+ event_source_pp_prioq_reshuffle(s);
+ ratelimit_reset(&s->rate_limit);
+
+ log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
+ return 0;
+
+fail:
+ /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
+ * simply put it back in it, maybe we can then process it more successfully next iteration. */
+ assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
+
+ return r;
+}
+
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
usec_t c;
assert(e);
@@ -1998,7 +2188,7 @@ static int event_arm_timer(
d->needs_rearm = false;
a = prioq_peek(d->earliest);
- if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
+ if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
if (d->fd < 0)
return 0;
@@ -2018,7 +2208,7 @@ static int event_arm_timer(
b = prioq_peek(d->latest);
assert_se(b && b->enabled != SD_EVENT_OFF);
- t = sleep_between(e, a->time.next, time_event_source_latest(b));
+ t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
if (d->next == t)
return 0;
@@ -2097,10 +2287,22 @@ static int process_timer(
for (;;) {
s = prioq_peek(d->earliest);
- if (!s ||
- s->time.next > n ||
- s->enabled == SD_EVENT_OFF ||
- s->pending)
+ if (!s || time_event_source_next(s) > n)
+ break;
+
+ if (s->ratelimited) {
+ /* This is an event source whose ratelimit window has ended. Let's turn it on
+ * again. */
+ assert(s->ratelimited);
+
+ r = event_source_leave_ratelimit(s);
+ if (r < 0)
+ return r;
+
+ continue;
+ }
+
+ if (s->enabled == SD_EVENT_OFF || s->pending)
break;
r = source_set_pending(s, true);
@@ -2146,7 +2348,7 @@ static int process_child(sd_event *e) {
if (s->pending)
continue;
- if (s->enabled == SD_EVENT_OFF)
+ if (event_source_is_offline(s))
continue;
zero(s->child.siginfo);
@@ -2242,11 +2444,26 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
}
static int source_dispatch(sd_event_source *s) {
+ _cleanup_(sd_event_unrefp) sd_event *saved_event = NULL;
int r = 0;
assert(s);
assert(s->pending || s->type == SOURCE_EXIT);
+ /* Similar, store a reference to the event loop object, so that we can still access it after the
+ * callback might have invalidated/disconnected the event source. */
+ saved_event = sd_event_ref(s->event);
+
+ /* Check if we hit the ratelimit for this event source, if so, let's disable it. */
+ assert(!s->ratelimited);
+ if (!ratelimit_below(&s->rate_limit)) {
+ r = event_source_enter_ratelimited(s);
+ if (r < 0)
+ return r;
+
+ return 1;
+ }
+
if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
r = source_set_pending(s, false);
if (r < 0)
@@ -2356,7 +2573,7 @@ static int event_prepare(sd_event *e) {
sd_event_source *s;
s = prioq_peek(e->prepare);
- if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
+ if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
break;
s->prepare_iteration = e->iteration;
@@ -2393,7 +2610,7 @@ static int dispatch_exit(sd_event *e) {
assert(e);
p = prioq_peek(e->exit);
- if (!p || p->enabled == SD_EVENT_OFF) {
+ if (!p || event_source_is_offline(p)) {
e->state = SD_EVENT_FINISHED;
return 0;
}
@@ -2419,7 +2636,7 @@ static sd_event_source* event_next_pending(sd_event *e) {
if (!p)
return NULL;
- if (p->enabled == SD_EVENT_OFF)
+ if (event_source_is_offline(p))
return NULL;
return p;
@@ -2879,3 +3096,53 @@ _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
*ret = e->iteration;
return 0;
}
+
+_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
+ int r;
+
+ assert_return(s, -EINVAL);
+
+ /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
+ * so is a programming error. */
+ assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
+
+ /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
+ * non-ratelimited. */
+ r = event_source_leave_ratelimit(s);
+ if (r < 0)
+ return r;
+
+ RATELIMIT_INIT(s->rate_limit, interval, burst);
+ return 0;
+}
+
+_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
+ assert_return(s, -EINVAL);
+
+ /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
+ * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error */
+ if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
+ return -EDOM;
+
+ if (!ratelimit_configured(&s->rate_limit))
+ return -ENOEXEC;
+
+ if (ret_interval)
+ *ret_interval = s->rate_limit.interval;
+ if (ret_burst)
+ *ret_burst = s->rate_limit.burst;
+
+ return 0;
+}
+
+_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+
+ if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
+ return false;
+
+ if (!ratelimit_configured(&s->rate_limit))
+ return false;
+
+ return s->ratelimited;
+}
diff --git a/src/shared/ratelimit.h b/src/shared/ratelimit.h
index 58efca7..434089e 100644
--- a/src/shared/ratelimit.h
+++ b/src/shared/ratelimit.h
@@ -55,3 +55,11 @@ typedef struct RateLimit {
} while (false)
bool ratelimit_test(RateLimit *r);
+
+static inline void ratelimit_reset(RateLimit *rl) {
+ rl->num = rl->begin = 0;
+}
+
+static inline bool ratelimit_configured(RateLimit *rl) {
+ return rl->interval > 0 && rl->burst > 0;
+}
diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h
index ffde7c8..f297c6a 100644
--- a/src/systemd/sd-event.h
+++ b/src/systemd/sd-event.h
@@ -130,6 +130,9 @@ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec);
int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock);
int sd_event_source_get_signal(sd_event_source *s);
int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid);
+int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval_usec, unsigned burst);
+int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_usec, unsigned *ret_burst);
+int sd_event_source_is_ratelimited(sd_event_source *s);
_SD_END_DECLARATIONS;
--
2.17.1

View File

@ -0,0 +1,37 @@
From dc3e079395816ce251c4794992f1816a61c1215d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Sekleta=CC=81r?= <msekleta@redhat.com>
Date: Thu, 9 Jul 2020 18:16:44 +0200
Subject: [PATCH 20/20] core: prevent excessive /proc/self/mountinfo parsing
(cherry picked from commit d586f642fd90e3bb378f7b6d3e3a64a753e51756)
Resolves: #1819868
[commit 51737206afaa10d902c86ec9b5ec97cf425039c2 from
https://github.com/systemd-rhel/rhel-8/]
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/core/mount.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/core/mount.c b/src/core/mount.c
index c7aed23..48427b7 100644
--- a/src/core/mount.c
+++ b/src/core/mount.c
@@ -1692,6 +1692,12 @@ static int mount_enumerate(Manager *m) {
r = sd_event_source_set_priority(m->mount_utab_event_source, -10);
if (r < 0)
goto fail;
+
+ r = sd_event_source_set_ratelimit(m->mount_event_source, 1 * USEC_PER_SEC, 5);
+ if (r < 0) {
+ log_error_errno(r, "Failed to enable rate limit for mount events: %m");
+ goto fail;
+ }
}
r = mount_load_proc_self_mountinfo(m, false);
--
2.17.1

View File

@ -0,0 +1,64 @@
From 15ac2f7ffd502cdc6f4ba47d0dd70fc39c48d8d7 Mon Sep 17 00:00:00 2001
From: Li Zhou <li.zhou@windriver.com>
Date: Wed, 31 Mar 2021 16:08:18 +0800
Subject: [PATCH 21/21] systemd: Fix compiling errors when merging #1819868
A series of patches are merged in for the issue:
https://bugzilla.redhat.com/show_bug.cgi?id=1819868
This commit is for fixing the compiling errors caused by context
conflict.
Signed-off-by: Li Zhou <li.zhou@windriver.com>
---
src/libsystemd/sd-event/sd-event.c | 25 ++++++++++++++++++++++++-
1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 9dc1a27..282b38f 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -37,9 +37,32 @@
#include "list.h"
#include "sd-event.h"
+#include "event-util.h"
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
+#define CMP(a, b) __CMP(UNIQ, (a), UNIQ, (b))
+#define __CMP(aq, a, bq, b) \
+ ({ \
+ const typeof(a) UNIQ_T(A, aq) = (a); \
+ const typeof(b) UNIQ_T(B, bq) = (b); \
+ UNIQ_T(A, aq) < UNIQ_T(B, bq) ? -1 : \
+ UNIQ_T(A, aq) > UNIQ_T(B, bq) ? 1 : 0; \
+ })
+
+static inline usec_t usec_add(usec_t a, usec_t b) {
+ usec_t c;
+
+ /* Adds two time values, and makes sure USEC_INFINITY as input results as USEC_INFINITY in output, and doesn't
+ * overflow. */
+
+ c = a + b;
+ if (c < a || c < b) /* overflow check */
+ return USEC_INFINITY;
+
+ return c;
+}
+
typedef enum EventSourceType {
SOURCE_IO,
SOURCE_TIME_REALTIME,
@@ -2456,7 +2479,7 @@ static int source_dispatch(sd_event_source *s) {
/* Check if we hit the ratelimit for this event source, if so, let's disable it. */
assert(!s->ratelimited);
- if (!ratelimit_below(&s->rate_limit)) {
+ if (!ratelimit_test(&s->rate_limit)) {
r = event_source_enter_ratelimited(s);
if (r < 0)
return r;
--
2.17.1

View File

@ -129,11 +129,6 @@ sudo
# config files
# net-snmp
net-snmp-utils
net-snmp-libs
#net-snmp-python
# openldap
openldap
openldap-servers
@ -174,6 +169,8 @@ k8s-pod-recovery
# resource-agents
resource-agents
# isolcpus device plugin for K8s
isolcpus-device-plugin
# haproxy
haproxy
@ -212,3 +209,6 @@ kvm-timer-advance
# botocore is an unspecified requirement of boto3
python3-botocore
python3-boto3
# Pf bbdev configuration tool for ACC100 (Mt. Bryce)
pf-bb-config

View File

@ -17,7 +17,6 @@ security/shim-signed
base/sudo
virt/cloud-init
#base/watchdog
base/net-snmp
ldap/openldap
networking/openvswitch
#base/libevent
@ -62,6 +61,7 @@ kubernetes/chartmuseum
kubernetes/armada-helm-toolkit
kubernetes/armada
kubernetes/k8s-pod-recovery
kubernetes/plugins/isolcpus-device-plugin
grub/grubby
base/dpkg
base/cluster-resource-agents
@ -85,3 +85,6 @@ python/python-mechanize
python/python-html5lib
#python/python-webencodings
#python/python-daemon
base/inih
base/pf-bb-config
gpu/gpu-operator

View File

@ -1,4 +1,4 @@
armada-6cc6346cde888c683fec4df910ebefdf6dccb310.tar.gz#armada#https://github.com/airshipit/armada/tarball/6cc6346cde888c683fec4df910ebefdf6dccb310#http##
armada-7ef4b8643b5ec5216a8f6726841e156c0aa54a1a.tar.gz#armada#https://github.com/airshipit/armada/tarball/7ef4b8643b5ec5216a8f6726841e156c0aa54a1a#http##
blkin-f24ceec055ea236a093988237a9821d145f5f7c8.tar.gz#blkin#https://api.github.com/repos/ceph/blkin/tarball/f24ceec055ea236a093988237a9821d145f5f7c8#https##
boost_1_67_0.tar.bz2#boost_1_67_0#https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.bz2#https##
ceph-erasure-code-corpus-2d7d78b9cc52e8a9529d8cc2d2954c7d375d5dd7.tar.gz#ceph-erasure-code-corpus#https://api.github.com/repos/ceph/ceph-erasure-code-corpus/tarball/2d7d78b9cc52e8a9529d8cc2d2954c7d375d5dd7#https##
@ -69,3 +69,6 @@ trident-installer-20.04.0.tar.gz#trident-installer-20.04.0#https://github.com/Ne
!tss2-930.tar.gz#tss2-930#https://git.code.sf.net/p/ibmtpm20tss/tss#git#v930#
xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github.com/repos/ceph/xxHash/tarball/1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9#https##
zstd-f4340f46b2387bc8de7d5320c0b83bb1499933ad.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/f4340f46b2387bc8de7d5320c0b83bb1499933ad#https##
inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
pf-bb-config-945712e8876be2003f2f31de70353c48501519fa.tar.gz#pf-bb-config-21.3#https://github.com/intel/pf-bb-config/tarball/945712e8876be2003f2f31de70353c48501519fa#https##
gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##

View File

@ -10,6 +10,27 @@ script=$(basename $0)
# Set nullglob so wildcards will return empty string if no match
shopt -s nullglob
# Ask systemd for the overall system state, but give up after 10 seconds so
# that a hung systemctl cannot stall this script.
state=$(timeout 10 systemctl is-system-running)
case $? in
    124)
        # If systemctl hangs, proceed with unmounting RBD devices to prevent
        # shutdown hang. This maintains any existing edge-case behavior
        logger -t "${script}" "systemctl timed out. System state unknown."
        ;;
    [01])
        # 0 - running; 1 - initializing, starting, degraded, maintenance, stopping
        logger -t "${script}" "System is $state"
        if [ "$state" != "stopping" ]; then
            # Only unmount while the system is actually going down.
            logger -t "${script}" "System is not shutting down. Leaving RBD devices mounted"
            exit 0
        fi
        ;;
    # Any other exit status deliberately falls through to the unmount logic below.
esac
logger -t "${script}" "Unmounting RBD devices"
# Unmount the RBD devices as the system is shutting down.
for dev in /dev/rbd[0-9]*; do
for mnt in $(mount | awk -v dev=$dev '($1 == dev) {print $3}'); do
logger -t ${script} "Unmounting $mnt"
@ -27,4 +48,3 @@ lsmod | grep -q '^rbd\>' && /usr/sbin/modprobe -r rbd
lsmod | grep -q '^libceph\>' && /usr/sbin/modprobe -r libceph
exit 0

View File

@ -48,6 +48,17 @@ stop ()
RC=$?
}
# If the system is an AIO (All-in-one), the mtcClient runs this script twice,
# from 2 different locations, which generates some errors.
# To make sure it is executed only once, exit early when the script is
# called from /etc/services.d/worker.
# NOTE(review): realpath resolves symlinks, so this check only matches when the
# copy under /etc/services.d/worker is a real file rather than a symlink - confirm.
if [[ "$system_type" == "All-in-one" ]]; then
    # Quote "$0" so a script path containing whitespace is not word-split.
    dir_path=$(dirname "$(realpath "$0")")
    if [[ "$dir_path" == "/etc/services.d/worker" ]]; then
        exit 0
    fi
fi
RC=0
case "$1" in

View File

@ -0,0 +1,24 @@
From c14a96cb55a3bcd20f772c25f2294eb3a1d376b9 Mon Sep 17 00:00:00 2001
From: Mihnea Saracin <mihnea.saracin@windriver.com>
Date: Fri, 21 May 2021 13:34:41 -0400
Subject: [PATCH 5/5] Wipe 10MB after we lvextend the partitions
---
.../puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb | 1 +
1 file changed, 1 insertion(+)
diff --git a/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb b/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb
index f9b1c66..a604e96 100755
--- a/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb
+++ b/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb
@@ -214,6 +214,7 @@ Puppet::Type.type(:logical_volume).provide :lvm do
end
end
lvextend( '-L', "#{new_size}k", path) || fail( "Cannot extend to size #{new_size} because lvextend failed." )
+ exec_cmd("seek_end=$(($(blockdev --getsz #{path})/2048 - 10)); dd if=/dev/zero of=#{path} bs=1M seek=${seek_end} count=10")
exec_cmd('umount', path)
exec_cmd('fsadm', '-y', 'check', path )
r = exec_cmd('fsadm', '-y', 'resize', path, "#{new_size}k")
--
2.29.2

View File

@ -17,6 +17,7 @@ Patch1: 0002-UEFI-pvcreate-fix.patch
Patch2: 0003-US94222-Persistent-Dev-Naming.patch
Patch3: 0004-extendind-nuke_fs_on_resize_failure-functionality.patch
Patch4: Fix-the-logical-statement-for-nuke_fs_on_resize.patch
Patch5: 0005-Wipe-10MB-after-we-lvextend-the-partitions.patch
BuildArch: noarch
@ -36,6 +37,7 @@ A Puppet module for Logical Resource Management (LVM)
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%install
install -d -m 0755 %{buildroot}/%{_datadir}/puppet/modules/%{module_dir}

View File

@ -276,6 +276,7 @@ management utility.
%patch0008 -p1
%patch0009 -p1
%patch0010 -p1
%patch0011 -p1
%build
%configure \

View File

@ -0,0 +1,28 @@
From 46962e144b850b6a1dca449f0ee623c8e85596d2 Mon Sep 17 00:00:00 2001
From: David Sullivan <david.sullivan@windriver.com>
Date: Tue, 20 Apr 2021 13:22:19 -0500
Subject: [PATCH] Unmount all targets during drbd stop
When stopping drbd, we need to unmount targets from each device. Devices
with multiple mountpoints can fail to unmount, leading to metadata
corruption. Add --all-targets to the umount command.
---
scripts/drbd | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/drbd b/scripts/drbd
index 3c7f8a5..eca4e3b 100755
--- a/scripts/drbd
+++ b/scripts/drbd
@@ -182,7 +182,7 @@ case "$1" in
for d in /dev/drbd* ; do
[ -L "$d" ] && continue
[ -b "$d" ] || continue
- M=$(umount "$d" 2>&1)
+ M=$(umount --all-targets "$d" 2>&1)
case $M in
*" not mounted") :;;
*) echo "$M" >&2 ;;
--
2.24.1

View File

@ -0,0 +1,8 @@
VERSION=1.6.0
TAR_NAME=gpu-operator
TAR="$TAR_NAME-$VERSION.tar.gz"
COPY_LIST=" \
$PKG_BASE/files/* \
$STX_BASE/downloads/$TAR"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -0,0 +1,46 @@
# Build variables
# helm_ver is embedded in the packaged chart version string (see the build section);
# helm_folder is the directory the packaged chart is installed into.
%global helm_ver v3
%global helm_folder /usr/lib/helm
Summary: StarlingX nvidia gpu-operator helm chart
Name: gpu-operator
Version: 1.6.0
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: Apache-2.0
Group: base
Packager: Wind River <info@windriver.com>
URL: https://github.com/NVIDIA/gpu-operator/tree/gh-pages
Source0: %{name}-%{version}.tar.gz
BuildArch: noarch
# Local patches applied on top of the upstream tarball in the prep section.
Patch01: deployments-setup-configmap-with-assets-for-volumemo.patch
Patch02: enablement-support-on-starlingx-cloud-platform.patch
BuildRequires: helm
%define debug_package %{nil}
%description
StarlingX port of NVIDIA gpu-operator
%prep
%setup
%patch01 -p1
%patch02 -p1
%build
# Copy the assets directory into the chart directory so it is part of the
# chart that gets linted and packaged below.
cp -r assets deployments/gpu-operator/assets
helm lint deployments/gpu-operator
mkdir build_results
# Package the chart into build_results; the chart version combines helm_ver,
# the package version and the patch version.
helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version %{version} -d build_results deployments/gpu-operator
%install
# Create the helm folder in the build root, then install the packaged chart into it.
install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder}
install -p -D -m 755 build_results/%{name}-%{helm_ver}-%{version}.%{tis_patch_ver}.tgz ${RPM_BUILD_ROOT}%{helm_folder}
%files
%defattr(-,root,root,-)
%{helm_folder}

View File

@ -0,0 +1,137 @@
From b968c69971a195aba4e0c03e8a70df074c128f69 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sat, 6 Mar 2021 00:22:40 +0000
Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts
This feature allows inclusion of assets/ in the helm chart and their
export to the gpu-operator pod through configmap volumeMounts.
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
.../gpu-operator/templates/operator.yaml | 45 +++++++++++++++++++
.../templates/operator_configmap.yaml | 36 +++++++++++++++
deployments/gpu-operator/values.yaml | 2 +
3 files changed, 83 insertions(+)
create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
index 50983b20..1dfd9dbc 100644
--- a/deployments/gpu-operator/templates/operator.yaml
+++ b/deployments/gpu-operator/templates/operator.yaml
@@ -50,6 +50,45 @@ spec:
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
+
+ {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
+ subPath: {{ printf "gfd_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
+ subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
+ subPath: {{ printf "state_device_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
+ subPath: {{ printf "state_device_validation_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
+ subPath: {{ printf "state_driver_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
+ subPath: {{ printf "state_monitor_%s" (base $path) }}
+ {{- end }}
+ {{- end }}
+
readinessProbe:
exec:
command: ["stat", "/tmp/operator-sdk-ready"]
@@ -63,6 +102,12 @@ spec:
- name: host-os-release
hostPath:
path: "/etc/os-release"
+ {{- if eq .Values.operator.include_assets "include_assets" }}
+ - name: assets
+ configMap:
+ name: operator-configmap
+ {{- end }}
+
{{- with .Values.operator.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
new file mode 100644
index 00000000..61f366e8
--- /dev/null
+++ b/deployments/gpu-operator/templates/operator_configmap.yaml
@@ -0,0 +1,36 @@
+{{- if eq .Values.operator.include_assets "include_assets" }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: operator-configmap
+data:
+{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+{{- end }}
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 00d94195..8b43c59f 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -39,6 +39,8 @@ operator:
values: [""]
logging:
timeEncoding: epoch
+ # Set to "include_assets" to include assets/gpu-operator with the helm chart
+ include_assets: ""
driver:
repository: nvcr.io/nvidia
--
2.17.1

View File

@ -0,0 +1,540 @@
From 74c08e4ce69b80e8c5687d01c6bd1a4752233e20 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sun, 7 Mar 2021 17:19:08 +0000
Subject: [PATCH 2/2] enablement: support on starlingx cloud platform
StarlingX is a cloud infrastructure software stack for edge.
It has an immutable file system and system configuration. For
instance, changes made by the gpu-operator to set the containerd runtime
will be overridden and must be avoided. The default_runtime is
therefore to remain docker.
This commit enables gpu-operator on Starlingx (starlingx.io).
The changes to the gpu-operator include bundling modified assets
and a modified version of the nvidia-driver with the helm charts.
The modifications to the assets include setting the runtimeClassName
on the gpu-operator pods that require nvidia-container-runtime and
host-mounting the kernel headers and build directory. The changes to
the nvidia-driver account for pre-installed kernel packages.
To load the operator on starlingx, define a runtimeclass with name
and handler set to nvidia; thereafter:
$ source /etc/platform/openrc
[...(keystone_admin)]$ system service-parameter-add \
platform container_runtime \
custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
.../gpu-feature-discovery/0500_daemonset.yaml | 1 +
.../cuda-vector-add.yaml | 1 +
.../0400_device_plugin.yml | 1 +
assets/state-driver/0400_configmap.yaml | 327 +++++++++++++++++-
assets/state-driver/0500_daemonset.yaml | 39 ++-
assets/state-monitoring/0900_daemonset.yaml | 1 +
deployments/gpu-operator/values.yaml | 8 +-
7 files changed, 373 insertions(+), 5 deletions(-)
diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml
index 9785dc93..1589e710 100644
--- a/assets/gpu-feature-discovery/0500_daemonset.yaml
+++ b/assets/gpu-feature-discovery/0500_daemonset.yaml
@@ -18,6 +18,7 @@ spec:
app.kubernetes.io/part-of: nvidia-gpu
spec:
serviceAccount: nvidia-gpu-feature-discovery
+ runtimeClassName: nvidia
containers:
- image: "FILLED BY THE OPERATOR"
name: gpu-feature-discovery
diff --git a/assets/state-device-plugin-validation/cuda-vector-add.yaml b/assets/state-device-plugin-validation/cuda-vector-add.yaml
index cfb547ad..8269adeb 100644
--- a/assets/state-device-plugin-validation/cuda-vector-add.yaml
+++ b/assets/state-device-plugin-validation/cuda-vector-add.yaml
@@ -12,6 +12,7 @@ spec:
effect: NoSchedule
readOnlyRootFilesystem: true
restartPolicy: OnFailure
+ runtimeClassName: nvidia
initContainers:
- name: device-plugin-validation-init
image: "FILLED BY THE OPERATOR"
diff --git a/assets/state-device-plugin/0400_device_plugin.yml b/assets/state-device-plugin/0400_device_plugin.yml
index a5cf7fae..84e9c534 100644
--- a/assets/state-device-plugin/0400_device_plugin.yml
+++ b/assets/state-device-plugin/0400_device_plugin.yml
@@ -30,6 +30,7 @@ spec:
operator: Exists
effect: NoSchedule
serviceAccount: nvidia-device-plugin
+ runtimeClassName: nvidia
initContainers:
- name: toolkit-validation
image: "FILLED BY THE OPERATOR"
diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml
index 48e9f51e..561adc9f 100644
--- a/assets/state-driver/0400_configmap.yaml
+++ b/assets/state-driver/0400_configmap.yaml
@@ -4,7 +4,7 @@ metadata:
name: nvidia-driver
namespace: gpu-operator-resources
data:
- oci-nvidia-hook-json: |
+ oci-nvidia-hook-json: |
{
"version": "1.0.0",
"hook": {
@@ -20,3 +20,328 @@ data:
},
"stages": ["prestart"]
}
+ nvidia-driver-build-script: |
+ #! /bin/bash
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
+ # Apache-2.0.
+ # This script is from: https://gitlab.com/nvidia/container-images/driver.
+ # It is modified and included under configmap for platforms that require
+ # pre-installed packages. Such platforms have the option to modify the
+ # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
+ # further customizations.
+
+ set -eu
+
+ RUN_DIR=/run/nvidia
+ PID_FILE=${RUN_DIR}/${0##*/}.pid
+ DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
+ KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
+ KERNEL_VERSION="$(uname -r)"
+
+ # Default to 0 ; 1 is experimental and not supported
+ export IGNORE_PREEMPT_RT_PRESENCE=0
+
+ # Check if the kernel version requires a new precompiled driver packages.
+ _kernel_requires_package() {
+ local proc_mount_arg=""
+
+ echo "Checking NVIDIA driver packages..."
+ cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
+
+ # When the kernel version is latest on host, this check fails and lead to recompilation, even when precompiled modules exist.
+ #if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then
+ #Not needed with pre-installed readonly headers, devel and modules
+ #proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
+ #fi
+ for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
+ is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg})
+ if [ "${is_match}" == "kernel interface matches." ]; then
+ echo "Found NVIDIA driver package ${pkg_name##*/}"
+ return 1
+ fi
+ done
+ return 0
+ }
+
+ # Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer.
+ _create_driver_package() (
+ local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}"
+ local nvidia_sign_args=""
+ local nvidia_modeset_sign_args=""
+ local nvidia_uvm_sign_args=""
+
+ trap "make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
+
+ echo "Compiling NVIDIA driver kernel modules..."
+ cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
+
+ export IGNORE_CC_MISMATCH=1
+ make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null
+
+ echo "Relinking NVIDIA driver kernel modules..."
+ rm -f nvidia.ko nvidia-modeset.ko
+ ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary
+ ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary
+
+ if [ -n "${PRIVATE_KEY}" ]; then
+ echo "Signing NVIDIA driver kernel modules..."
+ donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/kernels/$(uname -r)/scripts && \
+ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \
+ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \
+ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko"
+ nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign"
+ nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign"
+ nvidia_uvm_sign_args="--signed"
+ fi
+
+ echo "Building NVIDIA driver package ${pkg_name}..."
+ ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \
+ --driver-version ${DRIVER_VERSION} \
+ --kernel-interface nv-linux.o \
+ --linked-module-name nvidia.ko \
+ --core-object-name nvidia/nv-kernel.o_binary \
+ ${nvidia_sign_args} \
+ --target-directory . \
+ --kernel-interface nv-modeset-linux.o \
+ --linked-module-name nvidia-modeset.ko \
+ --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \
+ ${nvidia_modeset_sign_args} \
+ --target-directory . \
+ --kernel-module nvidia-uvm.ko \
+ ${nvidia_uvm_sign_args} \
+ --target-directory .
+ mkdir -p precompiled
+ mv ${pkg_name} precompiled
+ )
+
+ # Load the kernel modules and start persistenced.
+ _load_driver() {
+ echo "Loading IPMI kernel module..."
+ modprobe ipmi_msghandler
+
+ echo "Loading NVIDIA driver kernel modules..."
+ modprobe -a nvidia nvidia-uvm nvidia-modeset
+
+ echo "Starting NVIDIA persistence daemon..."
+ nvidia-persistenced --persistence-mode
+ }
+
+ # Stop persistenced and unload the kernel modules if they are currently loaded. Returns 1 if the daemon does not exit or a module is still in use, 0 otherwise.
+ _unload_driver() {
+ local rmmod_args=()
+ local nvidia_deps=0
+ local nvidia_refs=0
+ local nvidia_uvm_refs=0
+ local nvidia_modeset_refs=0
+
+ echo "Stopping NVIDIA persistence daemon..."
+ if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
+ local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
+
+ kill -SIGTERM "${pid}"
+ for i in $(seq 1 10); do
+ kill -0 "${pid}" 2> /dev/null || break
+ sleep 0.1
+ done
+ if kill -0 "${pid}" 2> /dev/null; then # still alive after all 10 polls; checking the loop counter instead would misreport an exit first seen on the last poll
+ echo "Could not stop NVIDIA persistence daemon" >&2
+ return 1
+ fi
+ fi
+
+ echo "Unloading NVIDIA driver kernel modules..."
+ if [ -f /sys/module/nvidia_modeset/refcnt ]; then
+ nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
+ rmmod_args+=("nvidia-modeset")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia_uvm/refcnt ]; then
+ nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
+ rmmod_args+=("nvidia-uvm")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia/refcnt ]; then
+ nvidia_refs=$(< /sys/module/nvidia/refcnt)
+ rmmod_args+=("nvidia")
+ fi
+ if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
+ echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
+ return 1
+ fi
+
+ if [ ${#rmmod_args[@]} -gt 0 ]; then
+ rmmod "${rmmod_args[@]}"
+ fi
+ return 0
+ }
+
+ # Link and install the kernel modules from a precompiled package using the nvidia-installer.
+ _install_driver() {
+ local install_args=()
+
+ echo "Installing NVIDIA driver kernel modules..."
+ cd /usr/src/nvidia-${DRIVER_VERSION}
+ rm -rf /lib/modules/${KERNEL_VERSION}/video
+
+ if [ "${ACCEPT_LICENSE}" = "yes" ]; then
+ install_args+=("--accept-license")
+ fi
+ nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
+ # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
+ # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
+ # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
+ #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
+ }
+
+ # Mount the driver rootfs into the run directory with the exception of sysfs.
+ _mount_rootfs() {
+ echo "Mounting NVIDIA driver rootfs..."
+ mount --make-runbindable /sys
+ mount --make-private /sys
+ mkdir -p ${RUN_DIR}/driver
+ mount --rbind / ${RUN_DIR}/driver
+ }
+
+ # Unmount the driver rootfs from the run directory.
+ _unmount_rootfs() {
+ echo "Unmounting NVIDIA driver rootfs..."
+ if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
+ umount -l -R ${RUN_DIR}/driver
+ fi
+ }
+
+ # Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS).
+ _write_kernel_update_hook() {
+ if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then
+ return
+ fi
+
+ echo "Writing kernel update hook..."
+ cat > ${KERNEL_UPDATE_HOOK} <<'EOF'
+ #!/bin/bash
+
+ set -eu
+ trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR
+
+ NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid)
+
+ export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)"
+ nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1"
+ EOF
+ chmod +x ${KERNEL_UPDATE_HOOK}
+ }
+
+ _shutdown() {
+ if _unload_driver; then
+ _unmount_rootfs
+ rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK}
+ return 0
+ fi
+ return 1
+ }
+
+ init() {
+ echo -e "\n========== NVIDIA Software Installer ==========\n"
+ echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
+
+ exec 3> ${PID_FILE}
+ if ! flock -n 3; then
+ echo "An instance of the NVIDIA driver is already running, aborting"
+ exit 1
+ fi
+ echo $$ >&3
+
+ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+ trap "_shutdown" EXIT
+
+ _unload_driver || exit 1
+ _unmount_rootfs
+
+ if _kernel_requires_package; then
+ _create_driver_package
+ fi
+
+ _install_driver
+ _load_driver
+ _mount_rootfs
+ _write_kernel_update_hook
+
+ echo "Done, now waiting for signal"
+ sleep infinity &
+ trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
+ trap - EXIT
+ while true; do wait $! || continue; done
+ exit 0
+ }
+
+ update() {
+ exec 3>&2
+ if exec 2> /dev/null 4< ${PID_FILE}; then
+ if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then
+ exec > >(tee -a "/proc/${pid}/fd/1")
+ exec 2> >(tee -a "/proc/${pid}/fd/2" >&3)
+ else
+ exec 2>&3
+ fi
+ exec 4>&-
+ fi
+ exec 3>&-
+
+ echo -e "\n========== NVIDIA Software Updater ==========\n"
+ echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
+
+ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+
+ if _kernel_requires_package; then
+ _create_driver_package
+ fi
+
+ echo "Done"
+ exit 0
+ }
+
+ usage() {
+ cat >&2 <<EOF
+ Usage: $0 COMMAND [ARG...]
+
+ Commands:
+ init [-a | --accept-license]
+ update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG]
+ EOF
+ exit 1
+ }
+
+ if [ $# -eq 0 ]; then
+ usage
+ fi
+ command=$1; shift
+ case "${command}" in
+ init) options=$(getopt -l accept-license -o a -- "$@") || usage ;;
+ update) options=$(getopt -l kernel:,sign:,tag: -o k:s:t: -- "$@") || usage ;;
+ *) usage ;;
+ esac
+ # getopt failures are handled inline above: under `set -e` a failing
+ # command substitution would abort the script before a separate `$?`
+ # check could run, so the usage text would never be printed.
+ eval set -- "${options}"
+
+ ACCEPT_LICENSE=""
+ KERNEL_VERSION=$(uname -r)
+ PRIVATE_KEY=""
+ PACKAGE_TAG=""
+
+ while [ $# -gt 0 ]; do # walk the normalized positional parameters, not the word-split getopt string, so option values containing whitespace are not misread as flags
+ case "$1" in
+ -a | --accept-license) ACCEPT_LICENSE="yes"; shift 1 ;;
+ -k | --kernel) KERNEL_VERSION=$2; shift 2 ;;
+ -s | --sign) PRIVATE_KEY=$2; shift 2 ;;
+ -t | --tag) PACKAGE_TAG=$2; shift 2 ;;
+ --) shift; break ;;
+ esac
+ done
+ if [ $# -ne 0 ]; then
+ usage
+ fi
+
+ $command
diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml
index 3a2dc06b..7a1d8a17 100644
--- a/assets/state-driver/0500_daemonset.yaml
+++ b/assets/state-driver/0500_daemonset.yaml
@@ -32,8 +32,19 @@ spec:
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: Always
name: nvidia-driver-ctr
- command: ["nvidia-driver"]
- args: ["init"]
+ command: ["/bin/bash"]
+ args:
+ - "-c"
+ - "--"
+ - >
+ cat /usr/local/bin/nvidia-driver.22 > /usr/local/bin/nvidia-driver &&
+ chmod 755 /usr/local/bin/nvidia-driver &&
+ mkdir -p /usr/src/kernels &&
+ tar -C /usr/src/host-kernels/ -c $(uname -r) -f - | tar -C /usr/src/kernels/ -xf - &&
+ rm -rf /lib/modules/ && mkdir -p /lib/modules/ &&
+ tar -C /lib/host-modules/ -c $(uname -r) -f - | tar -C /lib/modules/ -xf - &&
+ ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so &&
+ /usr/local/bin/nvidia-driver init
securityContext:
privileged: true
seLinuxOptions:
@@ -44,10 +55,23 @@ spec:
mountPropagation: Bidirectional
- name: config
mountPath: /etc/containers/oci/hooks.d
+ subPath: oci-nvidia-hook-json
+ - name: config
+ mountPath: /usr/local/bin/nvidia-driver.22
+ subPath: nvidia-driver-build-script
- name: var-log
mountPath: /var/log
- name: dev-log
mountPath: /dev/log
+ - name: host-modules
+ mountPath: /lib/host-modules
+ readOnly: true
+ - name: host-include
+ mountPath: /usr/include
+ readOnly: true
+ - name: host-kernel-devel
+ mountPath: /usr/src/host-kernels
+ readOnly: true
volumes:
- name: run-nvidia
hostPath:
@@ -58,11 +82,22 @@ spec:
- name: dev-log
hostPath:
path: /dev/log
+ - name: host-modules
+ hostPath:
+ path: /lib/modules
+ - name: host-kernel-devel
+ hostPath:
+ path: /usr/src/kernels/
+ - name: host-include
+ hostPath:
+ path: /usr/include
- name: config
configMap:
name: nvidia-driver
items:
- key: oci-nvidia-hook-json
path: oci-nvidia-hook.json
+ - key: nvidia-driver-build-script
+ path: nvidia-driver-build-script
nodeSelector:
nvidia.com/gpu.present: "true"
diff --git a/assets/state-monitoring/0900_daemonset.yaml b/assets/state-monitoring/0900_daemonset.yaml
index 38c4d63a..aebb4297 100644
--- a/assets/state-monitoring/0900_daemonset.yaml
+++ b/assets/state-monitoring/0900_daemonset.yaml
@@ -31,6 +31,7 @@ spec:
effect: NoSchedule
serviceAccount: nvidia-dcgm-exporter
serviceAccountName: nvidia-dcgm-exporter
+ runtimeClassName: nvidia
initContainers:
- name: toolkit-validation
image: "FILLED BY THE OPERATOR"
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 8b43c59f..17662729 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -15,6 +15,10 @@ operator:
#version: 1.5.2
imagePullPolicy: IfNotPresent
imagePullSecrets: []
+ # We cannot default to containerd because the operator modifies containerd
+ # configuration by adding itself to it, either as the default runtime or a
+ # runtimeclass, and restarts the service thereafter.
+ # defaultRuntime: containerd
defaultRuntime: docker
validator:
image: cuda-sample
@@ -40,7 +44,7 @@ operator:
logging:
timeEncoding: epoch
# Set to "include_assets" to include assets/gpu-operator with the helm chart
- include_assets: ""
+ include_assets: "include_assets"
driver:
repository: nvcr.io/nvidia
@@ -73,7 +77,7 @@ driver:
toolkit:
repository: nvcr.io/nvidia/k8s
image: container-toolkit
- version: 1.4.5-ubuntu18.04
+ version: 1.4.5-ubi8
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
--
2.17.1

View File

@ -1,4 +1,4 @@
%global git_sha 6cc6346cde888c683fec4df910ebefdf6dccb310
%global git_sha 7ef4b8643b5ec5216a8f6726841e156c0aa54a1a
# Build variables
%global helm_folder /usr/lib/helm
@ -16,6 +16,9 @@ URL: https://airship-armada.readthedocs.io/
Source0: %{name}-%{git_sha}.tar.gz
Patch1: 0001-Add-Helm-v2-client-initialization-using-tiller-postS.patch
Patch2: 0002-Tiller-wait-for-postgres-database-ping.patch
Patch3: 0003-Update-the-liveness-probe-to-verify-postgres-connect.patch
Patch4: 0004-Update-postgres-liveness-check-to-support-IPv6-addre.patch
BuildArch: noarch
@ -29,7 +32,9 @@ BuildRequires: chartmuseum
%prep
%setup -n armada
%patch1 -p1
%patch2 -p1
%patch3 -p1
%patch4 -p1
%build
# Package the armada chart tarball using methodology derived from:

View File

@ -1,6 +1,6 @@
TAR_NAME=armada
VERSION=0.2.0
SHA=6cc6346cde888c683fec4df910ebefdf6dccb310
SHA=7ef4b8643b5ec5216a8f6726841e156c0aa54a1a
TAR="$TAR_NAME-$SHA.tar.gz"
COPY_LIST="${CGCS_BASE}/downloads/$TAR $FILES_BASE/*"

View File

@ -1,7 +1,8 @@
From 6e464edeadab3b2631775326fb12e6d6e6eb1e2a Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Mon, 1 Jun 2020 11:36:46 -0400
Subject: [PATCH] Add Helm v2 client initialization using tiller postStart exec
From 8c6cc4c0ad5569d7de3615463f7d8c4dd7429e63 Mon Sep 17 00:00:00 2001
From: Thiago Brito <thiago.brito@windriver.com>
Date: Thu, 22 Apr 2021 20:00:51 -0300
Subject: [PATCH] Add Helm v2 client initialization using tiller
postStart exec
This adds helm v2 client initialization using the tiller
container postStart exec to access helm v2 binary.
@ -28,16 +29,17 @@ starlingx http://192.168.204.1:8080/helm_charts/starlingx
stx-platform http://192.168.204.1:8080/helm_charts/stx-platform
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Thiago Brito <thiago.brito@windriver.com>
---
charts/armada/templates/deployment-api.yaml | 28 ++++++++++++++++++++++++++++
charts/armada/values.yaml | 4 ++++
2 files changed, 32 insertions(+)
charts/armada/templates/deployment-api.yaml | 33 +++++++++++++++++++++
charts/armada/values.yaml | 10 +++++++
2 files changed, 43 insertions(+)
diff --git a/charts/armada/templates/deployment-api.yaml b/charts/armada/templates/deployment-api.yaml
index a48c8b6..ccbdb4d 100644
index 562e3d0..483ec0b 100644
--- a/charts/armada/templates/deployment-api.yaml
+++ b/charts/armada/templates/deployment-api.yaml
@@ -179,6 +179,34 @@ spec:
@@ -186,6 +186,39 @@ spec:
- -trace
{{- end }}
lifecycle:
@ -54,8 +56,14 @@ index a48c8b6..ccbdb4d 100644
+ # Initialize Helm v2 client.
+ export HELM_HOST=:{{ .Values.conf.tiller.port }}
+ /helm init --client-only --skip-refresh
+ /helm repo rm stable
+ /helm repo rm local
+
+ # Moving the ln up so eventual errors on the next commands doesn't prevent
+ # having helm available
+ ln -s -f /helm /tmp/helm
+
+ # Removes all repos available so we don't get an error removing what
+ # doesn't exist anymore or error re-adding an existing repo
+ /helm repo list | awk '(NR>1){print $1}' | xargs --no-run-if-empty /helm repo rm
+{{- if .Values.conf.tiller.repos }}
+ {{- range $name, $repo := .Values.conf.tiller.repos }}
+ /helm repo add {{ $name }} {{ $repo }}
@ -66,17 +74,16 @@ index a48c8b6..ccbdb4d 100644
+ /helm repo add {{ . }} {{ $envAll.Values.conf.tiller.charts_url }}/{{ . }}
+ {{- end }}
+{{- end }}
+ ln -s -f /helm /tmp/helm
+ exit 0
+ EOF
preStop:
exec:
command:
diff --git a/charts/armada/values.yaml b/charts/armada/values.yaml
index 4c1e603..fb3e5c2 100644
index 3a4427e..da45810 100644
--- a/charts/armada/values.yaml
+++ b/charts/armada/values.yaml
@@ -217,6 +217,10 @@ conf:
@@ -220,6 +220,10 @@ conf:
# Note: Defaulting to the (default) kubernetes grace period, as anything
# greater than that will have no effect.
prestop_sleep: 30
@ -87,6 +94,21 @@ index 4c1e603..fb3e5c2 100644
monitoring:
prometheus:
@@ -325,7 +329,13 @@ pod:
volumes:
- name: kubernetes-client-cache
emptyDir: {}
+ - name: tiller-tmp
+ emptyDir: {}
volumeMounts:
+ - name: tiller-tmp
+ # /tmp is now readOnly due to the security_context on L288, so
+ # mounting an emptyDir
+ mountPath: /tmp
- name: kubernetes-client-cache
# Should be the `$HOME/.kube` of the `runAsUser` above
# as this is where tiller's kubernetes client roots its cache dir.
--
1.8.3.1
2.17.1

View File

@ -0,0 +1,66 @@
From 96e49fcc6d6b988d03a61261511abf64a0af2e2a Mon Sep 17 00:00:00 2001
From: Dan Voiculeasa <dan.voiculeasa@windriver.com>
Date: Tue, 11 May 2021 21:04:18 +0300
Subject: [PATCH] Tiller wait for postgres database ping
Networking might not be correctly initialized when tiller starts.
Modify the pod command to wait for networking to be available before
starting up tiller.
Signed-off-by: Dan Voiculeasa <dan.voiculeasa@windriver.com>
---
charts/armada/templates/deployment-api.yaml | 31 +++++++++++++--------
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/charts/armada/templates/deployment-api.yaml b/charts/armada/templates/deployment-api.yaml
index 69036c0..3816366 100644
--- a/charts/armada/templates/deployment-api.yaml
+++ b/charts/armada/templates/deployment-api.yaml
@@ -167,24 +167,31 @@ spec:
- name: TILLER_HISTORY_MAX
value: {{ .Values.conf.tiller.history_max | quote }}
command:
- - /tiller
+ - sh
+ - -c
+ - |
+ /bin/sh <<'EOF'
{{- if .Values.conf.tiller.storage }}
- - --storage={{ .Values.conf.tiller.storage }}
{{- if and (eq .Values.conf.tiller.storage "sql") (.Values.conf.tiller.sql_dialect) (.Values.conf.tiller.sql_connection) }}
- - --sql-dialect={{ .Values.conf.tiller.sql_dialect }}
- - --sql-connection-string={{ .Values.conf.tiller.sql_connection }}
+ while ! /bin/busybox nc -vz -w 1 {{ .Values.conf.tiller.sql_endpoint_ip}} 5432; do continue; done;
{{- end }}
{{- end }}
- - -listen
- - ":{{ .Values.conf.tiller.port }}"
- - -probe-listen
- - ":{{ .Values.conf.tiller.probe_port }}"
- - -logtostderr
- - -v
- - {{ .Values.conf.tiller.verbosity | quote }}
+ /tiller \
+{{- if .Values.conf.tiller.storage }}
+ --storage={{ .Values.conf.tiller.storage }} \
+{{- if and (eq .Values.conf.tiller.storage "sql") (.Values.conf.tiller.sql_dialect) (.Values.conf.tiller.sql_connection) }}
+ --sql-dialect={{ .Values.conf.tiller.sql_dialect }} \
+ --sql-connection-string={{ .Values.conf.tiller.sql_connection }} \
+{{- end }}
+{{- end }}
+ -listen ":{{ .Values.conf.tiller.port }}" \
+ -probe-listen ":{{ .Values.conf.tiller.probe_port }}" \
+ -logtostderr \
+ -v {{ .Values.conf.tiller.verbosity | quote }} \
{{- if .Values.conf.tiller.trace }}
- - -trace
+ -trace
{{- end }}
+ EOF
lifecycle:
postStart:
exec:
--
2.30.0

View File

@ -0,0 +1,45 @@
From be3167e5342f2730ef43012d8fe4f3782c6ef468 Mon Sep 17 00:00:00 2001
From: Robert Church <robert.church@windriver.com>
Date: Wed, 12 May 2021 02:38:52 -0400
Subject: [PATCH 3/3] Update the liveness probe to verify postgres connectivity
Change the tillerLivenessProbeTemplate to test the connectivity to the
postgres backend. We will override the periodSeconds and
failureThreshold when installing the helm chart to trigger a restart of
the tiller pod over a swact when the postgres DB/server moves from one
controller to the other.
This will help guarantee that the tiller connection is always
reestablished if the connectivity to the postgres backend fails.
Signed-off-by: Robert Church <robert.church@windriver.com>
---
charts/armada/templates/deployment-api.yaml | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/charts/armada/templates/deployment-api.yaml b/charts/armada/templates/deployment-api.yaml
index bf23fb2..2b65494 100644
--- a/charts/armada/templates/deployment-api.yaml
+++ b/charts/armada/templates/deployment-api.yaml
@@ -28,10 +28,14 @@ httpGet:
{{- end }}
{{- define "tillerLivenessProbeTemplate" }}
-httpGet:
- path: /liveness
- port: {{ .Values.conf.tiller.probe_port }}
- scheme: HTTP
+exec:
+ command:
+ - nc
+ - -vz
+ - -w
+ - "1"
+ - {{ .Values.conf.tiller.sql_endpoint_ip}}
+ - "5432"
{{- end }}
{{- if .Values.manifests.deployment_api }}
--
2.16.6

View File

@ -0,0 +1,30 @@
From e13416638b103fde04feb31027c3148c9685cf7f Mon Sep 17 00:00:00 2001
From: Robert Church <robert.church@windriver.com>
Date: Sat, 15 May 2021 16:16:41 -0400
Subject: [PATCH 4/4] Update postgres liveness check to support IPv6 addresses
Templating will add square brackets for IPv6 addresses which are
interpreted as an array vs. a string. Quote this so that it is interpreted
correctly.
Signed-off-by: Robert Church <robert.church@windriver.com>
---
charts/armada/templates/deployment-api.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/charts/armada/templates/deployment-api.yaml b/charts/armada/templates/deployment-api.yaml
index 2b65494..5c4825c 100644
--- a/charts/armada/templates/deployment-api.yaml
+++ b/charts/armada/templates/deployment-api.yaml
@@ -34,7 +34,7 @@ exec:
- -vz
- -w
- "1"
- - {{ .Values.conf.tiller.sql_endpoint_ip}}
+ - "{{ .Values.conf.tiller.sql_endpoint_ip }}"
- "5432"
{{- end }}
--
2.16.6

View File

@ -1,5 +1,5 @@
BUILDER=docker
LABEL=k8s-cni-sriov
DOCKER_REPO=https://github.com/intel/sriov-cni.git
DOCKER_REF=365c8f8cc1204df84f3e976ea30f113e733ca665
DOCKER_REF=b18123d809f1c010cae19018e3395ed5981e76f7

View File

@ -21,6 +21,7 @@
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
export KUBECONFIG=/etc/kubernetes/admin.conf
CONF_DIR=/etc/k8s-post-recovery.d
SLEEP_DELAY_SEC=15
NAME=$(basename $0)
@ -74,23 +75,24 @@ function _wait_for_systemd {
}
function _wait_for_pod_stabilization {
local extra_args=$1
local time_between_polls=$2
local stable_cycles=$3
last_count=0
stability_count=0
NINETY_SEC_COUNT=$((90/SLEEP_DELAY_SEC))
while true ; do
pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces | grep -v -e Running -e Completed | wc -l)
while [[ $stability_count -lt $stable_cycles ]] ; do
pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces $extra_args | grep -v -e Running -e Completed | wc -l)
if [[ $pods_in_flux -ne $last_count ]]; then
LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
last_count=$pods_in_flux
stability_count=0
else
LOG "Pods transitions are stable... for $((stability_count*${SLEEP_DELAY_SEC})) seconds."
if [[ $stability_count -eq $NINETY_SEC_COUNT ]]; then
break
fi
LOG "Pods transitions are stable... for $((stability_count*time_between_polls)) seconds."
stability_count=$((stability_count+1))
fi
sleep ${SLEEP_DELAY_SEC}
sleep $time_between_polls
done
}
@ -98,7 +100,13 @@ function _unknown_pods {
# $1: actions <recover|verify>
# Target specific namespaces and pods on this host
SUPPORTED_NAMESPACES=('openstack' 'monitor')
SUPPORTED_NAMESPACES=('armada' 'openstack' 'monitor')
shopt -s nullglob
for conf_file in ${CONF_DIR}/*.conf; do
grep -q '^namespace=' $conf_file || continue
SUPPORTED_NAMESPACES+=($(grep '^namespace=' $conf_file | awk -F '=' '{print $2}'))
done
if [ "$1" == 'recover' ]; then
# Recovers pods that are: Running/Unknown and Pending/Init:Unknown
@ -148,15 +156,19 @@ function _node_affinity_pods {
function _labeled_pods {
# $1: actions <recover|verify>
if [ "$1" == 'recover' ]; then
POLLING_INTERVAL=5
STABILITY_COUNT=6
_wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
# Delete pods with the restart-on-reboot=true label
PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
if [ "$1" == 'recover' ]; then
for pod in $PODS; do
LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
kubectl delete pods -n ${pod//// } --wait=false
done
elif [ "$1" == 'verify' ]; then
PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
for pod in $PODS; do
LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
@ -214,11 +226,11 @@ function _force_reset_pods {
function _examine_pods {
# $1: actions <recover|verify>
# No need to wait for pod transitions if we know the pod needs to be restarted
# Manage labeled pods first
_labeled_pods $1
# Wait for pods transitions to stop
_wait_for_pod_stabilization
_wait_for_pod_stabilization "" $SLEEP_DELAY_SEC 6
# Check for recovery actions
_unknown_pods $1

View File

@ -17,10 +17,12 @@ Requires: systemd
%define local_dir /usr/local
%define local_sbindir %{local_dir}/sbin
%define k8s_recovery_conf_dir /etc/k8s-post-recovery.d
%prep
%install
install -d %{buildroot}%{k8s_recovery_conf_dir}
install -d %{buildroot}%{local_sbindir}
install -m 755 %{SOURCE0} %{buildroot}%{local_sbindir}/k8s-pod-recovery
install -p -D -m 644 %{SOURCE1} %{buildroot}%{_unitdir}/k8s-pod-recovery.service
@ -50,3 +52,4 @@ fi
%defattr(-,root,root,-)
%{local_sbindir}/k8s-pod-recovery
%{_unitdir}/k8s-pod-recovery.service
%{k8s_recovery_conf_dir}

View File

@ -0,0 +1,138 @@
From c72ad02d7be3edaf17a07bb6b2c40249ba00038e Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Tue, 21 Apr 2020 16:06:35 -0600
Subject: [PATCH] Fix exclusive CPU allocations being deleted at container
restart
The expectation is that exclusive CPU allocations happen at pod
creation time. When a container restarts, it should not have its
exclusive CPU allocations removed, and it should not need to
re-allocate CPUs.
There are a few places in the current code that look for containers
that have exited and call CpuManager.RemoveContainer() to clean up
the container. This will end up deleting any exclusive CPU
allocations for that container, and if the container restarts within
the same pod it will end up using the default cpuset rather than
what should be exclusive CPUs.
Removing those calls and adding resource cleanup at allocation
time should get rid of the problem.
Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 19 +++++++++----------
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 12 ++++++++++++
pkg/kubelet/cm/internal_container_lifecycle.go | 9 ---------
3 files changed, 21 insertions(+), 19 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 08d45c77182..c682f813a8a 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -242,6 +242,9 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
}
func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
+ // Garbage collect any stranded resources before allocating CPUs.
+ m.removeStaleState()
+
m.Lock()
defer m.Unlock()
@@ -422,18 +425,14 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
}
if cstatus.State.Terminated != nil {
- // Since the container is terminated, we know it is safe to
- // remove it without any reconciliation. Removing the container
- // will also remove it from the `containerMap` so that this
- // container will be skipped next time around the loop.
+ // The container is terminated but we can't call m.RemoveContainer()
+ // here because it could remove the allocated cpuset for the container
+ // which may be in the process of being restarted. That would result
+ // in the container losing any exclusively-allocated CPUs that it
+ // was allocated.
_, _, err := m.containerMap.GetContainerRef(containerID)
if err == nil {
- klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID)
- err := m.RemoveContainer(containerID)
- if err != nil {
- klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err)
- failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
- }
+ klog.Warningf("[cpumanager] reconcileState: ignoring terminated (pod: %s, container id: %s)", pod.Name, containerID)
}
continue
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index e806c62e80e..e3e0097cafb 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -41,6 +41,12 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
+type mockSourcesReady struct{}
+
+func (s *mockSourcesReady) AddSource(source string) {}
+
+func (s *mockSourcesReady) AllReady() bool { return false }
+
type mockState struct {
assignments state.ContainerCPUAssignments
defaultCPUSet cpuset.CPUSet
@@ -277,6 +283,8 @@ func TestCPUManagerAdd(t *testing.T) {
podStatusProvider: mockPodStatusProvider{},
}
+ mgr.sourcesReady = &mockSourcesReady{}
+
pod := makePod("fakePod", "fakeContainer", "2", "2")
container := &pod.Spec.Containers[0]
err := mgr.Allocate(pod, container)
@@ -497,6 +505,8 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
podStatusProvider: mockPodStatusProvider{},
}
+ mgr.sourcesReady = &mockSourcesReady{}
+
containers := append(
testCase.pod.Spec.InitContainers,
testCase.pod.Spec.Containers...)
@@ -1038,6 +1048,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
podStatusProvider: mockPodStatusProvider{},
}
+ mgr.sourcesReady = &mockSourcesReady{}
+
pod := makePod("fakePod", "fakeContainer", "2", "2")
container := &pod.Spec.Containers[0]
err := mgr.Allocate(pod, container)
diff --git a/pkg/kubelet/cm/internal_container_lifecycle.go b/pkg/kubelet/cm/internal_container_lifecycle.go
index 9e243430269..690718e4e68 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle.go
@@ -54,19 +54,10 @@ func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, containe
}
func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error {
- if i.cpuManager != nil {
- return i.cpuManager.RemoveContainer(containerID)
- }
return nil
}
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
- if i.cpuManager != nil {
- err := i.cpuManager.RemoveContainer(containerID)
- if err != nil {
- return err
- }
- }
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManager) {
err := i.topologyManager.RemoveContainer(containerID)
if err != nil {
--
2.16.6

View File

@ -0,0 +1,79 @@
From 8b765213a4e6d5cd4eecf361dadfec2851f1dd59 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 23 Oct 2020 17:46:10 -0600
Subject: [PATCH] enable support for kubernetes to ignore isolcpus
The normal mechanisms for allocating isolated CPUs do not allow
a mix of isolated and exclusive CPUs in the same container. In
order to allow this in *very* limited cases where the pod spec
is known in advance we will add the ability to disable the normal
isolcpus behaviour.
If the file "/etc/kubernetes/ignore_isolcpus" exists, then kubelet
will basically forget everything it knows about isolcpus and just
treat them like regular CPUs.
The admin user can then rely on the fact that CPU allocation is
deterministic to ensure that the isolcpus they configure end up being
allocated to the correct pods.
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 9 +++++++++
pkg/kubelet/cm/cpumanager/policy_static.go | 8 ++++++++
2 files changed, 17 insertions(+)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index c682f813..92992991 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -19,6 +19,7 @@ package cpumanager
import (
"fmt"
"math"
+ "os"
"sync"
"time"
"strings"
@@ -56,6 +57,14 @@ const cpuManagerStateFileName = "cpu_manager_state"
// get the system-level isolated CPUs
func getIsolcpus() cpuset.CPUSet {
+
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ klog.Infof("[cpumanager] turning off isolcpus awareness")
+ return cpuset.NewCPUSet()
+ }
+
dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
if err != nil {
klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 1913065e..4fb3202f 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "os"
"strconv"
v1 "k8s.io/api/core/v1"
@@ -510,6 +511,13 @@ func isKubeInfra(pod *v1.Pod) bool {
// get the isolated CPUs (if any) from the devices associated with a specific container
func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ return cpuset.NewCPUSet()
+ }
+
// NOTE: This is required for TestStaticPolicyAdd() since makePod() does
// not create UID. We also need a way to properly stub devicemanager.
if len(string(pod.UID)) == 0 {
--
2.16.6

View File

@ -0,0 +1,12 @@
diff --git a/vendor/golang.org/x/net/http2/transport.go b/vendor/golang.org/x/net/http2/transport.go
index aeac7d8..ec18648 100644
--- a/vendor/golang.org/x/net/http2/transport.go
+++ b/vendor/golang.org/x/net/http2/transport.go
@@ -2404,6 +2404,7 @@ func strSliceContains(ss []string, s string) bool {
type erringRoundTripper struct{ err error }
+func (rt erringRoundTripper) IsHTTP2ErringRoundtripper() {}
func (rt erringRoundTripper) RoundTrip(*http.Request) (*http.Response, error) { return nil, rt.err }
// gzipReader wraps a response body so it can lazily

View File

@ -0,0 +1,106 @@
From 9ff79a463fd4502dd1800198bc0b367e5861baf3 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 28 Aug 2020 21:17:42 -0600
Subject: [PATCH] kubeadm: create platform pods with zero CPU resources
We want to specify zero CPU resources when creating the manifests
for the static platform pods, as a workaround for the lack of
separate resource tracking for platform resources.
We also specify zero CPU resources for the coredns deployment.
manifests.go appears to be the main file for this, not sure if the
others are used but I changed them just in case.
---
cluster/addons/dns/coredns/coredns.yaml.base | 2 +-
cluster/addons/dns/coredns/coredns.yaml.in | 2 +-
cluster/addons/dns/coredns/coredns.yaml.sed | 2 +-
cmd/kubeadm/app/phases/addons/dns/manifests.go | 2 +-
cmd/kubeadm/app/phases/controlplane/manifests.go | 6 +++---
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
index a8f0afb5085..45054a29420 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.base
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
@@ -124,7 +124,7 @@ spec:
limits:
memory: __PILLAR__DNS__MEMORY__LIMIT__
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
index ad65d946095..9b2b183faec 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.in
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
@@ -124,7 +124,7 @@ spec:
limits:
memory: {{ pillar['dns_memory_limit'] }}
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
index 3c86e5749a5..6b0c3388bcd 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
@@ -124,7 +124,7 @@ spec:
limits:
memory: $DNS_MEMORY_LIMIT
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
index 737d9d97cbe..7a3b2d61f37 100644
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
@@ -250,7 +250,7 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
index ae9cd77e259..9222805ecfb 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
@@ -57,7 +57,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
Command: getAPIServerCommand(cfg, endpoint),
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeAPIServer)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/healthz", int(endpoint.BindPort), v1.URISchemeHTTPS),
- Resources: staticpodutil.ComponentResources("250m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeAPIServer),
map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}),
@@ -68,7 +68,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
Command: getControllerManagerCommand(cfg),
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS),
- Resources: staticpodutil.ComponentResources("200m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil),
kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{
@@ -78,7 +78,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
Command: getSchedulerCommand(cfg),
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS),
- Resources: staticpodutil.ComponentResources("100m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil),
}
--
2.24.2

View File

@ -0,0 +1,111 @@
From 696c016ebaae6c4cfa24fb5a492d20ebde41d7f8 Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Thu, 5 Sep 2019 10:46:58 -0400
Subject: [PATCH 1/6] kubelet cpumanager disable CFS quota throttling for
Guaranteed pods
This disables CFS CPU quota to avoid performance degradation due to
Linux kernel CFS quota implementation. Note that 4.18 kernel attempts
to solve the CFS throttling problem, but there are reports that it is
not completely effective.
This disables CFS quota throttling for Guaranteed pods for both
parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us.
Disabling has a dramatic latency improvement for HTTP response times.
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 22 ++++++++++++++++++++++
pkg/kubelet/cm/helpers_linux.go | 5 +++++
pkg/kubelet/cm/helpers_linux_test.go | 8 ++++----
3 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 616a620f8ce..c0c440453a9 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -36,6 +36,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
+ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
@@ -230,6 +231,14 @@ func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) e
// Get the CPUs assigned to the container during Allocate()
// (or fall back to the default CPUSet if none were assigned).
cpus := m.state.GetCPUSetOrDefault(string(p.UID), c.Name)
+
+ // Guaranteed PODs should not have CFS quota throttle
+ if m.policy.Name() == string(PolicyStatic) && v1qos.GetPodQOS(p) == v1.PodQOSGuaranteed {
+ err := m.disableContainerCPUQuota(containerID)
+ if err != nil {
+ klog.Errorf("[cpumanager] AddContainer disable CPU Quota error: %v", err)
+ }
+ }
m.Unlock()
if !cpus.IsEmpty() {
@@ -462,3 +471,16 @@ func (m *manager) updateContainerCPUSet(containerID string, cpus cpuset.CPUSet)
CpusetCpus: cpus.String(),
})
}
+
+func (m *manager) disableContainerCPUQuota(containerID string) error {
+ // Disable CFS CPU quota to avoid performance degradation due to
+ // Linux kernel CFS throttle implementation.
+ // NOTE: 4.18 kernel attempts to solve CFS throttling problem,
+ // but there are reports that it is not completely effective.
+ return m.containerRuntime.UpdateContainerResources(
+ containerID,
+ &runtimeapi.LinuxContainerResources{
+ CpuPeriod: 100000,
+ CpuQuota: -1,
+ })
+}
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index f6a1d519026..8aa6f87ad49 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -157,6 +157,11 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64)
// determine the qos class
qosClass := v1qos.GetPodQOS(pod)
+ // disable cfs quota for guaranteed pods
+ if qosClass == v1.PodQOSGuaranteed {
+ cpuQuota = int64(-1)
+ }
+
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go
index 56d765fbc22..0c43afe5875 100644
--- a/pkg/kubelet/cm/helpers_linux_test.go
+++ b/pkg/kubelet/cm/helpers_linux_test.go
@@ -63,8 +63,8 @@ func TestResourceConfigForPod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -283,8 +283,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
--
2.16.6

View File

@ -0,0 +1,139 @@
From d0e89da9ebcbd9a13051ab5366b6daef2cec9bbe Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 27 Sep 2019 14:11:54 -0600
Subject: [PATCH 4/6] kubelet cpumanager infrastructure pods use system
reserved CPUs
This assigns system infrastructure pods to the "reserved" cpuset
to isolate them from the shared pool of CPUs.
Infrastructure pods include any pods that belong to the kube-system,
armada, cert-manager, vault, platform-deployment-manager, portieris,
or notification namespaces.
The implementation is a bit simplistic: it is assumed that the
"reserved" cpuset is large enough to handle all infrastructure pods'
CPU allocations.
This also prevents infrastructure pods from using Guaranteed resources.
Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
---
pkg/kubelet/cm/cpumanager/policy_static.go | 45 +++++++++++++++++++++++++
pkg/kubelet/cm/cpumanager/policy_static_test.go | 19 ++++++++++-
2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index e631d5d6a74..e511caf7ab7 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -32,6 +32,11 @@ import (
// PolicyStatic is the name of the static policy
const PolicyStatic policyName = "static"
+// Define namespaces used by platform infrastructure pods
+var infraNamespaces = [...]string{
+ "kube-system", "armada", "cert-manager", "platform-deployment-manager", "portieris", "vault", "notification",
+}
+
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
@@ -205,6 +210,32 @@ func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet {
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
+ // Process infra pods before guaranteed pods
+ if isKubeInfra(pod) {
+ // Container belongs in reserved pool.
+ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error.
+ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ }
+
+ cpuset := p.reserved
+ if cpuset.IsEmpty() {
+ // If this happens then someone messed up.
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ }
+ s.SetCPUSet(string(pod.UID), container.Name, cpuset)
+ klog.Infof("[cpumanager] static policy: reserved: AddContainer " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
+ return nil
+ }
+
+
if numCPUs := p.guaranteedCPUs(pod, container); numCPUs != 0 {
klog.Infof("[cpumanager] static policy: Allocate (pod: %s, container: %s)", pod.Name, container.Name)
// container belongs in an exclusively allocated pool
@@ -300,6 +331,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
+ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class
+ if isKubeInfra(pod) {
+ return 0
+ }
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
@@ -417,3 +452,13 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, req
return hints
}
+
+// check if a given pod is in a platform infrastructure namespace
+func isKubeInfra(pod *v1.Pod) bool {
+ for _, namespace := range infraNamespaces {
+ if namespace == pod.Namespace {
+ return true
+ }
+ }
+ return false
+}
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index b2982432c13..04947d28055 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -747,7 +747,8 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
}
func TestStaticPolicyAddWithResvList(t *testing.T) {
-
+ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m")
+ infraPod.Namespace = "kube-system"
testCases := []staticPolicyTestWithResvList{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
@@ -789,6 +790,22 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(4, 5),
},
+ {
+ description: "InfraPod, SingleSocketHT, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0, 1),
+ },
}
testExcl := true
--
2.16.6

View File

@ -0,0 +1,526 @@
From de3b9749f765398d4064c3225caa0a960d27eff3 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Thu, 9 Apr 2020 12:52:19 -0600
Subject: [PATCH 5/6] kubelet cpumanager introduce concept of isolated CPUs
This introduces the concept of "isolated CPUs", which are CPUs that
have been isolated at the kernel level via the "isolcpus" kernel boot
parameter.
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via
'--system-reserved=cpu' will be used for infrastructure pods while the
isolated CPUs should be reserved via '--kube-reserved=cpu' to cause
kubelet to skip over them for "normal" CPU resource tracking. The
kubelet code will double-check that the specified isolated CPUs match
what the kernel exposes in "/sys/devices/system/cpu/isolated".
A plugin (outside the scope of this commit) will expose the isolated
CPUs to kubelet via the device plugin API.
If a pod specifies some number of "isolcpus" resources, the device
manager will allocate them. In this code we check whether such
resources have been allocated, and if so we set the container cpuset to
the isolated CPUs. This does mean that it really only makes sense to
specify "isolcpus" resources for best-effort or burstable pods, not for
guaranteed ones since that would throw off the accounting code. In
order to ensure the accounting still works as designed, if "isolcpus"
are specified for guaranteed pods, the affinity will be set to the
non-isolated CPUs.
Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
---
pkg/kubelet/cm/container_manager_linux.go | 1 +
pkg/kubelet/cm/cpumanager/cpu_manager.go | 30 ++++++++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 14 +++-
pkg/kubelet/cm/cpumanager/policy_static.go | 86 +++++++++++++++++++++++--
pkg/kubelet/cm/cpumanager/policy_static_test.go | 46 ++++++++++---
5 files changed, 158 insertions(+), 19 deletions(-)
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index 13c7176bdc2..e6ffb7a6194 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -325,6 +325,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
+ cm.deviceManager,
)
if err != nil {
klog.Errorf("failed to initialize cpu manager: %v", err)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 322a2040a77..08d45c77182 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -21,6 +21,8 @@ import (
"math"
"sync"
"time"
+ "strings"
+ "io/ioutil"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
@@ -34,6 +36,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
@@ -51,6 +54,25 @@ type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
+// get the system-level isolated CPUs
+func getIsolcpus() cpuset.CPUSet {
+ dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
+ return cpuset.NewCPUSet()
+ }
+
+ // The isolated cpus string ends in a newline
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cset, err := cpuset.Parse(cpustring)
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset")
+ return cpuset.NewCPUSet()
+ }
+
+ return cset
+}
+
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
@@ -127,7 +149,7 @@ func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates new cpu manager based on provided policy
-func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, numaNodeInfo topology.NUMANodeInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
+func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, numaNodeInfo topology.NUMANodeInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) {
var topo *topology.CPUTopology
var policy Policy
@@ -164,7 +186,11 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
// NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
// This variable is primarily to make testing easier.
excludeReserved := true
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, excludeReserved)
+ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or
+ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the
+ // argument list here for ease of testing, it's really internal to the policy.
+ isolCPUs := getIsolcpus()
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, deviceManager, excludeReserved)
if err != nil {
return nil, fmt.Errorf("new static policy error: %v", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index a4d8f13c853..e806c62e80e 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -38,6 +38,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
type mockState struct {
@@ -207,6 +208,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -222,7 +224,8 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
- topologymanager.NewFakeManager(), testExcl)
+ cpuset.NewCPUSet(),
+ topologymanager.NewFakeManager(), testDM, testExcl)
testCases := []struct {
description string
updateErr error
@@ -476,8 +479,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
state := &mockState{
assignments: testCase.stAssignments,
@@ -617,7 +621,8 @@ func TestCPUManagerGenerate(t *testing.T) {
}
defer os.RemoveAll(sDir)
- mgr, err := NewManager(testCase.cpuPolicyName, 5*time.Second, machineInfo, nil, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ mgr, err := NewManager(testCase.cpuPolicyName, 5*time.Second, machineInfo, nil, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
if testCase.expectedError != nil {
if !strings.Contains(err.Error(), testCase.expectedError.Error()) {
t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error())
@@ -972,6 +977,7 @@ func TestReconcileState(t *testing.T) {
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -986,7 +992,9 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
+ testDM,
testExcl,
)
testCases := []struct {
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index e511caf7ab7..490e7675679 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "strconv"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
@@ -27,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
// PolicyStatic is the name of the static policy
@@ -80,6 +82,10 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // subset of reserved CPUs with isolcpus attribute
+ isolcpus cpuset.CPUSet
+ // parent containerManager, used to get device list
+ deviceManager devicemanager.Manager
// If true, default CPUSet should exclude reserved CPUs
excludeReserved bool
// topology manager reference to get container Topology affinity
@@ -92,7 +98,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, excludeReserved bool) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) {
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
@@ -113,9 +119,17 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
klog.Infof("[cpumanager] reserved %d CPUs (\"%s\") not available for exclusive assignment", reserved.Size(), reserved)
+ if !isolCPUs.IsSubsetOf(reserved) {
+ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved)
+ reserved = reserved.Union(isolCPUs)
+ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved)
+ }
+
return &staticPolicy{
topology: topology,
reserved: reserved,
+ isolcpus: isolCPUs,
+ deviceManager: deviceManager,
excludeReserved: excludeReserved,
affinity: affinity,
}, nil
@@ -151,8 +165,8 @@ func (p *staticPolicy) validateState(s state.State) error {
} else {
s.SetDefaultCPUSet(allCPUs)
}
- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
- allCPUs, p.reserved, s.GetDefaultCPUSet())
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n",
+ allCPUs, p.reserved, p.isolcpus, s.GetDefaultCPUSet())
return nil
}
@@ -221,12 +235,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
return nil
}
- cpuset := p.reserved
+ // TODO: Is the clone actually needed?
+ cpuset := p.reserved.Clone().Difference(p.isolcpus)
if cpuset.IsEmpty() {
// If this happens then someone messed up.
return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus " +
- "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v",
- pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved, p.isolcpus)
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
klog.Infof("[cpumanager] static policy: reserved: AddContainer " +
@@ -267,7 +282,37 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
}
}
+ klog.Infof("[cpumanager] guaranteed: AddContainer " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset)
+ return nil
+ }
+
+ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 {
+ // container has requested isolated CPUs through the isolcpus device plugin; pin it to exactly those CPUs
+ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ if set.Equals(isolcpus) {
+ klog.Infof("[cpumanager] isolcpus container already present in state, skipping " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ } else {
+ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ set, isolcpus, pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ }
+ }
+ // State is missing or differs from the device allocation: (re)assign the isolated CPUs below. Note that we do not do anything about init containers here.
+ // It looks like devices are allocated per-pod based on effective requests/limits
+ // and extra devices from initContainers are not freed up when the regular containers start.
+ // TODO: confirm this is still true for 1.18
+ s.SetCPUSet(string(pod.UID), container.Name, isolcpus)
+ klog.Infof("[cpumanager] isolcpus: AddContainer " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus)
+ return nil
+ }
+
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
@@ -462,3 +507,32 @@ func isKubeInfra(pod *v1.Pod) bool {
}
return false
}
+
+// podIsolCPUs returns the isolated CPUs (if any) from the isolcpus devices associated with a specific container; the result is an empty set when there are none.
+func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does
+ // not create UID. We also need a way to properly stub devicemanager.
+ if len(string(pod.UID)) == 0 {
+ return cpuset.NewCPUSet()
+ }
+ devices := p.deviceManager.GetDevices(string(pod.UID), container.Name)
+ for _, dev := range devices {
+ // this resource name needs to match the isolcpus device plugin
+ if dev.ResourceName == "windriver.com/isolcpus" {
+ cpuStrList := dev.DeviceIds
+ if len(cpuStrList) > 0 {
+ cpuSet := cpuset.NewCPUSet()
+ // loop over the list of device ID strings (expected to be CPU numbers), convert each one to int, add to cpuset; a non-numeric ID panics
+ for _, cpuStr := range cpuStrList {
+ cpu, err := strconv.Atoi(cpuStr)
+ if err != nil {
+ panic(err)
+ }
+ cpuSet = cpuSet.Union(cpuset.NewCPUSet(cpu))
+ }
+ return cpuSet
+ }
+ }
+ }
+ return cpuset.NewCPUSet()
+}
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 04947d28055..999ab3c1af0 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -27,6 +27,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
type staticPolicyTest struct {
@@ -45,8 +46,9 @@ type staticPolicyTest struct {
}
func TestStaticPolicyName(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -56,6 +58,7 @@ func TestStaticPolicyName(t *testing.T) {
}
func TestStaticPolicyStart(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "non-corrupted state",
@@ -131,7 +134,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -179,6 +182,7 @@ func TestStaticPolicyAdd(t *testing.T) {
largeTopoSock0CPUSet := largeTopoSock0Builder.Result()
largeTopoSock1CPUSet := largeTopoSock1Builder.Result()
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
@@ -447,7 +451,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -490,6 +494,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
excludeReserved := false
testCases := []staticPolicyTest{
{
@@ -549,7 +554,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -571,6 +576,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
excludeReserved := false
testCases := []struct {
description string
@@ -640,7 +646,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -673,6 +679,7 @@ type staticPolicyTestWithResvList struct {
topo *topology.CPUTopology
numReservedCPUs int
reserved cpuset.CPUSet
+ isolcpus cpuset.CPUSet
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
@@ -713,9 +720,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
},
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -755,6 +763,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
@@ -767,6 +776,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
@@ -779,6 +789,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -795,6 +806,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -806,12 +818,30 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(0, 1),
},
+ {
+ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0),
+ },
}
testExcl := true
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
-
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), testDM, testExcl)
+
st := &mockState{
assignments: testCase.stAssignments,
defaultCPUSet: testCase.stDefaultCPUSet,
--
2.16.6

View File

@ -0,0 +1,312 @@
From c109ab23f98b00ee5f98000c760985da967d47a9 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Tue, 1 Oct 2019 00:16:00 -0600
Subject: [PATCH 3/6] kubelet cpumanager keep normal containers off reserved
CPUs
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via '--system-reserved=cpu'
or '--kube-reserved=cpu' will be ignored by kubernetes itself. A small
tweak to the default CPU affinity ensures that "normal" Kubernetes
pods won't run on the reserved CPUs.
Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 5 +++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 11 +++++---
pkg/kubelet/cm/cpumanager/policy_static.go | 29 ++++++++++++++++----
pkg/kubelet/cm/cpumanager/policy_static_test.go | 35 ++++++++++++++++++-------
4 files changed, 61 insertions(+), 19 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index c0c440453a9..322a2040a77 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -161,7 +161,10 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity)
+ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
+ // This variable is primarily to make testing easier.
+ excludeReserved := true
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, excludeReserved)
if err != nil {
return nil, fmt.Errorf("new static policy error: %v", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index e9c7852c602..a4d8f13c853 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -207,6 +207,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -221,7 +222,7 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
- topologymanager.NewFakeManager())
+ topologymanager.NewFakeManager(), testExcl)
testCases := []struct {
description string
updateErr error
@@ -474,8 +475,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
},
}
+ testExcl := false
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
state := &mockState{
assignments: testCase.stAssignments,
@@ -969,6 +971,7 @@ func TestReconcileState(t *testing.T) {
// above test cases are without kubelet --reserved-cpus cmd option
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -983,7 +986,9 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
- topologymanager.NewFakeManager())
+ topologymanager.NewFakeManager(),
+ testExcl,
+ )
testCases := []struct {
description string
updateErr error
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index da68ed808bd..e631d5d6a74 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -75,6 +75,8 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // If true, default CPUSet should exclude reserved CPUs
+ excludeReserved bool
// topology manager reference to get container Topology affinity
affinity topologymanager.Store
}
@@ -85,7 +87,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, excludeReserved bool) (Policy, error) {
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
@@ -109,6 +111,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
return &staticPolicy{
topology: topology,
reserved: reserved,
+ excludeReserved: excludeReserved,
affinity: affinity,
}, nil
}
@@ -136,7 +139,15 @@ func (p *staticPolicy) validateState(s state.State) error {
}
// state is empty initialize
allCPUs := p.topology.CPUDetails.CPUs()
- s.SetDefaultCPUSet(allCPUs)
+ if p.excludeReserved {
+ // Exclude reserved CPUs from the default CPUSet to keep containers off them
+ // unless explicitly affined.
+ s.SetDefaultCPUSet(allCPUs.Difference(p.reserved))
+ } else {
+ s.SetDefaultCPUSet(allCPUs)
+ }
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
+ allCPUs, p.reserved, s.GetDefaultCPUSet())
return nil
}
@@ -144,9 +155,11 @@ func (p *staticPolicy) validateState(s state.State) error {
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
- if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reserved.String(), tmpDefaultCPUset.String())
+ if !p.excludeReserved {
+ if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reserved.String(), tmpDefaultCPUset.String())
+ }
}
// 2. Check if state for static policy is consistent
@@ -175,6 +188,9 @@ func (p *staticPolicy) validateState(s state.State) error {
}
}
totalKnownCPUs = totalKnownCPUs.UnionAll(tmpCPUSets)
+ if p.excludeReserved {
+ totalKnownCPUs = totalKnownCPUs.Union(p.reserved)
+ }
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
@@ -229,6 +245,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
klog.Infof("[cpumanager] static policy: RemoveContainer (pod: %s, container: %s)", podUID, containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
+ if p.excludeReserved {
+ toRelease = toRelease.Difference(p.reserved)
+ }
// Mutate the shared pool, adding released cpus.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
}
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index ea2bcf11333..b2982432c13 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -33,6 +33,7 @@ type staticPolicyTest struct {
description string
topo *topology.CPUTopology
numReservedCPUs int
+ excludeReserved bool
podUID string
containerName string
stAssignments state.ContainerCPUAssignments
@@ -44,7 +45,8 @@ type staticPolicyTest struct {
}
func TestStaticPolicyName(t *testing.T) {
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ testExcl := false
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -74,6 +76,15 @@ func TestStaticPolicyStart(t *testing.T) {
stDefaultCPUSet: cpuset.NewCPUSet(),
expCSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
+ {
+ description: "empty cpuset exclude reserved",
+ topo: topoDualSocketHT,
+ numReservedCPUs: 2,
+ excludeReserved: true,
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.NewCPUSet(),
+ expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
+ },
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
@@ -120,7 +131,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -436,7 +447,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -479,6 +490,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ excludeReserved := false
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -537,7 +549,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -559,6 +571,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ excludeReserved := false
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -627,7 +640,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -699,9 +712,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
+ testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -741,7 +755,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
@@ -753,7 +767,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
expErr: nil,
expCPUAlloc: true,
@@ -769,7 +783,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
},
},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 4, 5),
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"),
expErr: nil,
expCPUAlloc: true,
@@ -777,8 +791,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
},
}
+ testExcl := true
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
st := &mockState{
assignments: testCase.stAssignments,
--
2.16.6

View File

@ -57,6 +57,14 @@ Source33: genmanpages.sh
Patch1: 0001-Fix-pagesize-check-to-allow-for-options-already-endi.patch
Patch2: kubelet-service-remove-docker-dependency.patch
Patch3: fix_http2_erringroundtripper_handling.patch
Patch4: kubelet-cpumanager-disable-CFS-quota-throttling-for-.patch
Patch5: kubelet-cpumanager-keep-normal-containers-off-reserv.patch
Patch6: kubelet-cpumanager-infrastructure-pods-use-system-re.patch
Patch7: kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
Patch8: Fix-exclusive-CPU-allocations-being-deleted-at-conta.patch
Patch9: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
Patch10: add-option-to-disable-isolcpu-awareness.patch
# It obsoletes cadvisor but needs its source code (literally integrated)
Obsoletes: cadvisor
@ -841,6 +849,14 @@ Kubernetes client tools like kubectl
%setup -q -n %{con_repo}-%{con_commit} -T -b 1
%setup -q -n %{repo}-%{commit}
%patch1 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
%patch10 -p1
# copy contrib folder
mkdir contrib
@ -877,6 +893,10 @@ export KUBE_EXTRA_GOPATH=$(pwd)/Godeps/_workspace
%ifarch ppc64le
export GOLDFLAGS='-linkmode=external'
%endif
# uncomment these two lines to build unoptimized binaries for debugging.
# export GOLDFLAGS=""
# export GOGCFLAGS="-N -l"
make WHAT="cmd/kube-proxy cmd/kube-apiserver cmd/kube-controller-manager cmd/kubelet cmd/kubeadm cmd/kube-scheduler cmd/kubectl"
# convert md to man

View File

@ -14,6 +14,10 @@ if [ -z "${IMAGE_TAG}" ]; then
exit 1
fi
# https://bugs.launchpad.net/starlingx/+bug/1927153
# pin clearlinux/golang to the last known working tag
sed -i 's!clearlinux/golang:latest!clearlinux/golang:1.15.10!' build/docker/${DEVICE}.Dockerfile || exit 1
make ${DEVICE}
if [ $? -ne 0 ]; then

View File

@ -1,2 +1,2 @@
SRC_DIR="files"
TIS_PATCH_VER=PKG_GITREVCOUNT
BUILD_IS_SLOW=3

View File

@ -0,0 +1,66 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# Copyright (C) 2019 Intel Corporation
#
Summary: isolcpus-device-plugin
Name: isolcpus-device-plugin
Version: 1.0
Release: %{tis_patch_ver}%{?_tis_dist}
License: Apache-2.0
Group: base
Packager: Wind River
URL: unknown
BuildArch: x86_64
Source: %name-%version.tar.gz
BuildRequires: golang
BuildRequires: systemd
Requires: kubernetes-node
Summary: Kubernetes device plugin for isolcpus
%description
Expose isolated CPUs to Kubernetes as devices via the device plugin API
%define local_etc_pmond /etc/pmon.d/
%prep
%autosetup
# The "-mod=vendor" bit is because we want to use the dependencies from the vendor
# directory rather than downloading them on the fly. The "-ldflags=-linkmode=external"
# is there to work around the fact that the RPM infrastructure wants to see
# a ".note.gnu.build-id" build ID, but "go build" gives a ".note.go.build-id" build ID.
%build
go build -mod=vendor -ldflags=-linkmode=external
%install
mkdir -p %{buildroot}%{_exec_prefix}/local/sbin
install -m 755 isolcpu_plugin %{buildroot}%{_exec_prefix}/local/sbin/isolcpu_plugin
mkdir -p %{buildroot}%{_unitdir}
install -m 644 isolcpu_plugin.service %{buildroot}%{_unitdir}/isolcpu_plugin.service
mkdir -p %{buildroot}%{local_etc_pmond}
install -m 644 isolcpu_plugin.conf %{buildroot}%{local_etc_pmond}/isolcpu_plugin.conf
%files
%{_exec_prefix}/local/sbin/isolcpu_plugin
%{_unitdir}/isolcpu_plugin.service
%{local_etc_pmond}/isolcpu_plugin.conf
# Enable the service and start it.
%post
if [ $1 -eq 1 ] ; then
# Initial installation
systemctl enable --now isolcpu_plugin.service >/dev/null 2>&1 || :
fi
# Disable the service and stop it.
%preun
%systemd_preun isolcpu_plugin.service
# Try to restart the service. Useful for RPM package upgrades during patching.
%postun
%systemd_postun_with_restart isolcpu_plugin.service
exit 0

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,45 @@
# Isolated CPUs Device Plugin for Kubernetes
## About
This code implements a Kubernetes device plugin. The plugin detects all CPUs
specified via "isolcpus=X" in the kernel boot args, and exports them to
Kubernetes as custom devices using the deviceplugin API.
It makes heavy use of the Intel device plugin manager from github.com/intel/intel-device-plugins-for-kubernetes
and credit is due to them for making a useful helper. A good example of how
to use that framework can be found at
https://github.com/intel/intel-device-plugins-for-kubernetes/blob/master/cmd/gpu_plugin/gpu_plugin.go
## Implementation Notes
There are currently problems with using go modules for the deviceplugin API: it
leads to a "go: error loading module requirements" error when running "go build".
Accordingly, it was necessary to copy a number of files from external packages.
As part of this work I also updated the deviceplugin API files to the latest
versions to pick up in-development upstream changes.
The "intel/intel-device-plugins-for-kubernetes" subdirectory corresponds to
"github.com/intel/intel-device-plugins-for-kubernetes".
The "kubernetes" subdirectory corresponds to "k8s.io/kubernetes".
In an ideal world, these two subdirectories would not be needed, and instead we
would simply include the following imports in isolcpu.go:
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/debug"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
This would also require updating the Intel package to pick up the latest
deviceplugin API so that the topology field is properly represented.
## Build Notes
In order to avoid the need for a network connection to download dependencies
at build time, I've chosen to include all the dependencies in the "vendor"
directory. This is auto-generated by running "go mod vendor". The binary
is then built with "go build -mod=vendor".

View File

@ -0,0 +1,9 @@
module isolcpu_plugin
require (
github.com/fsnotify/fsnotify v1.4.7
github.com/gogo/protobuf v1.2.1
github.com/pkg/errors v0.8.1
google.golang.org/grpc v1.23.0
k8s.io/klog v0.4.0
)

View File

@ -0,0 +1,38 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas=
github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE=
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/net v0.0.0-20190311183353-d8887717615a h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8 h1:Nw54tB0rB7hY/N0NQvRW8DG4Yk3Q6T9cu9RcFQDu1tc=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/grpc v1.23.0 h1:AzbTB6ux+okLTzP8Ru1Xs41C303zdcfEht7MQnYJt5A=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
k8s.io/klog v0.4.0 h1:lCJCxf/LIowc2IGS9TPjWDyXY4nOmdGdfcwwDQCOURQ=
k8s.io/klog v0.4.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=

View File

@ -0,0 +1,57 @@
// Copyright 2018 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package debug
import (
"fmt"
"runtime"
"strings"
)
// prefix is the tag that Print and Printf place at the start of every debug line.
const (
prefix = "DEBUG"
)
// isEnabled gates all debug output. It starts out false, is set to true by
// Activate, and Print/Printf do nothing while it is false.
var (
isEnabled bool
)
// getFileAndLine returns a "file:line" label for the code that invoked one of
// the debug helpers, with the file reduced to its last path component.
// It returns "???:0" when the runtime cannot supply the caller information.
func getFileAndLine() string {
	// Skip two frames: this function and the helper that called it.
	_, fullPath, lineNo, found := runtime.Caller(2)
	if found {
		// Everything after the final slash; the whole string when there is none.
		base := fullPath[strings.LastIndex(fullPath, "/")+1:]
		return fmt.Sprintf("%s:%d", base, lineNo)
	}
	return "???:0"
}
// Activate activates debugging output for the whole package by setting the
// package-level isEnabled flag. The flag is written without any
// synchronization, so call it before other goroutines start using Print or
// Printf.
func Activate() {
isEnabled = true
}
// Print writes its arguments with fmt.Println(), preceded by the debug prefix
// and the caller's file:line, but only when debug output has been activated.
func Print(obj ...interface{}) {
	if !isEnabled {
		return
	}
	// getFileAndLine must be called directly from here so that the frame it
	// reports is Print's caller.
	args := make([]interface{}, 0, len(obj)+2)
	args = append(args, prefix, getFileAndLine())
	args = append(args, obj...)
	fmt.Println(args...)
}
// Printf formats its arguments with fmt.Printf() when debug output has been
// activated. The debug prefix and the caller's file:line are put in front of
// pattern and a newline is appended after it.
func Printf(pattern string, obj ...interface{}) {
	if !isEnabled {
		return
	}
	// getFileAndLine must be called directly from here so that the frame it
	// reports is Printf's caller.
	format := prefix + " " + getFileAndLine() + " " + pattern + "\n"
	fmt.Printf(format, obj...)
}

View File

@ -0,0 +1,73 @@
// Copyright 2018 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright (c) 2019 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package deviceplugin
import (
pluginapi "isolcpu_plugin/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
// DeviceInfo contains information about device maintained by Device Plugin
type DeviceInfo struct {
// State is the health string reported to kubelet; Allocate only accepts
// devices whose State equals pluginapi.Healthy.
State string
// Nodes are the device specs added to a container's allocation response.
Nodes []pluginapi.DeviceSpec
// Mounts are the mounts added to a container's allocation response.
Mounts []pluginapi.Mount
// Envs are environment variables copied into a container's allocation response.
Envs map[string]string
// NumaNode is the NUMA node ID advertised as the device topology.
// A negative value means no topology is advertised for the device.
NumaNode int
}
// DeviceTree contains a tree-like structure of device type -> device ID -> device info.
// The inner per-type maps are created on demand by AddDevice.
type DeviceTree map[string]map[string]DeviceInfo
// NewDeviceTree returns an empty, ready-to-use DeviceTree.
func NewDeviceTree() DeviceTree {
	return DeviceTree{}
}
// AddDevice stores info under the given device type and device ID, creating
// the per-type map the first time a device type is seen. An existing entry
// with the same type and ID is overwritten.
func (tree DeviceTree) AddDevice(devType, id string, info DeviceInfo) {
	devices, known := tree[devType]
	if !known {
		devices = make(map[string]DeviceInfo)
		tree[devType] = devices
	}
	devices[id] = info
}
// Notifier receives updates from Scanner, detects changes and sends the
// detected changes to a channel given by the creator of a Notifier object.
// A Scanner is handed a Notifier as the argument of its Scan method.
type Notifier interface {
// Notify notifies manager with a device tree constructed by device
// plugin during scanning process. The tree is expected to describe all
// devices found by the scan, not just the changes.
Notify(DeviceTree)
}
// Scanner serves as an interface between Manager and a device plugin.
type Scanner interface {
// Scan scans the host for devices and sends all found devices to
// a Notifier instance. It's called only once for every device plugin by
// Manager in a goroutine and operates in an infinite loop.
// Manager.Run terminates the process when Scan returns a non-nil error
// and stops its event loop when Scan returns nil.
Scan(Notifier) error
}
// PostAllocator is an optional interface implemented by device plugins.
// Manager discovers it with a type assertion on the Scanner it was given and,
// when present, passes PostAllocate to every server it creates.
type PostAllocator interface {
// PostAllocate modifies responses returned by Allocate() by e.g.
// adding annotations consumed by CRI hooks to the responses.
// A non-nil error makes the Allocate call fail with that error.
PostAllocate(*pluginapi.AllocateResponse) error
}

View File

@ -0,0 +1,135 @@
// Copyright 2018 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package deviceplugin
import (
"fmt"
"os"
"reflect"
pluginapi "isolcpu_plugin/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"isolcpu_plugin/intel/intel-device-plugins-for-kubernetes/pkg/debug"
)
// updateInfo contains info for added, updated and deleted devices.
// Each field is keyed by device type; it is produced by notifier.Notify by
// comparing a new scan result with the previous one.
type updateInfo struct {
// Added holds device types that were absent from the previous scan.
Added DeviceTree
// Updated holds device types present in both scans whose devices differ.
Updated DeviceTree
// Removed holds device types from the previous scan that are missing
// from the new one.
Removed DeviceTree
}
// notifier implements Notifier interface.
type notifier struct {
// deviceTree is the tree passed to the most recent Notify call; it is nil
// until Notify has been called once.
deviceTree DeviceTree
// updatesCh is the send-only channel on which detected changes are published.
updatesCh chan<- updateInfo
}
// newNotifier builds a notifier that publishes detected changes on updatesCh.
// It starts with no previous device tree, so everything in the first scan is
// reported as added.
func newNotifier(updatesCh chan<- updateInfo) *notifier {
	n := new(notifier)
	n.updatesCh = updatesCh
	return n
}
// Notify compares newDeviceTree with the tree from the previous call and, if
// anything differs, sends one updateInfo describing the added, updated and
// removed device types. newDeviceTree then becomes the reference for the
// next call.
func (n *notifier) Notify(newDeviceTree DeviceTree) {
	added, updated := NewDeviceTree(), NewDeviceTree()

	for devType, current := range newDeviceTree {
		previous, known := n.deviceTree[devType]
		if !known {
			added[devType] = current
			continue
		}
		if !reflect.DeepEqual(previous, current) {
			updated[devType] = current
		}
		// Types that are still present are dropped from the old tree, so
		// whatever is left in it afterwards has disappeared.
		delete(n.deviceTree, devType)
	}

	removed := n.deviceTree
	if len(added)+len(updated)+len(removed) > 0 {
		n.updatesCh <- updateInfo{Added: added, Updated: updated, Removed: removed}
	}

	n.deviceTree = newDeviceTree
}
// Manager manages life cycle of device plugins and handles the scan results
// received from them.
type Manager struct {
// devicePlugin is the scanner whose results drive the manager.
devicePlugin Scanner
// namespace is handed to each server's Serve call.
namespace string
// servers holds the running server for every known device type.
servers map[string]devicePluginServer
// createServer builds the server for a device type; NewManager sets it to
// newServer. Presumably a field so tests can substitute a fake - confirm.
createServer func(string, func(*pluginapi.AllocateResponse) error) devicePluginServer
}
// NewManager returns a Manager for the given namespace that will drive
// devicePlugin. It starts with no servers and uses newServer to create them.
func NewManager(namespace string, devicePlugin Scanner) *Manager {
	mgr := new(Manager)
	mgr.namespace = namespace
	mgr.devicePlugin = devicePlugin
	mgr.createServer = newServer
	mgr.servers = map[string]devicePluginServer{}
	return mgr
}
// Run starts the device scan in a goroutine and then processes the updates it
// produces until the scan finishes. A scan error terminates the process with
// exit status 1.
func (m *Manager) Run() {
	updatesCh := make(chan updateInfo)

	scan := func() {
		if err := m.devicePlugin.Scan(newNotifier(updatesCh)); err != nil {
			fmt.Printf("Device scan failed: %+v\n", err)
			os.Exit(1)
		}
		// A clean return from Scan ends the event loop below.
		close(updatesCh)
	}
	go scan()

	for update := range updatesCh {
		m.handleUpdate(update)
	}
}
// handleUpdate applies one batch of scan changes to the set of running servers:
//   - for every added device type a server is created, started in its own
//     goroutine and given the initial device list;
//   - for every updated device type the new device list is pushed to its server;
//   - for every removed device type the server is stopped and forgotten.
//
// A server whose Serve call fails terminates the process with exit status 1.
func (m *Manager) handleUpdate(update updateInfo) {
	debug.Print("Received dev updates:", update)

	// The optional post-allocation hook depends only on the plugin, so it is
	// resolved once rather than once per added device type.
	var postAllocate func(*pluginapi.AllocateResponse) error
	if postAllocator, ok := m.devicePlugin.(PostAllocator); ok {
		postAllocate = postAllocator.PostAllocate
	}

	for devType, devices := range update.Added {
		srv := m.createServer(devType, postAllocate)
		m.servers[devType] = srv
		// The server is passed to the goroutine by value: looking it up in
		// m.servers from there would race with the writes and deletes this
		// method keeps performing on the map.
		go func(dt string, s devicePluginServer) {
			if err := s.Serve(m.namespace); err != nil {
				fmt.Printf("Failed to serve %s/%s: %+v\n", m.namespace, dt, err)
				os.Exit(1)
			}
		}(devType, srv)
		srv.Update(devices)
	}
	for devType, devices := range update.Updated {
		m.servers[devType].Update(devices)
	}
	for devType := range update.Removed {
		// Stop fails when the server has not started serving yet; report it
		// instead of dropping the error silently.
		if err := m.servers[devType].Stop(); err != nil {
			fmt.Printf("Failed to stop server for %s/%s: %+v\n", m.namespace, devType, err)
		}
		delete(m.servers, devType)
	}
}

View File

@ -0,0 +1,324 @@
// Copyright 2017 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright (c) 2019 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package deviceplugin
import (
"context"
"fmt"
"net"
"os"
"path"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
"github.com/pkg/errors"
"google.golang.org/grpc"
pluginapi "isolcpu_plugin/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"isolcpu_plugin/intel/intel-device-plugins-for-kubernetes/pkg/debug"
)
// serverState is the life-cycle phase of a server.
type serverState int
// Server state
const (
// uninitialized is the state newServer gives a fresh server.
uninitialized serverState = iota
// serving is set when setupAndServe starts; its loop runs while this holds.
serving
// terminating is set by Stop.
terminating
)
// devicePluginServer maintains a gRPC server satisfying
// pluginapi.PluginInterfaceServer interfaces.
// This internal unexposed interface simplifies unit testing.
type devicePluginServer interface {
// Serve runs the server for the given namespace; Manager calls it in a
// dedicated goroutine and exits the process if it returns an error.
Serve(namespace string) error
// Stop shuts the server down.
Stop() error
// Update hands the server the complete, current device list.
Update(devices map[string]DeviceInfo)
}
// server implements devicePluginServer and pluginapi.PluginInterfaceServer interfaces.
type server struct {
// devType is the device type served; it is combined with the namespace to
// form the resource name and the socket file name.
devType string
// grpcServer is created by setupAndServe and is nil before that.
grpcServer *grpc.Server
// updatesCh carries full device lists from Update to ListAndWatch; Stop closes it.
updatesCh chan map[string]DeviceInfo
// devices is the latest device list: replaced by ListAndWatch, read by
// sendDevices and Allocate.
devices map[string]DeviceInfo
// postAllocate, when non-nil, is run on every Allocate response.
postAllocate func(*pluginapi.AllocateResponse) error
// state is the life-cycle phase; access it through setState/getState.
state serverState
// stateMutex guards state.
stateMutex sync.Mutex
}
// newServer returns an uninitialized server for devType, exposed through the
// devicePluginServer interface. postAllocate may be nil.
func newServer(devType string, postAllocate func(*pluginapi.AllocateResponse) error) devicePluginServer {
	srv := new(server)
	srv.devType = devType
	srv.postAllocate = postAllocate
	srv.state = uninitialized
	srv.devices = map[string]DeviceInfo{}
	// With one slot of buffering a single Update can complete before any
	// ListAndWatch call is receiving.
	srv.updatesCh = make(chan map[string]DeviceInfo, 1) // TODO: is 1 needed?
	return srv
}
// GetDevicePluginOptions logs the call and answers with default (empty)
// plugin options; it never fails.
func (srv *server) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
	fmt.Println("GetDevicePluginOptions: return empty options")
	options := &pluginapi.DevicePluginOptions{}
	return options, nil
}
// sendDevices sends the server's current device list over stream as a single
// ListAndWatch response. Topology is attached only to devices whose NumaNode
// is non-negative. If the send fails the server is stopped and the wrapped
// error is returned.
func (srv *server) sendDevices(stream pluginapi.DevicePlugin_ListAndWatchServer) error {
	resp := &pluginapi.ListAndWatchResponse{}

	for id, info := range srv.devices {
		entry := &pluginapi.Device{ID: id, Health: info.State}
		if info.NumaNode >= 0 {
			node := &pluginapi.NUMANode{ID: int64(info.NumaNode)}
			entry.Topology = &pluginapi.TopologyInfo{Nodes: []*pluginapi.NUMANode{node}}
		}
		resp.Devices = append(resp.Devices, entry)
	}
	debug.Print("Sending to kubelet", resp.Devices)

	err := stream.Send(resp)
	if err == nil {
		return nil
	}
	srv.Stop()
	return errors.Wrapf(err, "Cannot update device list")
}
// ListAndWatch sends the current device list on stream and then re-sends it
// every time a new list arrives on the updates channel. It returns nil once
// that channel is closed, or the first send error.
func (srv *server) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
	debug.Print("Started ListAndWatch for", srv.devType)

	err := srv.sendDevices(stream)
	if err != nil {
		return err
	}

	for latest := range srv.updatesCh {
		// The received list replaces the server's view before it is sent.
		srv.devices = latest
		if err = srv.sendDevices(stream); err != nil {
			return err
		}
	}
	return nil
}
// Allocate builds one container response per container request, collecting
// the device specs, mounts and environment variables of every requested
// device. It fails if a requested device ID is unknown or not healthy, or if
// the optional postAllocate hook returns an error.
func (srv *server) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	response := &pluginapi.AllocateResponse{}

	for _, containerReq := range rqt.ContainerRequests {
		containerResp := &pluginapi.ContainerAllocateResponse{}

		for _, id := range containerReq.DevicesIDs {
			dev, found := srv.devices[id]
			switch {
			case !found:
				return nil, errors.Errorf("Invalid allocation request with non-existing device %s", id)
			case dev.State != pluginapi.Healthy:
				return nil, errors.Errorf("Invalid allocation request with unhealthy device %s", id)
			}

			for i := range dev.Nodes {
				containerResp.Devices = append(containerResp.Devices, &dev.Nodes[i])
			}
			for i := range dev.Mounts {
				containerResp.Mounts = append(containerResp.Mounts, &dev.Mounts[i])
			}

			// The Envs map is only created once there is something to put in it.
			if len(dev.Envs) > 0 && containerResp.Envs == nil {
				containerResp.Envs = make(map[string]string)
			}
			for key, value := range dev.Envs {
				containerResp.Envs[key] = value
			}
		}

		response.ContainerResponses = append(response.ContainerResponses, containerResp)
	}

	if srv.postAllocate == nil {
		return response, nil
	}
	if err := srv.postAllocate(response); err != nil {
		return nil, err
	}
	return response, nil
}
// PreStartContainer is not supported by this server: it always returns a nil
// response together with an error.
// NOTE(review): GetDevicePluginOptions returns empty options, so presumably
// kubelet never requests this call - confirm.
func (srv *server) PreStartContainer(ctx context.Context, rqt *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
return nil, errors.New("PreStartContainer() should not be called")
}
// Serve starts a gRPC server to serve pluginapi.PluginInterfaceServer interface.
// It delegates to setupAndServe using the plugin directory and kubelet socket
// defined by the pluginapi package, and returns whatever setupAndServe returns.
func (srv *server) Serve(namespace string) error {
return srv.setupAndServe(namespace, pluginapi.DevicePluginPath, pluginapi.KubeletSocket)
}
// Stop stops serving pluginapi.PluginInterfaceServer interface.
//
// It returns an error if no gRPC server has been created yet, i.e. Serve has
// not got far enough to create one. Otherwise it marks the server as
// terminating, stops the gRPC server and closes the updates channel, which
// ends ListAndWatch's loop.
//
// Stop may be called more than once: sendDevices stops the server when a send
// fails and the manager stops it again when the device type disappears. Only
// the first call performs the shutdown, because closing the updates channel a
// second time would panic.
func (srv *server) Stop() error {
	if srv.grpcServer == nil {
		return errors.New("Can't stop non-existing gRPC server. Calling Stop() before Serve()?")
	}

	// Check and set the state under one lock so that exactly one caller wins.
	srv.stateMutex.Lock()
	alreadyTerminating := srv.state == terminating
	srv.state = terminating
	srv.stateMutex.Unlock()
	if alreadyTerminating {
		return nil
	}

	srv.grpcServer.Stop()
	close(srv.updatesCh)
	return nil
}
// Update sends updates from Manager to ListAndWatch's event loop.
// The send blocks while the channel's buffer is full and nothing is
// receiving. It must not be called after Stop, which closes the channel:
// sending on a closed channel panics.
func (srv *server) Update(devices map[string]DeviceInfo) {
srv.updatesCh <- devices
}
// setState records the server's life-cycle state while holding stateMutex.
func (srv *server) setState(state serverState) {
	srv.stateMutex.Lock()
	srv.state = state
	srv.stateMutex.Unlock()
}
// getState returns the server's life-cycle state, read while holding stateMutex.
func (srv *server) getState() serverState {
	srv.stateMutex.Lock()
	current := srv.state
	srv.stateMutex.Unlock()
	return current
}
// setupAndServe binds given gRPC server to device manager, starts it and registers it with kubelet.
//
// The resource registered with kubelet is "<namespace>/<devType>" and the
// plugin listens on the unix socket "<namespace>-<devType>.sock" inside
// devicePluginPath. Each pass of the loop brings up a fresh gRPC server,
// registers it, and then blocks until the socket file is removed or renamed;
// the loop repeats while the server is still in the serving state.
// It returns nil once the state is no longer serving, or an error if any
// setup step fails.
func (srv *server) setupAndServe(namespace string, devicePluginPath string, kubeletSocket string) error {
resourceName := namespace + "/" + srv.devType
pluginPrefix := namespace + "-" + srv.devType
srv.setState(serving)
for srv.getState() == serving {
pluginEndpoint := pluginPrefix + ".sock"
pluginSocket := path.Join(devicePluginPath, pluginEndpoint)
// A successful dial means something is already serving on this socket.
if err := waitForServer(pluginSocket, time.Second); err == nil {
return errors.Errorf("Socket %s is already in use", pluginSocket)
}
// Clear any stale socket file; the result is deliberately not checked.
os.Remove(pluginSocket)
lis, err := net.Listen("unix", pluginSocket)
if err != nil {
return errors.Wrap(err, "Failed to listen to plugin socket")
}
srv.grpcServer = grpc.NewServer()
pluginapi.RegisterDevicePluginServer(srv.grpcServer, srv)
// Starts device plugin service.
go func() {
fmt.Printf("Start server for %s at: %s\n", srv.devType, pluginSocket)
// NOTE(review): the error returned by Serve is not inspected.
srv.grpcServer.Serve(lis)
}()
// Wait for the server to start
if err = waitForServer(pluginSocket, 10*time.Second); err != nil {
return err
}
// Register with Kubelet.
err = registerWithKubelet(kubeletSocket, pluginEndpoint, resourceName)
if err != nil {
return err
}
fmt.Printf("Device plugin for %s registered\n", srv.devType)
// Kubelet removes plugin socket when it (re)starts
// plugin must restart in this case
if err = watchFile(pluginSocket); err != nil {
return err
}
// Still serving means the socket vanished without Stop being called, so
// the gRPC server is torn down here and the loop sets up a new one.
if srv.getState() == serving {
srv.grpcServer.Stop()
fmt.Printf("Socket %s removed, restarting\n", pluginSocket)
} else {
fmt.Printf("Socket %s shut down\n", pluginSocket)
}
}
return nil
}
// watchFile blocks until file is removed or renamed, then returns nil.
//
// The watch is placed on the directory that contains file, and events are
// filtered by name. It returns an error if the watcher cannot be created, if
// the directory cannot be watched, or if the watcher reports an error while
// waiting.
func watchFile(file string) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return errors.Wrapf(err, "Failed to create watcher for %s", file)
	}
	defer watcher.Close()

	err = watcher.Add(filepath.Dir(file))
	if err != nil {
		return errors.Wrapf(err, "Failed to add %s to watcher", file)
	}

	// fsnotify.Op is a set of bits, so a single event may carry several
	// operations. Test with a mask rather than comparing for equality, which
	// would miss Remove or Rename when it is combined with another operation.
	const gone = fsnotify.Remove | fsnotify.Rename

	for {
		select {
		case ev := <-watcher.Events:
			if ev.Op&gone != 0 && ev.Name == file {
				return nil
			}
		case err := <-watcher.Errors:
			return errors.WithStack(err)
		}
	}
}
// registerWithKubelet connects to kubelet's registration service on the unix
// socket kubeletSocket and announces this plugin: the API version, the
// endpoint (socket file name) it serves on, and the resource name it provides.
// The connection is closed before returning.
func registerWithKubelet(kubeletSocket, pluginEndPoint, resourceName string) error {
	// kubeletSocket is a filesystem path, so dial it as a unix socket.
	unixDialer := func(addr string, timeout time.Duration) (net.Conn, error) {
		return net.DialTimeout("unix", addr, timeout)
	}

	conn, err := grpc.Dial(kubeletSocket, grpc.WithInsecure(), grpc.WithDialer(unixDialer))
	if err != nil {
		return errors.Wrap(err, "Cannot connect to kubelet service")
	}
	defer conn.Close()

	request := &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     pluginEndPoint,
		ResourceName: resourceName,
	}
	registration := pluginapi.NewRegistrationClient(conn)
	if _, err = registration.Register(context.Background(), request); err != nil {
		return errors.Wrap(err, "Cannot register to kubelet service")
	}
	return nil
}
// waitForServer checks whether a gRPC server is reachable on the unix socket
// at the given path by making a blocking connection attempt bounded by
// timeout. It returns nil when the connection succeeds and a wrapped error
// otherwise; any connection obtained is closed before returning.
func waitForServer(socket string, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	unixDialer := func(addr string, timeout time.Duration) (net.Conn, error) {
		return net.DialTimeout("unix", addr, timeout)
	}
	conn, err := grpc.DialContext(ctx, socket,
		grpc.WithInsecure(),
		grpc.WithBlock(),
		grpc.WithDialer(unixDialer),
	)
	if conn != nil {
		conn.Close()
	}
	if err != nil {
		return errors.Wrapf(err, "Failed dial context at %s", socket)
	}
	return nil
}

View File

@ -0,0 +1,141 @@
// Copyright 2017 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright (c) 2019 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package main
import (
"isolcpu_plugin/intel/intel-device-plugins-for-kubernetes/pkg/debug"
dpapi "isolcpu_plugin/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
pluginapi "isolcpu_plugin/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"isolcpu_plugin/kubernetes/pkg/kubelet/cm/cpuset"
"github.com/pkg/errors"
"io/ioutil"
"strconv"
"strings"
"time"
"flag"
"fmt"
"path"
"regexp"
)
// Identifiers used by the isolcpus device plugin.
const (
// namespace is the namespace passed to dpapi.NewManager in main.
namespace = "windriver.com"
// deviceType is the device type under which scan adds each isolated CPU to the device tree.
deviceType = "isolcpus"
// nodeRE matches directory entries named "node<num>"; getCPUNode uses it to find a CPU's node entry.
nodeRE = `^node[0-9]+$`
)
// devicePlugin scans sysfs for isolated CPUs and reports them as devices.
type devicePlugin struct {
// nodeReg is the compiled nodeRE pattern used to recognise "node<num>" entries.
nodeReg *regexp.Regexp
}
// newDevicePlugin returns a devicePlugin whose node matcher is compiled from nodeRE.
func newDevicePlugin() *devicePlugin {
	plugin := new(devicePlugin)
	plugin.nodeReg = regexp.MustCompile(nodeRE)
	return plugin
}
// Scan repeatedly calls dp.scan, hands each resulting device tree to
// notifier.Notify and then sleeps for 300 seconds. It only returns when
// dp.scan fails, passing that error through unchanged.
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
	const rescanInterval = 300 * time.Second
	for {
		tree, scanErr := dp.scan()
		if scanErr != nil {
			return scanErr
		}
		notifier.Notify(tree)

		// This is only a precaution, we don't live-offline CPUs.
		time.Sleep(rescanInterval)
	}
}
// getCPUNode returns the NUMA node of a CPU.
//
// It lists /sys/devices/system/cpu/cpu<cpu> and looks for an entry named
// "node<num>"; <num> is returned as the node. On failure it returns -1 and a
// non-nil error: the directory cannot be read, the node number cannot be
// converted, or no node entry exists.
func (dp *devicePlugin) getCPUNode(cpu int) (int, error) {
	cpuPath := "/sys/devices/system/cpu/cpu" + strconv.Itoa(cpu)
	files, err := ioutil.ReadDir(cpuPath)
	if err != nil {
		return -1, errors.Wrap(err, "Can't read sysfs CPU subdir")
	}

	// there should be only one file of the form "node<num>"
	for _, f := range files {
		name := f.Name()
		if !dp.nodeReg.MatchString(name) {
			continue
		}
		node, err := strconv.Atoi(strings.TrimPrefix(name, "node"))
		if err != nil {
			return -1, errors.Wrap(err, "Can't convert node to int")
		}
		return node, nil
	}

	// err is nil at this point, and errors.Wrap returns nil when given a nil
	// error, so a fresh error is created to actually report the failure.
	return -1, errors.New("No node file found")
}
// scan builds a device tree containing one device per isolated CPU.
//
// The CPU list is read from /sys/devices/system/cpu/isolated and parsed with
// cpuset.Parse. Each CPU is added under deviceType with its
// /dev/cpu/<cpu>/cpuid path exposed read-only and its NUMA node attached.
// An error is returned only when the isolated-CPU file cannot be read or
// parsed; an empty list yields an empty tree.
func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
	dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
	if err != nil {
		return nil, errors.Wrap(err, "Can't read sysfs isolcpus subdir")
	}

	// The isolated cpus string ends in a newline
	cpustring := strings.TrimSuffix(string(dat), "\n")
	cset, err := cpuset.Parse(cpustring)
	if err != nil {
		return nil, errors.Wrap(err, "Can't convert isolcpus string to cpuset")
	}

	devTree := dpapi.NewDeviceTree()
	for _, cpu := range cset.ToSlice() {
		cpustr := strconv.Itoa(cpu)

		// A failed node lookup is not fatal: the CPU is still advertised
		// with the node value getCPUNode returned, but the failure is
		// reported in the debug output instead of being dropped.
		numaNode, err := dp.getCPUNode(cpu)
		if err != nil {
			debug.Printf("Can't determine NUMA node of CPU %s: %v", cpustr, err)
		}

		devPath := path.Join("/dev/cpu", cpustr, "cpuid")
		debug.Printf("Adding %s to isolcpus", devPath)
		nodes := []pluginapi.DeviceSpec{{
			HostPath:      devPath,
			ContainerPath: devPath,
			Permissions:   "r",
		}}
		devTree.AddDevice(deviceType, cpustr, dpapi.DeviceInfo{
			State: pluginapi.Healthy, Nodes: nodes, NumaNode: numaNode,
		})
	}
	return devTree, nil
}
// main parses the -debug flag, optionally activates debug output, and runs
// the device plugin manager for the isolcpus plugin.
func main() {
	debugEnabled := flag.Bool("debug", false, "enable debug output")
	flag.Parse()
	if *debugEnabled {
		debug.Activate()
	}

	fmt.Println("isolcpus device plugin started")

	dpapi.NewManager(namespace, newDevicePlugin()).Run()
}

View File

@ -0,0 +1,16 @@
;
; Copyright (c) 2019 Wind River Systems, Inc.
;
; SPDX-License-Identifier: Apache-2.0
;
[process]
process = isolcpu_plugin
service = isolcpu_plugin
pidfile = /var/run/isolcpu_plugin.pid
style = lsb ; lsb
severity = major ; minor, major, critical
restarts = 3 ; restarts before error assertion
startuptime = 5 ; seconds to wait after process start
interval = 5 ; number of seconds to wait between restarts
debounce = 20 ; number of seconds to wait before degrade clear
subfunction = worker ; pmon will start monitoring once worker config is complete

View File

@ -0,0 +1,15 @@
[Unit]
Description=Kubernetes Isolated CPU Plugin Daemon
Documentation=https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
After=kubelet.service
Requires=kubelet.service
[Service]
ExecStart=/usr/local/sbin/isolcpu_plugin
ExecStartPost=/bin/bash -c 'echo $MAINPID > /var/run/isolcpu_plugin.pid'
Restart=on-failure
RestartSec=3
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,36 @@
# Targets in this package are public unless they set their own visibility.
package(default_visibility = ["//visibility:public"])
# Bring the go_library rule into scope.
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
# Go library for the device plugin v1beta1 API, built from api.pb.go and
# constants.go, with its gogo/protobuf and grpc dependencies.
go_library(
name = "go_default_library",
srcs = [
"api.pb.go",
"constants.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1",
deps = [
"//vendor/github.com/gogo/protobuf/gogoproto:go_default_library",
"//vendor/github.com/gogo/protobuf/proto:go_default_library",
"//vendor/github.com/gogo/protobuf/sortkeys:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",
"//vendor/google.golang.org/grpc/codes:go_default_library",
"//vendor/google.golang.org/grpc/status:go_default_library",
],
)
# Every file matched by the "**" glob in this package; private to this package.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Wrapper around :package-srcs that uses the package default visibility.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)

View File

@ -0,0 +1,174 @@
// To regenerate api.pb.go run hack/update-device-plugin.sh
syntax = 'proto3';
package v1beta1;
import "github.com/gogo/protobuf/gogoproto/gogo.proto";
option (gogoproto.goproto_stringer_all) = false;
option (gogoproto.stringer_all) = true;
option (gogoproto.goproto_getters_all) = true;
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_unrecognized_all) = false;
// Registration is the service advertised by the Kubelet.
// Only when Kubelet answers with a success code to a Register Request
// may Device Plugins start their service.
// Registration may fail when the device plugin version is not supported by
// Kubelet or the registered resourceName is already taken by another
// active device plugin. A device plugin is expected to terminate upon registration failure.
service Registration {
// Register sends a RegisterRequest and receives an Empty response.
rpc Register(RegisterRequest) returns (Empty) {}
}
// DevicePluginOptions holds the options a device plugin communicates to the
// Device Manager, either in a RegisterRequest or via GetDevicePluginOptions.
message DevicePluginOptions {
// Indicates if PreStartContainer call is required before each container start
bool pre_start_required = 1;
}
// RegisterRequest is the message a device plugin sends to the Registration
// service to register itself.
message RegisterRequest {
// Version of the API the Device Plugin was built against
string version = 1;
// Name of the unix socket the device plugin is listening on
// PATH = path.Join(DevicePluginPath, endpoint)
string endpoint = 2;
// Schedulable resource name. As of now it's expected to be a DNS Label
string resource_name = 3;
// Options to be communicated with Device Manager
DevicePluginOptions options = 4;
}
// Empty is a message without fields, used as the request or response type of
// RPCs that carry no data (e.g. the response of Register).
message Empty {
}
// DevicePlugin is the service advertised by Device Plugins
service DevicePlugin {
// GetDevicePluginOptions returns options to be communicated with Device
// Manager
rpc GetDevicePluginOptions(Empty) returns (DevicePluginOptions) {}
// ListAndWatch returns a stream of List of Devices
// Whenever a Device state changes or a Device disappears, ListAndWatch
// returns the new list
rpc ListAndWatch(Empty) returns (stream ListAndWatchResponse) {}
// Allocate is called during container creation so that the Device
// Plugin can run device specific operations and instruct Kubelet
// of the steps to make the Device available in the container
rpc Allocate(AllocateRequest) returns (AllocateResponse) {}
// PreStartContainer is called, if indicated by Device Plugin during registration phase,
// before each container start. Device plugin can run device specific operations
// such as resetting the device before making devices available to the container
rpc PreStartContainer(PreStartContainerRequest) returns (PreStartContainerResponse) {}
}
// ListAndWatchResponse is one element of the stream returned by ListAndWatch.
// ListAndWatch returns a stream of List of Devices
// Whenever a Device state changes or a Device disappears, ListAndWatch
// returns the new list
message ListAndWatchResponse {
// The list of devices carried by this response.
repeated Device devices = 1;
}
// TopologyInfo describes the topology of a Device as a list of NUMA nodes.
message TopologyInfo {
// NUMA nodes listed for the device.
repeated NUMANode nodes = 1;
}
// NUMANode identifies a single NUMA node by its numeric ID.
message NUMANode {
// Numeric identifier of the NUMA node.
int64 ID = 1;
}
/* Device describes one device reported by a device plugin. E.g:
 * struct Device {
 *    ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e",
 *    Health: "Healthy",
 *    Topology:
 *      Nodes:
 *        ID: 1
 * } */
message Device {
// A unique ID assigned by the device plugin used
// to identify devices during the communication
// Max length of this field is 63 characters
string ID = 1;
// Health of the device, can be healthy or unhealthy, see constants.go
string health = 2;
// Topology for device
TopologyInfo topology = 3;
}
// PreStartContainerRequest is the request message of the PreStartContainer RPC.
// - PreStartContainer is expected to be called before each container start if indicated by plugin during registration phase.
// - PreStartContainer allows kubelet to pass reinitialized devices to containers.
// - PreStartContainer allows Device Plugin to run device specific operations on
// the Devices requested
message PreStartContainerRequest {
// IDs of the devices the call applies to.
repeated string devicesIDs = 1;
}
// PreStartContainerResponse will be sent by the plugin in response to a PreStartContainerRequest.
// It carries no fields.
message PreStartContainerResponse {
}
// AllocateRequest is the request message of the Allocate RPC.
// - Allocate is expected to be called during pod creation since allocation
// failures for any container would result in pod startup failure.
// - Allocate allows kubelet to expose additional artifacts in a pod's
// environment as directed by the plugin.
// - Allocate allows Device Plugin to run device specific operations on
// the Devices requested
message AllocateRequest {
// One allocation request per container.
repeated ContainerAllocateRequest container_requests = 1;
}
// ContainerAllocateRequest lists the devices requested for a single container;
// it is the element type of AllocateRequest.container_requests.
message ContainerAllocateRequest {
// IDs of the requested devices.
repeated string devicesIDs = 1;
}
// AllocateResponse includes the artifacts that need to be injected into
// a container for accessing 'deviceIDs' that were mentioned as part of
// 'AllocateRequest'.
// Failure Handling:
// if Kubelet sends an allocation request for dev1 and dev2,
// and allocation on dev1 succeeds but allocation on dev2 fails,
// the Device plugin should send a ListAndWatch update and fail the
// Allocation request
message AllocateResponse {
// One response per container.
repeated ContainerAllocateResponse container_responses = 1;
}
// ContainerAllocateResponse describes what must be set up in a single
// container: environment variables, mounts, devices and annotations.
message ContainerAllocateResponse {
// List of environment variables to be set in the container to access one or more devices.
map<string, string> envs = 1;
// Mounts for the container.
repeated Mount mounts = 2;
// Devices for the container.
repeated DeviceSpec devices = 3;
// Container annotations to pass to the container runtime
map<string, string> annotations = 4;
}
// Mount specifies a host volume to mount into a container,
// for example one where a device library or tools are installed.
message Mount {
// Path of the mount within the container.
string container_path = 1;
// Path of the mount on the host.
string host_path = 2;
// If set, the mount is read-only.
bool read_only = 3;
}
// DeviceSpec specifies a host device to mount into a container.
message DeviceSpec {
// Path of the device within the container.
string container_path = 1;
// Path of the device on the host.
string host_path = 2;
// Cgroups permissions of the device, candidates are one or more of
// * r - allows container to read from the specified device.
// * w - allows container to write to the specified device.
// * m - allows container to create device files that do not yet exist.
// For example, "r" grants read access only.
string permissions = 3;
}

View File

@ -0,0 +1,37 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
// Constants of the v1beta1 device plugin API.
const (
// Healthy means that the device is healthy
Healthy = "Healthy"
// Unhealthy means that the device is unhealthy
Unhealthy = "Unhealthy"
// Version is the current version of the API supported by kubelet
Version = "v1beta1"
// DevicePluginPath is the folder the Device Plugin is expecting sockets to be on
// Only privileged pods have access to this path
// Note: Placeholder until we find a "standard path"
DevicePluginPath = "/var/lib/kubelet/device-plugins/"
// KubeletSocket is the path of the Kubelet registry socket
KubeletSocket = DevicePluginPath + "kubelet.sock"
// KubeletPreStartContainerRPCTimeoutInSecs is the timeout duration in secs for PreStartContainer RPC
KubeletPreStartContainerRPCTimeoutInSecs = 30
)
// SupportedVersions is the array of supported API version strings; it
// currently holds only "v1beta1".
var SupportedVersions = [...]string{"v1beta1"}

View File

@ -0,0 +1,306 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpuset
import (
"bytes"
"fmt"
"k8s.io/klog"
"reflect"
"sort"
"strconv"
"strings"
)
// Builder is a mutable builder for CPUSet. Functions that mutate instances
// of this type are not thread-safe.
//
// Builder methods use value receivers, so every copy of a Builder shares the
// same element map and the same done flag.
type Builder struct {
	result CPUSet
	// done points to the flag recording that Result has been called. It is
	// a pointer so that setting it through a value receiver is visible to
	// the caller's Builder; a plain bool would only change the method's
	// private copy.
	done *bool
}

// NewBuilder returns a mutable CPUSet builder.
func NewBuilder() Builder {
	return Builder{
		result: CPUSet{
			elems: map[int]struct{}{},
		},
		done: new(bool),
	}
}

// Add adds the supplied elements to the result. Calling Add after calling
// Result has no effect.
func (b Builder) Add(elems ...int) {
	if b.done != nil && *b.done {
		return
	}
	for _, elem := range elems {
		b.result.elems[elem] = struct{}{}
	}
}

// Result returns the result CPUSet containing all elements that were
// previously added to this builder. Subsequent calls to Add have no effect.
func (b Builder) Result() CPUSet {
	// A Builder not created by NewBuilder has no flag to set.
	if b.done != nil {
		*b.done = true
	}
	return b.result
}

// CPUSet is a thread-safe, immutable set-like data structure for CPU IDs.
type CPUSet struct {
	// elems holds the CPU IDs as map keys.
	elems map[int]struct{}
}
// NewCPUSet returns a new CPUSet containing the supplied elements.
// Duplicate IDs collapse into a single element.
func NewCPUSet(cpus ...int) CPUSet {
	builder := NewBuilder()
	builder.Add(cpus...)
	return builder.Result()
}
// Size returns the number of elements in this set.
// It is the length of the underlying element map, so a zero-value CPUSet
// reports 0.
func (s CPUSet) Size() int {
return len(s.elems)
}
// IsEmpty reports whether this set holds no elements.
func (s CPUSet) IsEmpty() bool {
	return len(s.elems) == 0
}
// Contains reports whether cpu is a member of this set.
func (s CPUSet) Contains(cpu int) bool {
	if _, present := s.elems[cpu]; present {
		return true
	}
	return false
}
// Equals returns true if the supplied set contains exactly the same elements
// as this set (s IsSubsetOf s2 and s2 IsSubsetOf s).
//
// Two sets without elements are always equal, whether their element map is
// nil (zero-value CPUSet) or merely empty.
func (s CPUSet) Equals(s2 CPUSet) bool {
	// reflect.DeepEqual distinguishes a nil map from an empty one, so the
	// "both empty" case is decided before delegating to it.
	if len(s.elems) == 0 && len(s2.elems) == 0 {
		return true
	}
	return reflect.DeepEqual(s.elems, s2.elems)
}
// Filter returns a new CPU set holding every element of this set for which
// predicate returns true. The source set is not modified.
func (s CPUSet) Filter(predicate func(int) bool) CPUSet {
	kept := NewBuilder()
	for id := range s.elems {
		if !predicate(id) {
			continue
		}
		kept.Add(id)
	}
	return kept.Result()
}
// FilterNot returns a new CPU set holding every element of this set for
// which predicate returns false. The source set is not modified.
func (s CPUSet) FilterNot(predicate func(int) bool) CPUSet {
	kept := NewBuilder()
	for id := range s.elems {
		if predicate(id) {
			continue
		}
		kept.Add(id)
	}
	return kept.Result()
}
// IsSubsetOf returns true if the supplied set s2 contains all the elements
// of this set. An empty set is a subset of every set.
func (s CPUSet) IsSubsetOf(s2 CPUSet) bool {
	for id := range s.elems {
		if _, present := s2.elems[id]; !present {
			return false
		}
	}
	return true
}
// Union returns a new CPU set holding every element that is in this set or
// in the supplied set. Neither source set is modified.
func (s CPUSet) Union(s2 CPUSet) CPUSet {
	merged := NewBuilder()
	for _, src := range [2]CPUSet{s, s2} {
		for id := range src.elems {
			merged.Add(id)
		}
	}
	return merged.Result()
}
// UnionAll returns a new CPU set holding every element of this set and of
// each set in s2. None of the source sets is modified.
func (s CPUSet) UnionAll(s2 []CPUSet) CPUSet {
	merged := NewBuilder()
	// Walk this set first, then the supplied ones, via a fresh slice so the
	// caller's slice is left untouched.
	for _, src := range append([]CPUSet{s}, s2...) {
		for id := range src.elems {
			merged.Add(id)
		}
	}
	return merged.Result()
}
// Intersection returns a new CPU set holding the elements present in both
// this set and the supplied set. Neither source set is modified.
func (s CPUSet) Intersection(s2 CPUSet) CPUSet {
	// Membership in s2 is exactly the predicate Filter needs.
	inOther := s2.Contains
	return s.Filter(inOther)
}
// Difference returns a new CPU set holding the elements that are in this
// set but not in the supplied set. Neither source set is modified.
func (s CPUSet) Difference(s2 CPUSet) CPUSet {
	// Elements that s2 contains are the ones FilterNot has to drop.
	inOther := s2.Contains
	return s.FilterNot(inOther)
}
// ToSlice returns the elements of this set as a slice of integers sorted in
// increasing order. The slice is never nil, even for an empty set.
func (s CPUSet) ToSlice() []int {
	ids := make([]int, 0, len(s.elems))
	for id := range s.elems {
		ids = append(ids, id)
	}
	sort.Ints(ids)
	return ids
}
// ToSliceNoSort returns the elements of this set as a slice of integers in
// map iteration order, i.e. without any sorting. The slice is never nil.
func (s CPUSet) ToSliceNoSort() []int {
	ids := make([]int, 0, len(s.elems))
	for id := range s.elems {
		ids = append(ids, id)
	}
	return ids
}
// String returns a new string representation of the elements in this CPU set
// in canonical linux CPU list format: runs of consecutive IDs are written as
// "first-last", single IDs as the bare number, all separated by commas.
// An empty set yields "".
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func (s CPUSet) String() string {
	if s.IsEmpty() {
		return ""
	}

	ids := s.ToSlice()
	var out bytes.Buffer
	for lo := 0; lo < len(ids); {
		// Extend hi while the next ID continues the consecutive run.
		hi := lo
		for hi+1 < len(ids) && ids[hi+1] == ids[hi]+1 {
			hi++
		}
		if lo == hi {
			out.WriteString(strconv.Itoa(ids[lo]))
		} else {
			fmt.Fprintf(&out, "%d-%d", ids[lo], ids[hi])
		}
		out.WriteString(",")
		lo = hi + 1
	}
	// Every range was followed by a separator; drop the trailing one.
	return strings.TrimRight(out.String(), ",")
}
// MustParse constructs a new CPU set from a Linux CPU list formatted string.
// Unlike Parse, it does not return an error: when the input cannot be parsed
// the failure is reported through klog.Fatalf (which, per klog's
// documentation, logs the message and terminates the process).
func MustParse(s string) CPUSet {
	parsed, parseErr := Parse(s)
	if parseErr == nil {
		return parsed
	}
	klog.Fatalf("unable to parse [%s] as CPUSet: %v", s, parseErr)
	return parsed
}
// Parse constructs a new CPU set from a Linux CPU list formatted string,
// e.g. "0-5,34,46-48". The empty string yields an empty set.
//
// An error (together with an empty set) is returned when an element is not a
// number, when a range is malformed (for instance "1-2-3"), or when the start
// of a range is greater than its end (for instance "5-3").
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func Parse(s string) (CPUSet, error) {
	b := NewBuilder()

	// Handle empty string.
	if s == "" {
		return b.Result(), nil
	}

	// Split CPU list string:
	// "0-5,34,46-48" => ["0-5", "34", "46-48"]
	for _, r := range strings.Split(s, ",") {
		// Split on the first "-" only, so that extra dashes end up in the
		// second boundary and are rejected by Atoi instead of causing the
		// whole entry to be skipped.
		boundaries := strings.SplitN(r, "-", 2)
		if len(boundaries) == 1 {
			// Handle ranges that consist of only one element like "34".
			elem, err := strconv.Atoi(boundaries[0])
			if err != nil {
				return NewCPUSet(), err
			}
			b.Add(elem)
			continue
		}

		// Handle multi-element ranges like "0-5".
		start, err := strconv.Atoi(boundaries[0])
		if err != nil {
			return NewCPUSet(), err
		}
		end, err := strconv.Atoi(boundaries[1])
		if err != nil {
			return NewCPUSet(), err
		}
		if start > end {
			return NewCPUSet(), fmt.Errorf("invalid range %q (%d > %d)", r, start, end)
		}
		// Add all elements to the result.
		// e.g. "0-5", "46-48" => [0, 1, 2, 3, 4, 5, 46, 47, 48].
		for e := start; e <= end; e++ {
			b.Add(e)
		}
	}
	return b.Result(), nil
}
// Clone returns a copy of this CPU set backed by its own element map.
func (s CPUSet) Clone() CPUSet {
	dup := NewBuilder()
	for id := range s.elems {
		dup.Add(id)
	}
	return dup.Result()
}

View File

@ -0,0 +1,5 @@
root = true
[*]
indent_style = tab
indent_size = 4

View File

@ -0,0 +1,6 @@
# Setup a Global .gitignore for OS and editor generated files:
# https://help.github.com/articles/ignoring-files
# git config --global core.excludesfile ~/.gitignore_global
.vagrant
*.sublime-project

View File

@ -0,0 +1,30 @@
sudo: false
language: go
go:
- 1.8.x
- 1.9.x
- tip
matrix:
allow_failures:
- go: tip
fast_finish: true
before_script:
- go get -u github.com/golang/lint/golint
script:
- go test -v --race ./...
after_script:
- test -z "$(gofmt -s -l -w . | tee /dev/stderr)"
- test -z "$(golint ./... | tee /dev/stderr)"
- go vet ./...
os:
- linux
- osx
notifications:
email: false

View File

@ -0,0 +1,317 @@
# Changelog
## v1.4.7 / 2018-01-09
* BSD/macOS: Fix possible deadlock on closing the watcher on kqueue (thanks @nhooyr and @glycerine)
* Tests: Fix missing verb on format string (thanks @rchiossi)
* Linux: Fix deadlock in Remove (thanks @aarondl)
* Linux: Watch.Add improvements (avoid race, fix consistency, reduce garbage) (thanks @twpayne)
* Docs: Moved FAQ into the README (thanks @vahe)
* Linux: Properly handle inotify's IN_Q_OVERFLOW event (thanks @zeldovich)
* Docs: replace references to OS X with macOS
## v1.4.2 / 2016-10-10
* Linux: use InotifyInit1 with IN_CLOEXEC to stop leaking a file descriptor to a child process when using fork/exec [#178](https://github.com/fsnotify/fsnotify/pull/178) (thanks @pattyshack)
## v1.4.1 / 2016-10-04
* Fix flaky inotify stress test on Linux [#177](https://github.com/fsnotify/fsnotify/pull/177) (thanks @pattyshack)
## v1.4.0 / 2016-10-01
* add a String() method to Event.Op [#165](https://github.com/fsnotify/fsnotify/pull/165) (thanks @oozie)
## v1.3.1 / 2016-06-28
* Windows: fix for double backslash when watching the root of a drive [#151](https://github.com/fsnotify/fsnotify/issues/151) (thanks @brunoqc)
## v1.3.0 / 2016-04-19
* Support linux/arm64 by [patching](https://go-review.googlesource.com/#/c/21971/) x/sys/unix and switching to it from syscall (thanks @suihkulokki) [#135](https://github.com/fsnotify/fsnotify/pull/135)
## v1.2.10 / 2016-03-02
* Fix golint errors in windows.go [#121](https://github.com/fsnotify/fsnotify/pull/121) (thanks @tiffanyfj)
## v1.2.9 / 2016-01-13
* kqueue: Fix logic for CREATE after REMOVE [#111](https://github.com/fsnotify/fsnotify/pull/111) (thanks @bep)
## v1.2.8 / 2015-12-17
* kqueue: fix race condition in Close [#105](https://github.com/fsnotify/fsnotify/pull/105) (thanks @djui for reporting the issue and @ppknap for writing a failing test)
* inotify: fix race in test
* enable race detection for continuous integration (Linux, Mac, Windows)
## v1.2.5 / 2015-10-17
* inotify: use epoll_create1 for arm64 support (requires Linux 2.6.27 or later) [#100](https://github.com/fsnotify/fsnotify/pull/100) (thanks @suihkulokki)
* inotify: fix path leaks [#73](https://github.com/fsnotify/fsnotify/pull/73) (thanks @chamaken)
* kqueue: watch for rename events on subdirectories [#83](https://github.com/fsnotify/fsnotify/pull/83) (thanks @guotie)
* kqueue: avoid infinite loops from symlinks cycles [#101](https://github.com/fsnotify/fsnotify/pull/101) (thanks @illicitonion)
## v1.2.1 / 2015-10-14
* kqueue: don't watch named pipes [#98](https://github.com/fsnotify/fsnotify/pull/98) (thanks @evanphx)
## v1.2.0 / 2015-02-08
* inotify: use epoll to wake up readEvents [#66](https://github.com/fsnotify/fsnotify/pull/66) (thanks @PieterD)
* inotify: closing watcher should now always shut down goroutine [#63](https://github.com/fsnotify/fsnotify/pull/63) (thanks @PieterD)
* kqueue: close kqueue after removing watches, fixes [#59](https://github.com/fsnotify/fsnotify/issues/59)
## v1.1.1 / 2015-02-05
* inotify: Retry read on EINTR [#61](https://github.com/fsnotify/fsnotify/issues/61) (thanks @PieterD)
## v1.1.0 / 2014-12-12
* kqueue: rework internals [#43](https://github.com/fsnotify/fsnotify/pull/43)
* add low-level functions
* only need to store flags on directories
* less mutexes [#13](https://github.com/fsnotify/fsnotify/issues/13)
* done can be an unbuffered channel
* remove calls to os.NewSyscallError
* More efficient string concatenation for Event.String() [#52](https://github.com/fsnotify/fsnotify/pull/52) (thanks @mdlayher)
* kqueue: fix regression in rework causing subdirectories to be watched [#48](https://github.com/fsnotify/fsnotify/issues/48)
* kqueue: cleanup internal watch before sending remove event [#51](https://github.com/fsnotify/fsnotify/issues/51)
## v1.0.4 / 2014-09-07
* kqueue: add dragonfly to the build tags.
* Rename source code files, rearrange code so exported APIs are at the top.
* Add done channel to example code. [#37](https://github.com/fsnotify/fsnotify/pull/37) (thanks @chenyukang)
## v1.0.3 / 2014-08-19
* [Fix] Windows MOVED_TO now translates to Create like on BSD and Linux. [#36](https://github.com/fsnotify/fsnotify/issues/36)
## v1.0.2 / 2014-08-17
* [Fix] Missing create events on macOS. [#14](https://github.com/fsnotify/fsnotify/issues/14) (thanks @zhsso)
* [Fix] Make ./path and path equivalent. (thanks @zhsso)
## v1.0.0 / 2014-08-15
* [API] Remove AddWatch on Windows, use Add.
* Improve documentation for exported identifiers. [#30](https://github.com/fsnotify/fsnotify/issues/30)
* Minor updates based on feedback from golint.
## dev / 2014-07-09
* Moved to [github.com/fsnotify/fsnotify](https://github.com/fsnotify/fsnotify).
* Use os.NewSyscallError instead of returning errno (thanks @hariharan-uno)
## dev / 2014-07-04
* kqueue: fix incorrect mutex used in Close()
* Update example to demonstrate usage of Op.
## dev / 2014-06-28
* [API] Don't set the Write Op for attribute notifications [#4](https://github.com/fsnotify/fsnotify/issues/4)
* Fix for String() method on Event (thanks Alex Brainman)
* Don't build on Plan 9 or Solaris (thanks @4ad)
## dev / 2014-06-21
* Events channel of type Event rather than *Event.
* [internal] use syscall constants directly for inotify and kqueue.
* [internal] kqueue: rename events to kevents and fileEvent to event.
## dev / 2014-06-19
* Go 1.3+ required on Windows (uses syscall.ERROR_MORE_DATA internally).
* [internal] remove cookie from Event struct (unused).
* [internal] Event struct has the same definition across every OS.
* [internal] remove internal watch and removeWatch methods.
## dev / 2014-06-12
* [API] Renamed Watch() to Add() and RemoveWatch() to Remove().
* [API] Pluralized channel names: Events and Errors.
* [API] Renamed FileEvent struct to Event.
* [API] Op constants replace methods like IsCreate().
## dev / 2014-06-12
* Fix data race on kevent buffer (thanks @tilaks) [#98](https://github.com/howeyc/fsnotify/pull/98)
## dev / 2014-05-23
* [API] Remove current implementation of WatchFlags.
* current implementation doesn't take advantage of OS for efficiency
* provides little benefit over filtering events as they are received, but has extra bookkeeping and mutexes
* no tests for the current implementation
* not fully implemented on Windows [#93](https://github.com/howeyc/fsnotify/issues/93#issuecomment-39285195)
## v0.9.3 / 2014-12-31
* kqueue: cleanup internal watch before sending remove event [#51](https://github.com/fsnotify/fsnotify/issues/51)
## v0.9.2 / 2014-08-17
* [Backport] Fix missing create events on macOS. [#14](https://github.com/fsnotify/fsnotify/issues/14) (thanks @zhsso)
## v0.9.1 / 2014-06-12
* Fix data race on kevent buffer (thanks @tilaks) [#98](https://github.com/howeyc/fsnotify/pull/98)
## v0.9.0 / 2014-01-17
* IsAttrib() for events that only concern a file's metadata [#79][] (thanks @abustany)
* [Fix] kqueue: fix deadlock [#77][] (thanks @cespare)
* [NOTICE] Development has moved to `code.google.com/p/go.exp/fsnotify` in preparation for inclusion in the Go standard library.
## v0.8.12 / 2013-11-13
* [API] Remove FD_SET and friends from Linux adapter
## v0.8.11 / 2013-11-02
* [Doc] Add Changelog [#72][] (thanks @nathany)
* [Doc] Spotlight and double modify events on macOS [#62][] (reported by @paulhammond)
## v0.8.10 / 2013-10-19
* [Fix] kqueue: remove file watches when parent directory is removed [#71][] (reported by @mdwhatcott)
* [Fix] kqueue: race between Close and readEvents [#70][] (reported by @bernerdschaefer)
* [Doc] specify OS-specific limits in README (thanks @debrando)
## v0.8.9 / 2013-09-08
* [Doc] Contributing (thanks @nathany)
* [Doc] update package path in example code [#63][] (thanks @paulhammond)
* [Doc] GoCI badge in README (Linux only) [#60][]
* [Doc] Cross-platform testing with Vagrant [#59][] (thanks @nathany)
## v0.8.8 / 2013-06-17
* [Fix] Windows: handle `ERROR_MORE_DATA` on Windows [#49][] (thanks @jbowtie)
## v0.8.7 / 2013-06-03
* [API] Make syscall flags internal
* [Fix] inotify: ignore event changes
* [Fix] race in symlink test [#45][] (reported by @srid)
* [Fix] tests on Windows
* lower case error messages
## v0.8.6 / 2013-05-23
* kqueue: Use EVT_ONLY flag on Darwin
* [Doc] Update README with full example
## v0.8.5 / 2013-05-09
* [Fix] inotify: allow monitoring of "broken" symlinks (thanks @tsg)
## v0.8.4 / 2013-04-07
* [Fix] kqueue: watch all file events [#40][] (thanks @ChrisBuchholz)
## v0.8.3 / 2013-03-13
* [Fix] inotify/kqueue memory leak [#36][] (reported by @nbkolchin)
* [Fix] kqueue: use fsnFlags for watching a directory [#33][] (reported by @nbkolchin)
## v0.8.2 / 2013-02-07
* [Doc] add Authors
* [Fix] fix data races for map access [#29][] (thanks @fsouza)
## v0.8.1 / 2013-01-09
* [Fix] Windows path separators
* [Doc] BSD License
## v0.8.0 / 2012-11-09
* kqueue: directory watching improvements (thanks @vmirage)
* inotify: add `IN_MOVED_TO` [#25][] (requested by @cpisto)
* [Fix] kqueue: deleting watched directory [#24][] (reported by @jakerr)
## v0.7.4 / 2012-10-09
* [Fix] inotify: fixes from https://codereview.appspot.com/5418045/ (ugorji)
* [Fix] kqueue: preserve watch flags when watching for delete [#21][] (reported by @robfig)
* [Fix] kqueue: watch the directory even if it isn't a new watch (thanks @robfig)
* [Fix] kqueue: modify after recreation of file
## v0.7.3 / 2012-09-27
* [Fix] kqueue: watch with an existing folder inside the watched folder (thanks @vmirage)
* [Fix] kqueue: no longer get duplicate CREATE events
## v0.7.2 / 2012-09-01
* kqueue: events for created directories
## v0.7.1 / 2012-07-14
* [Fix] for renaming files
## v0.7.0 / 2012-07-02
* [Feature] FSNotify flags
* [Fix] inotify: Added file name back to event path
## v0.6.0 / 2012-06-06
* kqueue: watch files after directory created (thanks @tmc)
## v0.5.1 / 2012-05-22
* [Fix] inotify: remove all watches before Close()
## v0.5.0 / 2012-05-03
* [API] kqueue: return errors during watch instead of sending over channel
* kqueue: match symlink behavior on Linux
* inotify: add `DELETE_SELF` (requested by @taralx)
* [Fix] kqueue: handle EINTR (reported by @robfig)
* [Doc] Godoc example [#1][] (thanks @davecheney)
## v0.4.0 / 2012-03-30
* Go 1 released: build with go tool
* [Feature] Windows support using winfsnotify
* Windows does not have attribute change notifications
* Roll attribute notifications into IsModify
## v0.3.0 / 2012-02-19
* kqueue: add files when watch directory
## v0.2.0 / 2011-12-30
* update to latest Go weekly code
## v0.1.0 / 2011-10-19
* kqueue: add watch on file creation to match inotify
* kqueue: create file event
* inotify: ignore `IN_IGNORED` events
* event String()
* linux: common FileEvent functions
* initial commit
[#79]: https://github.com/howeyc/fsnotify/pull/79
[#77]: https://github.com/howeyc/fsnotify/pull/77
[#72]: https://github.com/howeyc/fsnotify/issues/72
[#71]: https://github.com/howeyc/fsnotify/issues/71
[#70]: https://github.com/howeyc/fsnotify/issues/70
[#63]: https://github.com/howeyc/fsnotify/issues/63
[#62]: https://github.com/howeyc/fsnotify/issues/62
[#60]: https://github.com/howeyc/fsnotify/issues/60
[#59]: https://github.com/howeyc/fsnotify/issues/59
[#49]: https://github.com/howeyc/fsnotify/issues/49
[#45]: https://github.com/howeyc/fsnotify/issues/45
[#40]: https://github.com/howeyc/fsnotify/issues/40
[#36]: https://github.com/howeyc/fsnotify/issues/36
[#33]: https://github.com/howeyc/fsnotify/issues/33
[#29]: https://github.com/howeyc/fsnotify/issues/29
[#25]: https://github.com/howeyc/fsnotify/issues/25
[#24]: https://github.com/howeyc/fsnotify/issues/24
[#21]: https://github.com/howeyc/fsnotify/issues/21

View File

@ -0,0 +1,77 @@
# Contributing
## Issues
* Request features and report bugs using the [GitHub Issue Tracker](https://github.com/fsnotify/fsnotify/issues).
* Please indicate the platform you are using fsnotify on.
* A code example to reproduce the problem is appreciated.
## Pull Requests
### Contributor License Agreement
fsnotify is derived from code in the [golang.org/x/exp](https://godoc.org/golang.org/x/exp) package and it may be included [in the standard library](https://github.com/fsnotify/fsnotify/issues/1) in the future. Therefore fsnotify carries the same [LICENSE](https://github.com/fsnotify/fsnotify/blob/master/LICENSE) as Go. Contributors retain their copyright, so you need to fill out a short form before we can accept your contribution: [Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual).
Please indicate that you have signed the CLA in your pull request.
### How fsnotify is Developed
* Development is done on feature branches.
* Tests are run on BSD, Linux, macOS and Windows.
* Pull requests are reviewed and [applied to master][am] using [hub][].
* Maintainers may modify or squash commits rather than asking contributors to.
* To issue a new release, the maintainers will:
* Update the CHANGELOG
* Tag a version, which will become available through gopkg.in.
### How to Fork
For smooth sailing, always use the original import path. Installing with `go get` makes this easy.
1. Install from GitHub (`go get -u github.com/fsnotify/fsnotify`)
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Ensure everything works and the tests pass (see below)
4. Commit your changes (`git commit -am 'Add some feature'`)
Contribute upstream:
1. Fork fsnotify on GitHub
2. Add your remote (`git remote add fork git@github.com:mycompany/repo.git`)
3. Push to the branch (`git push fork my-new-feature`)
4. Create a new Pull Request on GitHub
This workflow is [thoroughly explained by Katrina Owen](https://splice.com/blog/contributing-open-source-git-repositories-go/).
### Testing
fsnotify uses build tags to compile different code on Linux, BSD, macOS, and Windows.
Before doing a pull request, please do your best to test your changes on multiple platforms, and list which platforms you were able/unable to test on.
To aid in cross-platform testing there is a Vagrantfile for Linux and BSD.
* Install [Vagrant](http://www.vagrantup.com/) and [VirtualBox](https://www.virtualbox.org/)
* Set up [Vagrant Gopher](https://github.com/nathany/vagrant-gopher) in your `src` folder.
* Run `vagrant up` from the project folder. You can also set up just one box with `vagrant up linux` or `vagrant up bsd` (note: the BSD box doesn't support Windows hosts at this time, and NFS may prompt for your host OS password).
* Once set up, you can run the test suite on a given OS with a single command `vagrant ssh linux -c 'cd fsnotify/fsnotify; go test'`.
* When you're done, you will want to halt or destroy the Vagrant boxes.
Notice: fsnotify file system events won't trigger in shared folders. The tests get around this limitation by using the /tmp directory.
Right now there is no equivalent solution for Windows and macOS, but there are Windows VMs [freely available from Microsoft](http://www.modern.ie/en-us/virtualization-tools#downloads).
### Maintainers
Help maintaining fsnotify is welcome. To be a maintainer:
* Submit a pull request and sign the CLA as above.
* You must be able to run the test suite on Mac, Windows, Linux and BSD.
To keep master clean, the fsnotify project uses the "apply mail" workflow outlined in Nathaniel Talbott's post ["Merge pull request" Considered Harmful][am]. This requires installing [hub][].
All code changes should be internal pull requests.
Releases are tagged using [Semantic Versioning](http://semver.org/).
[hub]: https://github.com/github/hub
[am]: http://blog.spreedly.com/2014/06/24/merge-pull-request-considered-harmful/#.VGa5yZPF_Zs

View File

@ -0,0 +1,28 @@
Copyright (c) 2012 The Go Authors. All rights reserved.
Copyright (c) 2012 fsnotify Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,79 @@
# File system notifications for Go
[![GoDoc](https://godoc.org/github.com/fsnotify/fsnotify?status.svg)](https://godoc.org/github.com/fsnotify/fsnotify) [![Go Report Card](https://goreportcard.com/badge/github.com/fsnotify/fsnotify)](https://goreportcard.com/report/github.com/fsnotify/fsnotify)
fsnotify utilizes [golang.org/x/sys](https://godoc.org/golang.org/x/sys) rather than `syscall` from the standard library. Ensure you have the latest version installed by running:
```console
go get -u golang.org/x/sys/...
```
Cross platform: Windows, Linux, BSD and macOS.
|Adapter |OS |Status |
|----------|----------|----------|
|inotify |Linux 2.6.27 or later, Android\*|Supported [![Build Status](https://travis-ci.org/fsnotify/fsnotify.svg?branch=master)](https://travis-ci.org/fsnotify/fsnotify)|
|kqueue |BSD, macOS, iOS\*|Supported [![Build Status](https://travis-ci.org/fsnotify/fsnotify.svg?branch=master)](https://travis-ci.org/fsnotify/fsnotify)|
|ReadDirectoryChangesW|Windows|Supported [![Build status](https://ci.appveyor.com/api/projects/status/ivwjubaih4r0udeh/branch/master?svg=true)](https://ci.appveyor.com/project/NathanYoungman/fsnotify/branch/master)|
|FSEvents |macOS |[Planned](https://github.com/fsnotify/fsnotify/issues/11)|
|FEN |Solaris 11 |[In Progress](https://github.com/fsnotify/fsnotify/issues/12)|
|fanotify |Linux 2.6.37+ | |
|USN Journals |Windows |[Maybe](https://github.com/fsnotify/fsnotify/issues/53)|
|Polling |*All* |[Maybe](https://github.com/fsnotify/fsnotify/issues/9)|
\* Android and iOS are untested.
Please see [the documentation](https://godoc.org/github.com/fsnotify/fsnotify) and consult the [FAQ](#faq) for usage information.
## API stability
fsnotify is a fork of [howeyc/fsnotify](https://godoc.org/github.com/howeyc/fsnotify) with a new API as of v1.0. The API is based on [this design document](http://goo.gl/MrYxyA).
All [releases](https://github.com/fsnotify/fsnotify/releases) are tagged based on [Semantic Versioning](http://semver.org/). Further API changes are [planned](https://github.com/fsnotify/fsnotify/milestones), and will be tagged with a new major revision number.
Go 1.6 supports dependencies located in the `vendor/` folder. Unless you are creating a library, it is recommended that you copy fsnotify into `vendor/github.com/fsnotify/fsnotify` within your project, and likewise for `golang.org/x/sys`.
## Contributing
Please refer to [CONTRIBUTING][] before opening an issue or pull request.
## Example
See [example_test.go](https://github.com/fsnotify/fsnotify/blob/master/example_test.go).
## FAQ
**When a file is moved to another directory is it still being watched?**
No (it shouldn't be, unless you are watching where it was moved to).
**When I watch a directory, are all subdirectories watched as well?**
No, you must add watches for any directory you want to watch (a recursive watcher is on the roadmap [#18][]).
**Do I have to watch the Error and Event channels in a separate goroutine?**
As of now, yes. We are looking into making this single-thread friendly (see [howeyc #7][#7]).
**Why am I receiving multiple events for the same file on OS X?**
Spotlight indexing on OS X can result in multiple events (see [howeyc #62][#62]). A temporary workaround is to add your folder(s) to the *Spotlight Privacy settings* until we have a native FSEvents implementation (see [#11][]).
**How many files can be watched at once?**
There are OS-specific limits as to how many watches can be created:
* Linux: /proc/sys/fs/inotify/max_user_watches contains the limit, reaching this limit results in a "no space left on device" error.
* BSD / OSX: sysctl variables "kern.maxfiles" and "kern.maxfilesperproc", reaching these limits results in a "too many open files" error.
[#62]: https://github.com/howeyc/fsnotify/issues/62
[#18]: https://github.com/fsnotify/fsnotify/issues/18
[#11]: https://github.com/fsnotify/fsnotify/issues/11
[#7]: https://github.com/howeyc/fsnotify/issues/7
[contributing]: https://github.com/fsnotify/fsnotify/blob/master/CONTRIBUTING.md
## Related Projects
* [notify](https://github.com/rjeczalik/notify)
* [fsevents](https://github.com/fsnotify/fsevents)

View File

@ -0,0 +1,37 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build solaris
package fsnotify
import (
"errors"
)
// Watcher watches a set of files, delivering events to a channel.
//
// In this Solaris build the type only exists so that the package compiles:
// NewWatcher in this file always returns an error, and the methods below are
// no-ops.
type Watcher struct {
// Events is the channel on which file system events would be delivered.
Events chan Event
// Errors is the channel on which watcher errors would be delivered.
Errors chan error
}
// NewWatcher establishes a new watcher with the underlying OS and begins waiting for events.
//
// File Events Notification (FEN) support is not implemented for Solaris, so
// this always returns a nil *Watcher and a non-nil error.
func NewWatcher() (*Watcher, error) {
	// Error strings carry no trailing newline so they compose cleanly when
	// wrapped or logged by the caller.
	return nil, errors.New("FEN based watcher not yet supported for fsnotify")
}
// Close removes all watches and closes the events channel.
//
// In this Solaris stub there is nothing to release; it always returns nil.
func (w *Watcher) Close() error {
return nil
}
// Add starts watching the named file or directory (non-recursively).
//
// In this Solaris stub no watch is registered; it always returns nil.
func (w *Watcher) Add(name string) error {
return nil
}
// Remove stops watching the named file or directory (non-recursively).
//
// In this Solaris stub there is no watch to remove; it always returns nil.
func (w *Watcher) Remove(name string) error {
return nil
}

View File

@ -0,0 +1,66 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !plan9
// Package fsnotify provides a platform-independent interface for file system notifications.
package fsnotify
import (
"bytes"
"errors"
"fmt"
)
// Event represents a single file system notification.
type Event struct {
Name string // Relative path to the file or directory.
Op Op // File operation(s) that triggered the event; a bit set, test it with e.g. Op&Write == Write.
}
// Op describes a set of file operations. It is a bit set: each operation
// below occupies one bit, so several may be combined in a single value.
type Op uint32

// These are the generalized file operations that can trigger a notification.
const (
	Create Op = 1 << iota
	Write
	Remove
	Rename
	Chmod
)

// String returns the names of the operations set in op, joined by "|" in the
// fixed order CREATE, REMOVE, WRITE, RENAME, CHMOD. It returns the empty
// string when none of those bits is set.
func (op Op) String() string {
	// Each label carries its own leading separator so the loop body stays
	// uniform; the first separator is trimmed at the end.
	labels := [...]struct {
		bit  Op
		text string
	}{
		{Create, "|CREATE"},
		{Remove, "|REMOVE"},
		{Write, "|WRITE"},
		{Rename, "|RENAME"},
		{Chmod, "|CHMOD"},
	}

	var out bytes.Buffer
	for _, l := range labels {
		if op&l.bit != 0 {
			out.WriteString(l.text)
		}
	}
	if out.Len() == 0 {
		return ""
	}
	return out.String()[1:] // Strip leading pipe
}
// String formats the event as the quoted file name followed by its
// operations, for example `"file": REMOVE|WRITE`.
func (e Event) String() string {
	ops := e.Op.String()
	return fmt.Sprintf("%q: %s", e.Name, ops)
}
// ErrEventOverflow is a common error that can be reported by a watcher.
// The inotify reader sends it on the Errors channel when the kernel flags an
// event with IN_Q_OVERFLOW.
var ErrEventOverflow = errors.New("fsnotify queue overflow")

View File

@ -0,0 +1,337 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build linux
package fsnotify
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"unsafe"
"golang.org/x/sys/unix"
)
// Watcher watches a set of files, delivering events to a channel.
type Watcher struct {
// Events delivers file system events; it is closed when the reader goroutine exits.
Events chan Event
// Errors delivers errors from the reader goroutine; it is closed when that goroutine exits.
Errors chan error
mu sync.Mutex // Map access
// fd is the inotify file descriptor.
fd int
// poller waits on fd and on a wake-up pipe.
poller *fdPoller
watches map[string]*watch // Map of inotify watches (key: path)
paths map[int]string // Map of watched paths (key: watch descriptor)
done chan struct{} // Channel for sending a "quit message" to the reader goroutine
doneResp chan struct{} // Channel to respond to Close
}
// NewWatcher creates an inotify instance together with its poller and starts
// the goroutine that delivers events on the returned Watcher's channels.
func NewWatcher() (*Watcher, error) {
	// Open the inotify descriptor close-on-exec.
	inotifyFd, initErr := unix.InotifyInit1(unix.IN_CLOEXEC)
	if inotifyFd == -1 {
		return nil, initErr
	}

	// The poller waits on the inotify descriptor and on a wake-up pipe.
	fdp, pollErr := newFdPoller(inotifyFd)
	if pollErr != nil {
		// Nothing else owns the descriptor yet, so release it here.
		unix.Close(inotifyFd)
		return nil, pollErr
	}

	watcher := new(Watcher)
	watcher.fd = inotifyFd
	watcher.poller = fdp
	watcher.watches = make(map[string]*watch)
	watcher.paths = make(map[int]string)
	watcher.Events = make(chan Event)
	watcher.Errors = make(chan error)
	watcher.done = make(chan struct{})
	watcher.doneResp = make(chan struct{})

	go watcher.readEvents()
	return watcher, nil
}
// isClosed reports whether the done channel has been closed. The check never
// blocks.
func (w *Watcher) isClosed() bool {
	closed := false
	select {
	case <-w.done:
		closed = true
	default:
		// done is still open: the watcher is running.
	}
	return closed
}
// Close removes all watches and closes the events channel.
//
// Close is idempotent and safe to call from several goroutines: only the
// first call shuts the watcher down, later calls return nil immediately.
// The first call blocks until the reader goroutine has finished its cleanup.
func (w *Watcher) Close() error {
	// The closed check and the close itself must be one atomic step.
	// Otherwise two concurrent callers could both see "not closed" and both
	// call close(w.done), and closing a closed channel panics.
	w.mu.Lock()
	if w.isClosed() {
		w.mu.Unlock()
		return nil
	}
	// Send 'close' signal to goroutine, and set the Watcher to closed.
	close(w.done)
	// Release the lock before waiting: the reader goroutine takes w.mu while
	// processing events and must be able to make progress in order to exit.
	w.mu.Unlock()

	// Wake up goroutine
	w.poller.wake()

	// Wait for goroutine to close
	<-w.doneResp

	return nil
}
// Add starts watching the named file or directory (non-recursively).
//
// The path is cleaned first. If it is already watched, the existing inotify
// mask is extended instead of replaced. It returns an error if the watcher
// has been closed or if inotify_add_watch fails.
func (w *Watcher) Add(name string) error {
	name = filepath.Clean(name)
	if w.isClosed() {
		return errors.New("inotify instance already closed")
	}

	const agnosticEvents = unix.IN_MOVED_TO | unix.IN_MOVED_FROM |
		unix.IN_CREATE | unix.IN_ATTRIB | unix.IN_MODIFY |
		unix.IN_MOVE_SELF | unix.IN_DELETE | unix.IN_DELETE_SELF

	var flags uint32 = agnosticEvents

	w.mu.Lock()
	defer w.mu.Unlock()

	existing := w.watches[name]
	if existing != nil {
		// Keep whatever was requested before and ask the kernel to merge.
		flags |= existing.flags | unix.IN_MASK_ADD
	}

	wd, errno := unix.InotifyAddWatch(w.fd, name, flags)
	if wd == -1 {
		return errno
	}

	if existing != nil {
		existing.wd = uint32(wd)
		existing.flags = flags
		return nil
	}

	// First watch on this path: record it in both lookup maps.
	w.watches[name] = &watch{wd: uint32(wd), flags: flags}
	w.paths[wd] = name
	return nil
}
// Remove stops watching the named file or directory (non-recursively).
//
// It returns an error if the cleaned path is not currently watched, or the
// errno from inotify_rm_watch if that call fails.
func (w *Watcher) Remove(name string) error {
	name = filepath.Clean(name)

	w.mu.Lock()
	defer w.mu.Unlock()

	entry, found := w.watches[name]
	if !found {
		return fmt.Errorf("can't remove non-existent inotify watch for: %s", name)
	}

	// Drop our own bookkeeping before talking to the kernel, so the maps no
	// longer reference this watch whatever inotify_rm_watch reports below.
	delete(w.paths, int(entry.wd))
	delete(w.watches, name)

	// inotify_rm_watch returns EINVAL if the file has already been deleted,
	// because the kernel removed the watch implicitly. The only other
	// documented failure is EBADF, when w.fd is not a valid descriptor.
	// TODO: Perhaps it's not helpful to return an error here in every case.
	if rc, errno := unix.InotifyRmWatch(w.fd, entry.wd); rc == -1 {
		return errno
	}
	return nil
}
// watch is the per-path record kept in Watcher.watches.
type watch struct {
wd uint32 // Watch descriptor (as returned by the inotify_add_watch() syscall)
flags uint32 // inotify flags of this watch (see inotify(7) for the list of valid flags)
}
// readEvents reads from the inotify file descriptor, converts the
// received events into Event objects and sends them via the Events channel.
//
// It runs until the done channel is closed. On exit the deferred calls run in
// reverse order: the poller is closed, the inotify descriptor is closed, the
// Events and Errors channels are closed, and finally doneResp is closed.
func (w *Watcher) readEvents() {
var (
buf [unix.SizeofInotifyEvent * 4096]byte // Buffer sized for 4096 raw events that carry no name
n int // Number of bytes read with read()
errno error // Syscall errno
ok bool // For poller.wait
)
defer close(w.doneResp)
defer close(w.Errors)
defer close(w.Events)
defer unix.Close(w.fd)
defer w.poller.close()
for {
// See if we have been closed.
if w.isClosed() {
return
}
ok, errno = w.poller.wait()
if errno != nil {
// Report the poller error, unless we are asked to shut down first.
select {
case w.Errors <- errno:
case <-w.done:
return
}
continue
}
// The poller woke up without data on the inotify descriptor.
if !ok {
continue
}
n, errno = unix.Read(w.fd, buf[:])
// If a signal interrupted execution, see if we've been asked to close, and try again.
// http://man7.org/linux/man-pages/man7/signal.7.html :
// "Before Linux 3.8, reads from an inotify(7) file descriptor were not restartable"
if errno == unix.EINTR {
continue
}
// unix.Read might have been woken up by Close. If so, we're done.
if w.isClosed() {
return
}
if n < unix.SizeofInotifyEvent {
var err error
if n == 0 {
// If EOF is received. This should really never happen.
err = io.EOF
} else if n < 0 {
// If an error occurred while reading.
err = errno
} else {
// Read was too short.
err = errors.New("notify: short read in readEvents()")
}
select {
case w.Errors <- err:
case <-w.done:
return
}
continue
}
var offset uint32
// We don't know how many events we just read into the buffer
// While the offset points to at least one whole event...
for offset <= uint32(n-unix.SizeofInotifyEvent) {
// Point "raw" to the event in the buffer
raw := (*unix.InotifyEvent)(unsafe.Pointer(&buf[offset]))
mask := uint32(raw.Mask)
nameLen := uint32(raw.Len)
if mask&unix.IN_Q_OVERFLOW != 0 {
select {
case w.Errors <- ErrEventOverflow:
case <-w.done:
return
}
}
// If the event happened to the watched directory or the watched file, the kernel
// doesn't append the filename to the event, but we would like to always fill
// the "Name" field with a valid filename. We retrieve the path of the watch from
// the "paths" map.
w.mu.Lock()
// This "ok" is a new variable scoped to the inner loop; it shadows the one declared above.
name, ok := w.paths[int(raw.Wd)]
// IN_DELETE_SELF occurs when the file/directory being watched is removed.
// This is a sign to clean up the maps, otherwise we are no longer in sync
// with the inotify kernel state which has already deleted the watch
// automatically.
if ok && mask&unix.IN_DELETE_SELF == unix.IN_DELETE_SELF {
delete(w.paths, int(raw.Wd))
delete(w.watches, name)
}
w.mu.Unlock()
if nameLen > 0 {
// Point "bytes" at the first byte of the filename
bytes := (*[unix.PathMax]byte)(unsafe.Pointer(&buf[offset+unix.SizeofInotifyEvent]))
// The filename is padded with NULL bytes. TrimRight() gets rid of those.
name += "/" + strings.TrimRight(string(bytes[0:nameLen]), "\000")
}
event := newEvent(name, mask)
// Send the events that are not ignored on the events channel
if !event.ignoreLinux(mask) {
select {
case w.Events <- event:
case <-w.done:
return
}
}
// Move to the next event in the buffer
offset += unix.SizeofInotifyEvent + nameLen
}
}
}
// ignoreLinux reports whether the event should be dropped instead of being
// sent on the Events channel. Events are dropped when the kernel marks them
// IN_IGNORED, or when they are neither a Remove nor a Rename and the file no
// longer exists.
func (e *Event) ignoreLinux(mask uint32) bool {
	// Ignore anything the inotify API says to ignore.
	if mask&unix.IN_IGNORED != 0 {
		return true
	}

	// Removals and renames are always reported; the file is expected to be
	// gone from its old path.
	if e.Op&(Remove|Rename) != 0 {
		return false
	}

	// For every other operation the file must still exist. This was put in
	// place because a MODIFY event was seen arriving after the DELETE; such
	// an event is dropped on the assumption that a DELETE will come or has
	// come.
	_, statErr := os.Lstat(e.Name)
	return os.IsNotExist(statErr)
}
// newEvent returns a platform-independent Event for the given path, with Op
// derived from an inotify mask.
func newEvent(name string, mask uint32) Event {
	var op Op

	// Something appeared under this name, freshly created or moved in.
	if mask&(unix.IN_CREATE|unix.IN_MOVED_TO) != 0 {
		op |= Create
	}
	// The watched object itself, or an entry inside it, was deleted.
	if mask&(unix.IN_DELETE_SELF|unix.IN_DELETE) != 0 {
		op |= Remove
	}
	if mask&unix.IN_MODIFY != 0 {
		op |= Write
	}
	// The watched object itself was moved, or an entry was moved away.
	if mask&(unix.IN_MOVE_SELF|unix.IN_MOVED_FROM) != 0 {
		op |= Rename
	}
	if mask&unix.IN_ATTRIB != 0 {
		op |= Chmod
	}

	return Event{Name: name, Op: op}
}

View File

@ -0,0 +1,187 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build linux
package fsnotify
import (
"errors"
"golang.org/x/sys/unix"
)
// fdPoller bundles the descriptors used to wait on an inotify descriptor
// while remaining interruptible through a pipe.
type fdPoller struct {
	fd   int    // File descriptor (as returned by the inotify_init() syscall)
	epfd int    // Epoll file descriptor
	pipe [2]int // Pipe for waking up
}

// emptyPoller returns a poller for fd whose epoll and pipe descriptors are all
// set to -1, the value close() treats as "not open".
func emptyPoller(fd int) *fdPoller {
	return &fdPoller{
		fd:   fd,
		epfd: -1,
		pipe: [2]int{-1, -1},
	}
}
// newFdPoller creates a poller for the given inotify descriptor.
//
// It creates an epoll instance and a non-blocking wake-up pipe, then
// registers both fd and the read end of the pipe with epoll for EPOLLIN.
// All descriptors it creates are close-on-exec, so they are not inherited by
// child processes. On any failure the descriptors created so far are closed
// and a nil poller is returned with the error; fd itself is never closed here.
func newFdPoller(fd int) (*fdPoller, error) {
	var errno error
	poller := emptyPoller(fd)
	// Undo partial setup on every error return below. The closure reads
	// errno at return time, so each failing step only has to assign it.
	defer func() {
		if errno != nil {
			poller.close()
		}
	}()

	// Create epoll fd
	poller.epfd, errno = unix.EpollCreate1(unix.EPOLL_CLOEXEC)
	if poller.epfd == -1 {
		return nil, errno
	}

	// Create pipe; pipe[0] is the read end, pipe[1] the write end.
	// The descriptors are received in a scratch array so that poller.pipe
	// keeps its -1 sentinels unless the call succeeded; close() relies on
	// those sentinels to decide what to close.
	var pipeFds [2]int
	errno = unix.Pipe2(pipeFds[:], unix.O_NONBLOCK|unix.O_CLOEXEC)
	if errno != nil {
		return nil, errno
	}
	poller.pipe = pipeFds

	// Register inotify fd with epoll
	event := unix.EpollEvent{
		Fd:     int32(poller.fd),
		Events: unix.EPOLLIN,
	}
	errno = unix.EpollCtl(poller.epfd, unix.EPOLL_CTL_ADD, poller.fd, &event)
	if errno != nil {
		return nil, errno
	}

	// Register pipe fd with epoll
	event = unix.EpollEvent{
		Fd:     int32(poller.pipe[0]),
		Events: unix.EPOLLIN,
	}
	errno = unix.EpollCtl(poller.epfd, unix.EPOLL_CTL_ADD, poller.pipe[0], &event)
	if errno != nil {
		return nil, errno
	}

	return poller, nil
}
// wait blocks in epoll_wait until something happens on the inotify
// descriptor or the wake-up pipe.
//
// It returns true when the inotify descriptor reported EPOLLIN, EPOLLERR or
// EPOLLHUP, so the caller should attempt a read. It returns false when only
// the wake-up pipe fired. An error is returned when epoll_wait fails (other
// than with EINTR), when the pipe reports an error, or when draining the pipe
// fails.
func (poller *fdPoller) wait() (bool, error) {
	// 3 possible events per fd, and 2 fds, makes a maximum of 6 events.
	// It is unclear whether epoll_wait reports the number of events returned
	// or the total number ready, so the buffer is one larger than the maximum
	// to catch both.
	const maxEvents = 6
	events := make([]unix.EpollEvent, maxEvents+1)

	for {
		n, errno := unix.EpollWait(poller.epfd, events, -1)
		switch {
		case n == -1 && errno == unix.EINTR:
			// Interrupted by a signal: simply wait again.
			continue
		case n == -1:
			return false, errno
		case n == 0:
			// If there are no events, try again.
			continue
		case n > maxEvents:
			// This should never happen. More events were returned than should be possible.
			return false, errors.New("epoll_wait returned more events than I know what to do with")
		}

		inotifyReady := false
		for _, ev := range events[:n] {
			// On the inotify descriptor, readable data, a pending error and
			// a hang-up are all treated as "go and read": unix.Read will
			// surface whatever is wrong.
			if ev.Fd == int32(poller.fd) &&
				ev.Events&(unix.EPOLLHUP|unix.EPOLLERR|unix.EPOLLIN) != 0 {
				inotifyReady = true
			}

			if ev.Fd == int32(poller.pipe[0]) {
				// EPOLLHUP on the pipe needs no action: the write end was
				// closed by us while shutting down.
				if ev.Events&unix.EPOLLERR != 0 {
					// If an error is waiting on the pipe file descriptor.
					// This is an absolute mystery, and should never ever happen.
					return false, errors.New("Error on the pipe descriptor.")
				}
				if ev.Events&unix.EPOLLIN != 0 {
					// This is a regular wakeup, so we have to clear the buffer.
					if err := poller.clearWake(); err != nil {
						return false, err
					}
				}
			}
		}

		return inotifyReady, nil
	}
}
// wake writes a single byte to the write end of the pipe so that a goroutine
// blocked in wait returns. A full pipe (EAGAIN) is not an error: a wake-up is
// already pending.
func (poller *fdPoller) wake() error {
	var oneByte [1]byte
	n, errno := unix.Write(poller.pipe[1], oneByte[:])
	if n != -1 || errno == unix.EAGAIN {
		// Either the byte was written, or the buffer is full and the poller
		// will wake anyway.
		return nil
	}
	return errno
}
// clearWake drains pending wake-up bytes from the read end of the pipe with a
// single read of up to 100 bytes. An empty pipe (EAGAIN) is not an error.
func (poller *fdPoller) clearWake() error {
	// You have to be woken up a LOT in order to get to 100!
	var scratch [100]byte
	n, errno := unix.Read(poller.pipe[0], scratch[:])
	if n == -1 && errno != unix.EAGAIN {
		return errno
	}
	// Either bytes were consumed, or the buffer was already empty because
	// someone else cleared our wake.
	return nil
}
// close releases every descriptor the poller created: the write end of the
// pipe, the read end of the pipe, then the epoll descriptor. The descriptor
// passed in at construction is left open. Slots still holding -1 are skipped.
func (poller *fdPoller) close() {
	owned := [...]int{poller.pipe[1], poller.pipe[0], poller.epfd}
	for _, fd := range owned {
		if fd == -1 {
			continue
		}
		unix.Close(fd)
	}
}

View File

@ -0,0 +1,521 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build freebsd openbsd netbsd dragonfly darwin
package fsnotify
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"sync"
"time"
"golang.org/x/sys/unix"
)
// Watcher watches a set of files, delivering events to a channel.
type Watcher struct {
// Events delivers file system events to the user.
Events chan Event
// Errors delivers errors from the reader goroutine to the user.
Errors chan error
done chan struct{} // Channel for sending a "quit message" to the reader goroutine
kq int // File descriptor (as returned by the kqueue() syscall).
mu sync.Mutex // Protects access to watcher data
watches map[string]int // Map of watched file descriptors (key: path).
externalWatches map[string]bool // Map of watches added by user of the library.
dirFlags map[string]uint32 // Map of watched directories to fflags used in kqueue.
paths map[int]pathInfo // Map file descriptors to path names for processing kqueue events.
fileExists map[string]bool // Keep track of if we know this file exists (to stop duplicate create events).
isClosed bool // Set to true when Close() is first called
}
// pathInfo is the value stored in Watcher.paths for each watched descriptor.
type pathInfo struct {
name string // Cleaned path that the descriptor was opened for.
isDir bool // Whether the path was a directory when the watch was added.
}
// NewWatcher opens a kqueue, prepares an empty Watcher around it, and starts
// the goroutine that delivers events on the Watcher's channels.
func NewWatcher() (*Watcher, error) {
	queueFd, err := kqueue()
	if err != nil {
		return nil, err
	}

	watcher := new(Watcher)
	watcher.kq = queueFd
	watcher.watches = make(map[string]int)
	watcher.dirFlags = make(map[string]uint32)
	watcher.paths = make(map[int]pathInfo)
	watcher.fileExists = make(map[string]bool)
	watcher.externalWatches = make(map[string]bool)
	watcher.Events = make(chan Event)
	watcher.Errors = make(chan error)
	watcher.done = make(chan struct{})

	go watcher.readEvents()
	return watcher, nil
}
// Close removes all watches and signals the reader goroutine to stop.
// Only the first call does any work; later calls return nil immediately.
func (w *Watcher) Close() error {
	// Mark the watcher closed and snapshot the watched paths in one critical
	// section. Remove takes the lock itself, so it must run after we unlock.
	names, alreadyClosed := func() ([]string, bool) {
		w.mu.Lock()
		defer w.mu.Unlock()

		if w.isClosed {
			return nil, true
		}
		w.isClosed = true

		snapshot := make([]string, 0, len(w.watches))
		for watched := range w.watches {
			snapshot = append(snapshot, watched)
		}
		return snapshot, false
	}()
	if alreadyClosed {
		return nil
	}

	for i := range names {
		w.Remove(names[i])
	}

	// send a "quit" message to the reader goroutine
	close(w.done)

	return nil
}
// Add starts watching the named file or directory (non-recursively).
//
// The path is recorded as a user-requested ("external") watch and then handed
// to addWatch with noteAllEvents. The error from addWatch is returned as is.
func (w *Watcher) Add(name string) error {
	// Clean the name before using it as a key. addWatch and Remove both work
	// with cleaned paths, so recording "./dir" or "dir/" verbatim would leave
	// an entry that Remove's external-watch check can never match.
	name = filepath.Clean(name)

	w.mu.Lock()
	w.externalWatches[name] = true
	w.mu.Unlock()

	_, err := w.addWatch(name, noteAllEvents)
	return err
}
// Remove stops watching the named file or directory (non-recursively).
//
// The kevent registration is deleted, the descriptor is closed, and all
// bookkeeping for the path is dropped, including its "external" marker. When
// the path was a directory, watches on its direct children that were not
// requested by the user are removed as well. It returns an error if the path
// is not watched or if deregistering from kqueue fails.
func (w *Watcher) Remove(name string) error {
	name = filepath.Clean(name)
	w.mu.Lock()
	watchfd, ok := w.watches[name]
	w.mu.Unlock()
	if !ok {
		return fmt.Errorf("can't remove non-existent kevent watch for: %s", name)
	}

	const registerRemove = unix.EV_DELETE
	if err := register(w.kq, []int{watchfd}, registerRemove, 0); err != nil {
		return err
	}

	unix.Close(watchfd)

	w.mu.Lock()
	isDir := w.paths[watchfd].isDir
	delete(w.watches, name)
	delete(w.paths, watchfd)
	delete(w.dirFlags, name)
	// Forget that the user asked for this path. Otherwise the map grows with
	// every Add/Remove cycle, and a stale entry would later make an internal
	// watch on the same path look user-requested and be skipped below.
	delete(w.externalWatches, name)

	// Find all watched paths that are in this directory that are not external.
	var pathsToRemove []string
	if isDir {
		for _, path := range w.paths {
			wdir, _ := filepath.Split(path.name)
			if filepath.Clean(wdir) == name && !w.externalWatches[path.name] {
				pathsToRemove = append(pathsToRemove, path.name)
			}
		}
	}
	w.mu.Unlock()

	// Remove takes the lock itself, so recurse only after unlocking.
	for _, name := range pathsToRemove {
		// Since these are internal, not much sense in propagating error
		// to the user, as that will just confuse them with an error about
		// a path they did not explicitly watch themselves.
		w.Remove(name)
	}

	return nil
}
// noteAllEvents is the kevent fflags mask used for user-requested watches:
// all events except NOTE_EXTEND, NOTE_LINK and NOTE_REVOKE.
const noteAllEvents = unix.NOTE_DELETE | unix.NOTE_WRITE | unix.NOTE_ATTRIB | unix.NOTE_RENAME
// keventWaitTime is how long each read from kevent blocks (100ms).
var keventWaitTime = durationToTimespec(100 * time.Millisecond)
// addWatch adds name to the watched file set.
// The flags are interpreted as described in kevent(2).
// Returns the real path to the file which was added, if any, which may be different from the one passed in the case of symlinks.
//
// Sockets, named pipes and symlinks that cannot be resolved are skipped
// silently: the function returns "" and a nil error for them. It returns an
// error if the watcher is closed, or if Lstat, Open, kevent registration or
// watching the directory contents fails.
func (w *Watcher) addWatch(name string, flags uint32) (string, error) {
var isDir bool
// Make ./name and name equivalent
name = filepath.Clean(name)
w.mu.Lock()
if w.isClosed {
w.mu.Unlock()
return "", errors.New("kevent instance already closed")
}
watchfd, alreadyWatching := w.watches[name]
// We already have a watch, but we can still override flags.
if alreadyWatching {
isDir = w.paths[watchfd].isDir
}
w.mu.Unlock()
// The lock is not held from here on while talking to the file system.
if !alreadyWatching {
fi, err := os.Lstat(name)
if err != nil {
return "", err
}
// Don't watch sockets.
if fi.Mode()&os.ModeSocket == os.ModeSocket {
return "", nil
}
// Don't watch named pipes.
if fi.Mode()&os.ModeNamedPipe == os.ModeNamedPipe {
return "", nil
}
// Follow Symlinks
// Unfortunately, Linux can add bogus symlinks to watch list without
// issue, and Windows can't do symlinks period (AFAIK). To maintain
// consistency, we will act like everything is fine. There will simply
// be no file events for broken symlinks.
// Hence the returns of nil on errors.
if fi.Mode()&os.ModeSymlink == os.ModeSymlink {
name, err = filepath.EvalSymlinks(name)
if err != nil {
return "", nil
}
// The resolved target may already be watched under its real path.
w.mu.Lock()
_, alreadyWatching = w.watches[name]
w.mu.Unlock()
if alreadyWatching {
return name, nil
}
fi, err = os.Lstat(name)
if err != nil {
return "", nil
}
}
watchfd, err = unix.Open(name, openMode, 0700)
if watchfd == -1 {
return "", err
}
isDir = fi.IsDir()
}
const registerAdd = unix.EV_ADD | unix.EV_CLEAR | unix.EV_ENABLE
if err := register(w.kq, []int{watchfd}, registerAdd, flags); err != nil {
unix.Close(watchfd)
return "", err
}
// Record a newly opened descriptor in both lookup maps.
if !alreadyWatching {
w.mu.Lock()
w.watches[name] = watchfd
w.paths[watchfd] = pathInfo{name: name, isDir: isDir}
w.mu.Unlock()
}
if isDir {
// Watch the directory if it has not been watched before,
// or if it was watched before, but perhaps only a NOTE_DELETE (watchDirectoryFiles)
w.mu.Lock()
watchDir := (flags&unix.NOTE_WRITE) == unix.NOTE_WRITE &&
(!alreadyWatching || (w.dirFlags[name]&unix.NOTE_WRITE) != unix.NOTE_WRITE)
// Store flags so this watch can be updated later
w.dirFlags[name] = flags
w.mu.Unlock()
if watchDir {
if err := w.watchDirectoryFiles(name); err != nil {
return "", err
}
}
}
return name, nil
}
// readEvents reads from kqueue and converts the received kevents into
// Event values that it sends down the Events channel.
//
// The loop ends once a receive on w.done succeeds. After that the kqueue
// descriptor is closed and both the Events and Errors channels are closed.
func (w *Watcher) readEvents() {
// Scratch buffer handed to read on every iteration.
eventBuffer := make([]unix.Kevent_t, 10)
loop:
for {
// See if there is a message on the "done" channel
select {
case <-w.done:
break loop
default:
}
// Get new events
kevents, err := read(w.kq, eventBuffer, &keventWaitTime)
// EINTR is okay, the syscall was interrupted before timeout expired.
if err != nil && err != unix.EINTR {
// Report the error, unless the watcher is shutting down.
select {
case w.Errors <- err:
case <-w.done:
break loop
}
continue
}
// Flush the events we received to the Events channel
for len(kevents) > 0 {
kevent := &kevents[0]
watchfd := int(kevent.Ident)
mask := uint32(kevent.Fflags)
// Look up the path that was registered for this descriptor.
w.mu.Lock()
path := w.paths[watchfd]
w.mu.Unlock()
event := newEvent(path.name, mask)
if path.isDir && !(event.Op&Remove == Remove) {
// Double check to make sure the directory exists. This can happen when
// we do a rm -fr on a recursively watched folders and we receive a
// modification event first but the folder has been deleted and later
// receive the delete event
if _, err := os.Lstat(event.Name); os.IsNotExist(err) {
// mark it as a delete event
event.Op |= Remove
}
}
if event.Op&Rename == Rename || event.Op&Remove == Remove {
// The path is gone under this name: drop its watch (the error from
// Remove is ignored) and forget that the file existed.
w.Remove(event.Name)
w.mu.Lock()
delete(w.fileExists, event.Name)
w.mu.Unlock()
}
if path.isDir && event.Op&Write == Write && !(event.Op&Remove == Remove) {
// A write to a still-existing directory is not forwarded as-is; the
// directory is rescanned for new entries instead.
w.sendDirectoryChangeEvents(event.Name)
} else {
// Send the event on the Events channel.
select {
case w.Events <- event:
case <-w.done:
break loop
}
}
if event.Op&Remove == Remove {
// Look for a file that may have overwritten this.
// For example, mv f1 f2 will delete f2, then create f2.
if path.isDir {
fileDir := filepath.Clean(event.Name)
w.mu.Lock()
_, found := w.watches[fileDir]
w.mu.Unlock()
if found {
// make sure the directory exists before we watch for changes. When we
// do a recursive watch and perform rm -fr, the parent directory might
// have gone missing, ignore the missing directory and let the
// upcoming delete event remove the watch from the parent directory.
if _, err := os.Lstat(fileDir); err == nil {
w.sendDirectoryChangeEvents(fileDir)
}
}
} else {
// If something exists at the removed path again, report it as a
// create (the error returned by the helper is ignored).
filePath := filepath.Clean(event.Name)
if fileInfo, err := os.Lstat(filePath); err == nil {
w.sendFileCreatedEventIfNew(filePath, fileInfo)
}
}
}
// Move to next event
kevents = kevents[1:]
}
}
// cleanup
err := unix.Close(w.kq)
if err != nil {
// only way the previous loop breaks is if w.done was closed so we need to async send to w.Errors.
select {
case w.Errors <- err:
default:
}
}
close(w.Events)
close(w.Errors)
}
// newEvent builds a platform-independent Event for name from the kqueue
// fflags in mask. Each recognized NOTE_* flag contributes one Op bit:
// NOTE_DELETE -> Remove, NOTE_WRITE -> Write, NOTE_RENAME -> Rename and
// NOTE_ATTRIB -> Chmod. Unrecognized flags are ignored.
func newEvent(name string, mask uint32) Event {
	// has reports whether every bit of note is set in mask.
	has := func(note uint32) bool { return mask&note == note }

	ev := Event{Name: name}
	if has(unix.NOTE_DELETE) {
		ev.Op |= Remove
	}
	if has(unix.NOTE_WRITE) {
		ev.Op |= Write
	}
	if has(unix.NOTE_RENAME) {
		ev.Op |= Rename
	}
	if has(unix.NOTE_ATTRIB) {
		ev.Op |= Chmod
	}
	return ev
}
// newCreateEvent builds an Event for name whose only operation is Create.
func newCreateEvent(name string) Event {
	ev := Event{Op: Create}
	ev.Name = name
	return ev
}
// watchDirectoryFiles mimics inotify when a watch is added on a directory:
// every entry currently listed in dirPath gets its own internal watch and is
// recorded in w.fileExists. It stops at, and returns, the first error from
// reading the directory or from adding a watch.
func (w *Watcher) watchDirectoryFiles(dirPath string) error {
	entries, err := ioutil.ReadDir(dirPath)
	if err != nil {
		return err
	}

	for i := range entries {
		entry := entries[i]
		// internalWatch returns the name the watch was actually stored under.
		watched, err := w.internalWatch(filepath.Join(dirPath, entry.Name()), entry)
		if err != nil {
			return err
		}

		w.mu.Lock()
		w.fileExists[watched] = true
		w.mu.Unlock()
	}
	return nil
}
// sendDirectoryChangeEvents searches the directory for newly created files
// and sends them over the event channel. This functionality is to have
// the BSD version of fsnotify match Linux inotify which provides a
// create event for files created in a watched directory.
//
// A failure to list the directory is reported on w.Errors. The scan stops
// silently at the first entry for which sendFileCreatedEventIfNew fails.
func (w *Watcher) sendDirectoryChangeEvents(dirPath string) {
	entries, readErr := ioutil.ReadDir(dirPath)
	if readErr != nil {
		select {
		case w.Errors <- readErr:
			// Reported; carry on with whatever listing was returned.
		case <-w.done:
			return
		}
	}

	// Search for new files
	for _, entry := range entries {
		fullPath := filepath.Join(dirPath, entry.Name())
		if w.sendFileCreatedEventIfNew(fullPath, entry) != nil {
			return
		}
	}
}
// sendFileCreatedEventIfNew sends a create event for filePath if the file
// isn't already being tracked in w.fileExists, then starts watching it and
// records it as existing. It returns nil without adding the watch when
// w.done fires while the create event is pending, and otherwise returns any
// error from adding the watch.
func (w *Watcher) sendFileCreatedEventIfNew(filePath string, fileInfo os.FileInfo) (err error) {
	w.mu.Lock()
	_, known := w.fileExists[filePath]
	w.mu.Unlock()

	if !known {
		// Send create event
		select {
		case w.Events <- newCreateEvent(filePath):
		case <-w.done:
			return nil
		}
	}

	// like watchDirectoryFiles (but without doing another ReadDir)
	watched, err := w.internalWatch(filePath, fileInfo)
	if err != nil {
		return err
	}

	w.mu.Lock()
	w.fileExists[watched] = true
	w.mu.Unlock()
	return nil
}
// internalWatch adds the watch used for an entry found inside a watched
// directory and returns the name reported by addWatch.
//
// Regular entries are watched with noteAllEvents to mimic Linux inotify.
// Directories are watched for NOTE_DELETE and NOTE_RENAME, in addition to
// any flags already stored for them in w.dirFlags.
func (w *Watcher) internalWatch(name string, fileInfo os.FileInfo) (string, error) {
	if !fileInfo.IsDir() {
		// watch file to mimic Linux inotify
		return w.addWatch(name, noteAllEvents)
	}

	// mimic Linux providing delete events for subdirectories
	// but preserve the flags used if currently watching subdirectory
	w.mu.Lock()
	existing := w.dirFlags[name]
	w.mu.Unlock()

	return w.addWatch(name, existing|unix.NOTE_DELETE|unix.NOTE_RENAME)
}
// kqueue creates a new kernel event queue and returns a descriptor.
// The error from unix.Kqueue is passed on only when the descriptor is -1;
// otherwise the returned error is nil.
func kqueue() (kq int, err error) {
	fd, kqErr := unix.Kqueue()
	if fd != -1 {
		return fd, nil
	}
	return fd, kqErr
}
// register submits one EVFILT_VNODE change per descriptor in fds to the
// queue kq. flags are the kevent action flags and fflags the filter flags
// applied to every change. The error from unix.Kevent is returned only when
// the call reports -1.
func register(kq int, fds []int, flags int, fflags uint32) error {
	changes := make([]unix.Kevent_t, len(fds))
	for i := range fds {
		change := &changes[i]
		// SetKevent converts int to the platform-specific types:
		unix.SetKevent(change, fds[i], unix.EVFILT_VNODE, flags)
		change.Fflags = fflags
	}

	// register the events
	if n, err := unix.Kevent(kq, changes, nil, nil); n == -1 {
		return err
	}
	return nil
}
// read retrieves pending events, or waits until an event occurs.
// A timeout of nil blocks indefinitely, while 0 polls the queue.
// On success it returns the filled prefix of events; on failure it returns
// a nil slice and the error from unix.Kevent.
func read(kq int, events []unix.Kevent_t, timeout *unix.Timespec) ([]unix.Kevent_t, error) {
	count, err := unix.Kevent(kq, nil, events, timeout)
	if err == nil {
		return events[:count], nil
	}
	return nil, err
}
// durationToTimespec converts d into the unix.Timespec form used as a
// kevent timeout.
func durationToTimespec(d time.Duration) unix.Timespec {
	nsec := d.Nanoseconds()
	return unix.NsecToTimespec(nsec)
}

View File

@ -0,0 +1,11 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build freebsd openbsd netbsd dragonfly
package fsnotify
import "golang.org/x/sys/unix"
// openMode holds the open flags used on the platforms named in this file's
// build tag: read-only and non-blocking.
// NOTE(review): presumably this is the openMode that the kqueue watcher
// passes to unix.Open when it opens a path to watch — confirm.
const openMode = unix.O_NONBLOCK | unix.O_RDONLY

View File

@ -0,0 +1,12 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin
package fsnotify
import "golang.org/x/sys/unix"
// openMode holds the open flags used on darwin.
// NOTE(review): presumably this is the openMode that the kqueue watcher
// passes to unix.Open when it opens a path to watch — confirm.
// NOTE(review): O_EVTONLY presumably requests a descriptor for event
// notification only — confirm against the macOS open(2) manual.
// note: this constant is not defined on BSD
const openMode = unix.O_EVTONLY

View File

@ -0,0 +1,561 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build windows
package fsnotify
import (
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"sync"
"syscall"
"unsafe"
)
// Watcher watches a set of files, delivering events to a channel.
//
// All watch bookkeeping is performed by the reader goroutine started in
// NewWatcher; Add and Remove hand their requests to it through input.
type Watcher struct {
// Events carries the events produced by the reader goroutine. It is
// closed by the reader when the watcher shuts down.
Events chan Event
// Errors carries errors encountered by the reader goroutine. It is
// closed by the reader when the watcher shuts down.
Errors chan error
isClosed bool // Set to true when Close() is first called
mu sync.Mutex // Map access
port syscall.Handle // Handle to completion port
watches watchMap // Map of watches (key: i-number)
input chan *input // Inputs to the reader are sent on this channel
// quit carries the reply channel on which the reader reports the result
// of shutting down.
quit chan chan<- error
}
// NewWatcher establishes a new watcher with the underlying OS and begins
// waiting for events. It creates the I/O completion port, allocates the
// watcher's channels and starts the reader goroutine.
func NewWatcher() (*Watcher, error) {
	port, err := syscall.CreateIoCompletionPort(syscall.InvalidHandle, 0, 0, 0)
	if err != nil {
		return nil, os.NewSyscallError("CreateIoCompletionPort", err)
	}

	w := new(Watcher)
	w.port = port
	w.watches = make(watchMap)
	w.input = make(chan *input, 1)
	w.Events = make(chan Event, 50)
	w.Errors = make(chan error)
	w.quit = make(chan chan<- error, 1)

	go w.readEvents()
	return w, nil
}
// Close removes all watches and closes the events channel.
//
// Only the first call does any work; later calls return nil. The closed
// flag is tested and set under w.mu so that concurrent callers of Close,
// Add and Remove agree on whether the watcher is still usable.
func (w *Watcher) Close() error {
	w.mu.Lock()
	if w.isClosed {
		w.mu.Unlock()
		return nil
	}
	w.isClosed = true
	w.mu.Unlock()

	// Send "quit" message to the reader goroutine
	ch := make(chan error)
	w.quit <- ch
	if err := w.wakeupReader(); err != nil {
		return err
	}
	return <-ch
}

// Add starts watching the named file or directory (non-recursively).
//
// It returns an error if the watcher has already been closed. Otherwise the
// request is handed to the reader goroutine and its result is returned.
func (w *Watcher) Add(name string) error {
	w.mu.Lock()
	closed := w.isClosed
	w.mu.Unlock()
	if closed {
		return errors.New("watcher already closed")
	}

	in := &input{
		op:    opAddWatch,
		path:  filepath.Clean(name),
		flags: sysFSALLEVENTS,
		reply: make(chan error),
	}
	w.input <- in
	if err := w.wakeupReader(); err != nil {
		return err
	}
	return <-in.reply
}

// Remove stops watching the named file or directory (non-recursively).
//
// It returns an error if the watcher has already been closed: the reader
// goroutine has exited by then, so a queued request would never be answered
// and the caller would block forever.
func (w *Watcher) Remove(name string) error {
	w.mu.Lock()
	closed := w.isClosed
	w.mu.Unlock()
	if closed {
		return errors.New("watcher already closed")
	}

	in := &input{
		op:    opRemoveWatch,
		path:  filepath.Clean(name),
		reply: make(chan error),
	}
	w.input <- in
	if err := w.wakeupReader(); err != nil {
		return err
	}
	return <-in.reply
}
// Internal notify flags. They are used in this file both as the masks
// stored on a watch and as the mask passed to newEvent.
// NOTE(review): the names and values look like the Linux inotify IN_*
// constants — confirm before relying on that correspondence.
const (
// Options for AddWatch
sysFSONESHOT = 0x80000000
sysFSONLYDIR = 0x1000000
// Events
sysFSACCESS = 0x1
// sysFSALLEVENTS sets the low twelve bits; Add uses it as the flag set
// for every new watch.
sysFSALLEVENTS = 0xfff
sysFSATTRIB = 0x4
sysFSCLOSE = 0x18
sysFSCREATE = 0x100
sysFSDELETE = 0x200
sysFSDELETESELF = 0x400
sysFSMODIFY = 0x2
// sysFSMOVE is sysFSMOVEDFROM | sysFSMOVEDTO.
sysFSMOVE = 0xc0
sysFSMOVEDFROM = 0x40
sysFSMOVEDTO = 0x80
sysFSMOVESELF = 0x800
// Special events
sysFSIGNORED = 0x8000
sysFSQOVERFLOW = 0x4000
)
// newEvent builds the Event for name from the sysFS* bits in mask:
//
//	sysFSCREATE or sysFSMOVEDTO                     -> Create
//	sysFSDELETE or sysFSDELETESELF                  -> Remove
//	sysFSMODIFY                                     -> Write
//	sysFSMOVE, sysFSMOVESELF or sysFSMOVEDFROM      -> Rename
//	sysFSATTRIB                                     -> Chmod
//
// A flag counts as present only when all of its bits are set in mask.
func newEvent(name string, mask uint32) Event {
	// has reports whether every bit of bits is set in mask.
	has := func(bits uint32) bool { return mask&bits == bits }

	ev := Event{Name: name}
	if has(sysFSCREATE) || has(sysFSMOVEDTO) {
		ev.Op |= Create
	}
	if has(sysFSDELETE) || has(sysFSDELETESELF) {
		ev.Op |= Remove
	}
	if has(sysFSMODIFY) {
		ev.Op |= Write
	}
	if has(sysFSMOVE) || has(sysFSMOVESELF) || has(sysFSMOVEDFROM) {
		ev.Op |= Rename
	}
	if has(sysFSATTRIB) {
		ev.Op |= Chmod
	}
	return ev
}
// Operations carried by an input request to the reader goroutine.
const (
opAddWatch = iota
opRemoveWatch
)
// provisional is bit 32 of a watch mask, above the 32-bit range passed to
// newEvent. addWatch sets it on the flags of a newly created watch and
// clears it once startRead has succeeded; deleteWatch and startRead skip
// sending events for masks that still carry it.
const (
provisional uint64 = 1 << (32 + iota)
)
// input is a request handed to the reader goroutine over Watcher.input.
type input struct {
op int // opAddWatch or opRemoveWatch
path string // Cleaned path the request applies to
flags uint32 // Notify flags for opAddWatch
reply chan error // Receives the result of the operation
}
// inode identifies an opened directory: volume is its volume serial number
// and index its 64-bit file index, both as reported by
// GetFileInformationByHandle in getIno.
type inode struct {
handle syscall.Handle
volume uint32
index uint64
}
// watch is the state kept for one watched directory.
//
// ov must stay the first field: readEvents converts the *syscall.Overlapped
// returned by the completion port straight back into a *watch.
type watch struct {
ov syscall.Overlapped
ino *inode // i-number
path string // Directory path
mask uint64 // Directory itself is being watched with these notify flags
names map[string]uint64 // Map of names being watched and their notify flags
rename string // Remembers the old name while renaming a file
buf [4096]byte // Buffer handed to ReadDirectoryChanges
}
// indexMap maps an inode index to its watch.
type indexMap map[uint64]*watch
// watchMap maps a volume serial number to the watches on that volume.
type watchMap map[uint32]indexMap
// wakeupReader posts an empty completion packet (nil overlapped pointer) to
// the watcher's completion port. readEvents treats such a packet as a
// signal to check its quit and input channels.
func (w *Watcher) wakeupReader() error {
	if err := syscall.PostQueuedCompletionStatus(w.port, 0, 0, nil); err != nil {
		return os.NewSyscallError("PostQueuedCompletionStatus", err)
	}
	return nil
}
func getDir(pathname string) (dir string, err error) {
attr, e := syscall.GetFileAttributes(syscall.StringToUTF16Ptr(pathname))
if e != nil {
return "", os.NewSyscallError("GetFileAttributes", e)
}
if attr&syscall.FILE_ATTRIBUTE_DIRECTORY != 0 {
dir = pathname
} else {
dir, _ = filepath.Split(pathname)
dir = filepath.Clean(dir)
}
return
}
// getIno opens the directory at path and returns its identity: the open
// handle, the volume serial number and the 64-bit file index.
//
// The caller owns the returned handle and must close it when it is not
// stored in a watch. An error is returned when path cannot be converted to
// UTF-16 (for example because it contains a NUL byte), when the directory
// cannot be opened, or when its information cannot be queried; in the last
// case the handle is closed before returning.
func getIno(path string) (ino *inode, err error) {
	// UTF16PtrFromString reports an invalid path as an error, whereas the
	// deprecated StringToUTF16Ptr panics on it.
	p, e := syscall.UTF16PtrFromString(path)
	if e != nil {
		return nil, os.NewSyscallError("UTF16PtrFromString", e)
	}
	h, e := syscall.CreateFile(p,
		syscall.FILE_LIST_DIRECTORY,
		syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
		nil, syscall.OPEN_EXISTING,
		syscall.FILE_FLAG_BACKUP_SEMANTICS|syscall.FILE_FLAG_OVERLAPPED, 0)
	if e != nil {
		return nil, os.NewSyscallError("CreateFile", e)
	}
	var fi syscall.ByHandleFileInformation
	if e = syscall.GetFileInformationByHandle(h, &fi); e != nil {
		syscall.CloseHandle(h)
		return nil, os.NewSyscallError("GetFileInformationByHandle", e)
	}
	ino = &inode{
		handle: h,
		volume: fi.VolumeSerialNumber,
		// Combine the two 32-bit halves into one 64-bit file index.
		index: uint64(fi.FileIndexHigh)<<32 | uint64(fi.FileIndexLow),
	}
	return ino, nil
}
// get returns the watch stored for ino's volume and index, or nil when
// there is none.
//
// Must run within the I/O thread.
func (m watchMap) get(ino *inode) *watch {
	// Indexing a missing (nil) per-volume map yields the zero value, nil.
	return m[ino.volume][ino.index]
}
// set stores watch under ino's volume and index, creating the per-volume
// map on first use.
//
// Must run within the I/O thread.
func (m watchMap) set(ino *inode, watch *watch) {
	if m[ino.volume] == nil {
		m[ino.volume] = make(indexMap)
	}
	m[ino.volume][ino.index] = watch
}
// addWatch registers pathname with the given notify flags.
//
// The watch is always placed on a directory: pathname itself when it is a
// directory, otherwise the directory returned by getDir. Flags for the
// directory are accumulated in the watch's mask; flags for a file are
// accumulated in the watch's names map under the file's base name.
//
// Must run within the I/O thread.
func (w *Watcher) addWatch(pathname string, flags uint64) error {
dir, err := getDir(pathname)
if err != nil {
return err
}
// A directory-only watch was requested for something that is not a
// directory: nothing to do.
if flags&sysFSONLYDIR != 0 && pathname != dir {
return nil
}
ino, err := getIno(dir)
if err != nil {
return err
}
w.mu.Lock()
watchEntry := w.watches.get(ino)
w.mu.Unlock()
if watchEntry == nil {
// First watch on this directory: attach the freshly opened handle to
// the completion port and record a new entry that owns the handle.
if _, e := syscall.CreateIoCompletionPort(ino.handle, w.port, 0, 0); e != nil {
syscall.CloseHandle(ino.handle)
return os.NewSyscallError("CreateIoCompletionPort", e)
}
watchEntry = &watch{
ino: ino,
path: dir,
names: make(map[string]uint64),
}
w.mu.Lock()
w.watches.set(ino, watchEntry)
w.mu.Unlock()
// Mark the flags as provisional until the first read has been started.
flags |= provisional
} else {
// The existing entry already owns a handle; release the one getIno
// just opened.
syscall.CloseHandle(ino.handle)
}
if pathname == dir {
watchEntry.mask |= flags
} else {
watchEntry.names[filepath.Base(pathname)] |= flags
}
if err = w.startRead(watchEntry); err != nil {
return err
}
// The read was started successfully, so the watch is no longer provisional.
if pathname == dir {
watchEntry.mask &= ^provisional
} else {
watchEntry.names[filepath.Base(pathname)] &= ^provisional
}
return nil
}
// remWatch removes the watch on pathname.
//
// For a directory the watch's own mask is cleared; for a file its entry is
// removed from the watch's names map. The read on the directory is then
// restarted with the remaining flags via startRead. An error is returned
// when pathname cannot be resolved or is not being watched.
//
// Must run within the I/O thread.
func (w *Watcher) remWatch(pathname string) error {
	dir, err := getDir(pathname)
	if err != nil {
		return err
	}
	ino, err := getIno(dir)
	if err != nil {
		return err
	}

	w.mu.Lock()
	watch := w.watches.get(ino)
	w.mu.Unlock()

	// getIno opened a new handle only to identify the directory. A
	// registered watch keeps its own handle, so this one must be released
	// here or it leaks on every call. The close error is ignored, as in
	// addWatch.
	syscall.CloseHandle(ino.handle)

	if watch == nil {
		return fmt.Errorf("can't remove non-existent watch for: %s", pathname)
	}
	if pathname == dir {
		w.sendEvent(watch.path, watch.mask&sysFSIGNORED)
		watch.mask = 0
	} else {
		name := filepath.Base(pathname)
		w.sendEvent(filepath.Join(watch.path, name), watch.names[name]&sysFSIGNORED)
		delete(watch.names, name)
	}
	return w.startRead(watch)
}
// deleteWatch clears every flag held by watch: all entries are removed from
// its names map and its directory mask is reset to zero. For each mask that
// is not provisional, the sysFSIGNORED part of the mask is passed to
// sendEvent first.
//
// Must run within the I/O thread.
func (w *Watcher) deleteWatch(watch *watch) {
	for name := range watch.names {
		if flags := watch.names[name]; flags&provisional == 0 {
			w.sendEvent(filepath.Join(watch.path, name), flags&sysFSIGNORED)
		}
		delete(watch.names, name)
	}

	if watch.mask == 0 {
		return
	}
	if watch.mask&provisional == 0 {
		w.sendEvent(watch.path, watch.mask&sysFSIGNORED)
	}
	watch.mask = 0
}
// startRead cancels any pending I/O on the watch's handle and issues a new
// ReadDirectoryChanges request for the union of the directory mask and all
// per-name masks. When that union is empty, the handle is closed and the
// watch is removed from w.watches instead.
//
// Must run within the I/O thread.
func (w *Watcher) startRead(watch *watch) error {
if e := syscall.CancelIo(watch.ino.handle); e != nil {
w.Errors <- os.NewSyscallError("CancelIo", e)
// Clearing all flags makes the mask computed below zero, so the watch
// is torn down.
w.deleteWatch(watch)
}
mask := toWindowsFlags(watch.mask)
for _, m := range watch.names {
mask |= toWindowsFlags(m)
}
if mask == 0 {
// Nothing left to watch on this directory: release the handle and
// forget the watch.
if e := syscall.CloseHandle(watch.ino.handle); e != nil {
w.Errors <- os.NewSyscallError("CloseHandle", e)
}
w.mu.Lock()
delete(w.watches[watch.ino.volume], watch.ino.index)
w.mu.Unlock()
return nil
}
e := syscall.ReadDirectoryChanges(watch.ino.handle, &watch.buf[0],
uint32(unsafe.Sizeof(watch.buf)), false, mask, nil, &watch.ov, 0)
if e != nil {
err := os.NewSyscallError("ReadDirectoryChanges", e)
if e == syscall.ERROR_ACCESS_DENIED && watch.mask&provisional == 0 {
// Watched directory was probably removed
if w.sendEvent(watch.path, watch.mask&sysFSDELETESELF) {
if watch.mask&sysFSONESHOT != 0 {
watch.mask = 0
}
}
// Treated as a removal, not as a failure.
err = nil
}
// Clear all flags and call startRead again; with an empty mask the
// nested call takes the teardown branch above. Its result is ignored.
w.deleteWatch(watch)
w.startRead(watch)
return err
}
return nil
}
// readEvents reads from the I/O completion port, converts the
// received events into Event objects and sends them via the Events channel.
// Entry point to the I/O thread.
//
// A completion packet with a nil overlapped pointer is a wake-up posted by
// wakeupReader: it makes the loop service a pending quit or input request.
// Any other packet belongs to a watch whose buffer is then decoded.
func (w *Watcher) readEvents() {
var (
n, key uint32
ov *syscall.Overlapped
)
runtime.LockOSThread()
for {
e := syscall.GetQueuedCompletionStatus(w.port, &n, &key, &ov, syscall.INFINITE)
// ov is the first field of watch, so the pointer can be converted back.
watch := (*watch)(unsafe.Pointer(ov))
if watch == nil {
select {
case ch := <-w.quit:
// Shut down: snapshot the per-volume maps under the lock, then
// tear down every watch outside of it.
w.mu.Lock()
var indexes []indexMap
for _, index := range w.watches {
indexes = append(indexes, index)
}
w.mu.Unlock()
for _, index := range indexes {
for _, watch := range index {
w.deleteWatch(watch)
w.startRead(watch)
}
}
var err error
if e := syscall.CloseHandle(w.port); e != nil {
err = os.NewSyscallError("CloseHandle", e)
}
close(w.Events)
close(w.Errors)
// Report the result of closing the port to the Close caller.
ch <- err
return
case in := <-w.input:
switch in.op {
case opAddWatch:
in.reply <- w.addWatch(in.path, uint64(in.flags))
case opRemoveWatch:
in.reply <- w.remWatch(in.path)
}
default:
}
continue
}
switch e {
case syscall.ERROR_MORE_DATA:
// NOTE(review): watch cannot be nil here; the nil case is fully
// handled above, so the first branch is unreachable.
if watch == nil {
w.Errors <- errors.New("ERROR_MORE_DATA has unexpectedly null lpOverlapped buffer")
} else {
// The i/o succeeded but the buffer is full.
// In theory we should be building up a full packet.
// In practice we can get away with just carrying on.
n = uint32(unsafe.Sizeof(watch.buf))
}
case syscall.ERROR_ACCESS_DENIED:
// Watched directory was probably removed
w.sendEvent(watch.path, watch.mask&sysFSDELETESELF)
w.deleteWatch(watch)
w.startRead(watch)
continue
case syscall.ERROR_OPERATION_ABORTED:
// CancelIo was called on this handle
continue
default:
// NOTE(review): the failing call is GetQueuedCompletionStatus; the
// label in this error names a different function.
w.Errors <- os.NewSyscallError("GetQueuedCompletionPort", e)
continue
case nil:
}
// Walk the notification records stored in watch.buf.
var offset uint32
for {
if n == 0 {
// No data was transferred. The sysFSQOVERFLOW mask matches none of
// the flags checked by newEvent, so the event carries no Op bits.
w.Events <- newEvent("", sysFSQOVERFLOW)
w.Errors <- errors.New("short read in readEvents()")
break
}
// Point "raw" to the event in the buffer
raw := (*syscall.FileNotifyInformation)(unsafe.Pointer(&watch.buf[offset]))
buf := (*[syscall.MAX_PATH]uint16)(unsafe.Pointer(&raw.FileName))
// The length is halved to index UTF-16 code units.
name := syscall.UTF16ToString(buf[:raw.FileNameLength/2])
fullname := filepath.Join(watch.path, name)
// mask is the flag reported for a watch registered on this name.
var mask uint64
switch raw.Action {
case syscall.FILE_ACTION_REMOVED:
mask = sysFSDELETESELF
case syscall.FILE_ACTION_MODIFIED:
mask = sysFSMODIFY
case syscall.FILE_ACTION_RENAMED_OLD_NAME:
watch.rename = name
case syscall.FILE_ACTION_RENAMED_NEW_NAME:
// Carry a per-name watch over from the old name to the new one.
if watch.names[watch.rename] != 0 {
watch.names[name] |= watch.names[watch.rename]
delete(watch.names, watch.rename)
mask = sysFSMOVESELF
}
}
// sendNameEvent reports the event for the per-name watch; it reads
// fullname at call time, so the reassignment below affects it.
sendNameEvent := func() {
if w.sendEvent(fullname, watch.names[name]&mask) {
if watch.names[name]&sysFSONESHOT != 0 {
delete(watch.names, name)
}
}
}
if raw.Action != syscall.FILE_ACTION_RENAMED_NEW_NAME {
sendNameEvent()
}
if raw.Action == syscall.FILE_ACTION_REMOVED {
w.sendEvent(fullname, watch.names[name]&sysFSIGNORED)
delete(watch.names, name)
}
// Report the event for the watch on the directory itself.
if w.sendEvent(fullname, watch.mask&toFSnotifyFlags(raw.Action)) {
if watch.mask&sysFSONESHOT != 0 {
watch.mask = 0
}
}
if raw.Action == syscall.FILE_ACTION_RENAMED_NEW_NAME {
// For the new-name half of a rename, the per-name event is sent
// last and under the old name.
fullname = filepath.Join(watch.path, watch.rename)
sendNameEvent()
}
// Move to the next event in the buffer
if raw.NextEntryOffset == 0 {
break
}
offset += raw.NextEntryOffset
// Error!
if offset >= n {
w.Errors <- errors.New("Windows system assumed buffer larger than it is, events have likely been missed.")
break
}
}
// Queue the next read for this directory.
if err := w.startRead(watch); err != nil {
w.Errors <- err
}
}
}
// sendEvent delivers the event described by name and mask on w.Events and
// reports whether there was anything to deliver. A zero mask yields false
// without sending. If a quit request is pending instead, it is put back on
// w.quit for readEvents and the event is dropped; the result is still true.
func (w *Watcher) sendEvent(name string, mask uint64) bool {
	if mask != 0 {
		ev := newEvent(name, uint32(mask))
		select {
		case pending := <-w.quit:
			w.quit <- pending
		case w.Events <- ev:
		}
		return true
	}
	return false
}
// toWindowsFlags translates the sysFS* bits in mask into the
// FILE_NOTIFY_CHANGE_* filter passed to ReadDirectoryChanges. A row applies
// when any of its sysFS bits is set in mask.
func toWindowsFlags(mask uint64) uint32 {
	translations := [...]struct {
		fsBits  uint64
		winBits uint32
	}{
		{sysFSACCESS, syscall.FILE_NOTIFY_CHANGE_LAST_ACCESS},
		{sysFSMODIFY, syscall.FILE_NOTIFY_CHANGE_LAST_WRITE},
		{sysFSATTRIB, syscall.FILE_NOTIFY_CHANGE_ATTRIBUTES},
		{
			sysFSMOVE | sysFSCREATE | sysFSDELETE,
			syscall.FILE_NOTIFY_CHANGE_FILE_NAME | syscall.FILE_NOTIFY_CHANGE_DIR_NAME,
		},
	}

	var filter uint32
	for _, t := range translations {
		if mask&t.fsBits != 0 {
			filter |= t.winBits
		}
	}
	return filter
}
// toFSnotifyFlags maps a FILE_ACTION_* value from a directory change record
// to the matching sysFS* flag. Unknown actions map to 0.
func toFSnotifyFlags(action uint32) uint64 {
	if action == syscall.FILE_ACTION_ADDED {
		return sysFSCREATE
	}
	if action == syscall.FILE_ACTION_REMOVED {
		return sysFSDELETE
	}
	if action == syscall.FILE_ACTION_MODIFIED {
		return sysFSMODIFY
	}
	if action == syscall.FILE_ACTION_RENAMED_OLD_NAME {
		return sysFSMOVEDFROM
	}
	if action == syscall.FILE_ACTION_RENAMED_NEW_NAME {
		return sysFSMOVEDTO
	}
	return 0
}

Some files were not shown because too many files have changed in this diff Show More