Skip to content

Commit 63e9caf

Browse files
Wrap compilation in a cross-os-process lock (#13860)
1 parent fb31253 commit 63e9caf

File tree

9 files changed

+555
-189
lines changed

9 files changed

+555
-189
lines changed

lib/mix/lib/mix/lock.ex

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
defmodule Mix.Lock do
2+
@moduledoc false
3+
4+
# Lock implementation working across multiple OS processes.
5+
#
6+
# The lock is implemented using TCP sockets and hard links.
7+
#
8+
# A process holds the lock if it owns a TCP socket, whose port is
9+
# written in the lock_0 file. We need to create such lock files
10+
# atomically, so the process first writes its port to a port_P
11+
# file and then attempts to create a hard link to it at lock_0.
12+
#
13+
# An inherent problem with lock files is that the lock owner may
14+
# terminate abruptly, leaving a "stale" file. Other processes can
15+
# detect a stale file by reading the port written in that file,
16+
# trying to connect to that port and failing. In order for another
17+
# process to link to the same path, the file needs to be replaced.
18+
# However, we need to guarantee that only a single process can
19+
# remove or replace the file, otherwise a concurrent process may
20+
# end up removing a newly linked file.
21+
#
22+
# To address this problem we employ a chained locking procedure.
23+
# Specifically, we attempt to link our port to lock_0, if that
24+
# fails, we try to connect to the lock_0 port. If we manage to
25+
# connect, it means the lock is taken, so we wait for it to close
26+
# and start over. If we fail to connect, it means the lock is stale,
27+
# so we want to replace it. In order to do that, we try to obtain
28+
# lock_1. Again, we try to link and connect. Eventually, we should
29+
# successfully link to lock_N. At that point we can clean up all
30+
# the files, so we perform these steps:
31+
#
32+
# * move our port_P to lock_0
33+
# * remove all the other port_P files
34+
# * remove all lock_1+ files
35+
#
36+
# It is important to perform these steps in this order, to avoid
37+
# race conditions. By moving to lock_0, we make sure that all new
38+
# processes trying to lock will connect to our port. By removing
39+
# all port_P files we make sure that currently paused processes
40+
# that are about to link port_P at lock_N will fail to link, since
41+
# the port_P file will no longer exist (once lock_N is removed).
42+
#
43+
# Finally, note that we do not remove the lock file in `unlock/1`.
44+
# If we did that, another process could try to connect and fail
45+
# because the file would not exist, in such case the process would
46+
# assume the file is stale and needs to be replaced, therefore
47+
# possibly replacing another process who successfully links at the
48+
# empty spot. This means we effectively always leave a stale file,
49+
# however, in order to shortcut the port check for future processes,
50+
# we atomically replace the file content with port 0, to indicate
51+
# the file is stale.
52+
#
53+
# The main caveat of using ephemeral TCP ports is that they are not
54+
# unique. This creates a theoretical scenario where the lock holder
55+
# terminates abruptly and leaves its port in lock_0, then the port
56+
# is assigned to a unrelated process (unaware of the locking). To
57+
# handle this scenario, when we connect to a lock_N port, we expect
58+
# it to immediately send us `@probe_data`. If this does not happen
59+
# within `@probe_timeout_ms`, we assume the port is taken by an
60+
# unrelated process and the lock file is stale. Note that it is ok
61+
# to use a long timeout, because this scenario is very unlikely.
62+
# Theoretically, if an actual lock owner is not able to send the
63+
# probe data within the timeout, the lock will fail, however with
64+
# a high enough timeout, this should not be a problem in practice.
65+
66+
@loopback {127, 0, 0, 1}
67+
@listen_opts [:binary, ip: @loopback, packet: :raw, nodelay: true, backlog: 128, active: false]
68+
@connect_opts [:binary, packet: :raw, nodelay: true, active: false]
69+
@probe_data "elixirlock"
70+
@probe_data_size byte_size(@probe_data)
71+
@probe_timeout_ms 5_000
72+
73+
@doc """
74+
Acquires a lock identified by the given key.
75+
76+
This function blocks until the lock is acquired by this process,
77+
and then executes `fun`, returning its return value.
78+
79+
This function can also be called if this process already has the
80+
lock. In such case the function is executed immediately.
81+
82+
## Options
83+
84+
* `:on_taken` - a one-arity function called if the lock is held
85+
by a different process. The operating system PID of that process
86+
is given as the first argument (as a string). This function may
87+
be called multiple times, if the lock owner changes, until it
88+
is successfully acquired by this process.
89+
90+
"""
91+
@spec with_lock(iodata(), (-> term()), keyword()) :: term()
92+
def with_lock(key, fun, opts \\ []) do
93+
opts = Keyword.validate!(opts, [:on_taken])
94+
95+
key = key |> :erlang.md5() |> Base.url_encode64(padding: false)
96+
path = Path.join([System.tmp_dir!(), "mix_lock", key])
97+
98+
pdict_key = {__MODULE__, path}
99+
has_lock? = Process.get(pdict_key)
100+
101+
if has_lock? do
102+
fun.()
103+
else
104+
lock = lock(path, opts[:on_taken])
105+
Process.put(pdict_key, true)
106+
107+
try do
108+
fun.()
109+
after
110+
# Unlocking will always close the socket, but it may raise,
111+
# so we remove key from the dictionary first
112+
Process.delete(pdict_key)
113+
unlock(lock)
114+
end
115+
end
116+
end
117+
118+
defp lock(path, on_taken) do
119+
File.mkdir_p!(path)
120+
121+
case listen() do
122+
{:ok, socket, port} ->
123+
spawn_link(fn -> accept_loop(socket) end)
124+
125+
try do
126+
try_lock(path, socket, port, on_taken)
127+
rescue
128+
exception ->
129+
# Close the socket to make sure we don't block the lock
130+
:gen_tcp.close(socket)
131+
reraise exception, __STACKTRACE__
132+
end
133+
134+
{:error, reason} ->
135+
Mix.raise(
136+
"failed to open a TCP socket while acquiring a lock, reason: #{inspect(reason)}"
137+
)
138+
end
139+
end
140+
141+
defp listen do
142+
with {:ok, socket} <- :gen_tcp.listen(0, @listen_opts) do
143+
case :inet.port(socket) do
144+
{:ok, port} ->
145+
{:ok, socket, port}
146+
147+
{:error, reason} ->
148+
:gen_tcp.close(socket)
149+
{:error, reason}
150+
end
151+
end
152+
end
153+
154+
defp try_lock(path, socket, port, on_taken) do
155+
port_path = Path.join(path, "port_#{port}")
156+
os_pid = System.pid()
157+
158+
File.write!(port_path, <<port::unsigned-integer-32, os_pid::binary>>, [:raw])
159+
160+
case grab_lock(path, port_path, 0) do
161+
{:ok, 0} ->
162+
# We grabbed lock_0, so all good
163+
%{socket: socket, path: path}
164+
165+
{:ok, _n} ->
166+
# We grabbed lock_1+, so we need to replace lock_0 and clean up
167+
take_over(path, port_path)
168+
%{socket: socket, path: path}
169+
170+
{:taken, probe_socket, os_pid} ->
171+
# Another process has the lock, wait for close and start over
172+
if on_taken, do: on_taken.(os_pid)
173+
await_close(probe_socket)
174+
try_lock(path, socket, port, on_taken)
175+
176+
:invalidated ->
177+
try_lock(path, socket, port, on_taken)
178+
end
179+
end
180+
181+
defp grab_lock(path, port_path, n) do
182+
lock_path = Path.join(path, "lock_#{n}")
183+
184+
case File.ln(port_path, lock_path) do
185+
:ok ->
186+
{:ok, n}
187+
188+
{:error, :eexist} ->
189+
case probe(lock_path) do
190+
{:ok, probe_socket, os_pid} ->
191+
{:taken, probe_socket, os_pid}
192+
193+
{:error, _reason} ->
194+
grab_lock(path, port_path, n + 1)
195+
end
196+
197+
{:error, :enoent} ->
198+
:invalidated
199+
200+
{:error, reason} ->
201+
raise File.LinkError,
202+
reason: reason,
203+
action: "create hard link",
204+
existing: port_path,
205+
new: lock_path
206+
end
207+
end
208+
209+
defp accept_loop(listen_socket) do
210+
case :gen_tcp.accept(listen_socket) do
211+
{:ok, socket} ->
212+
_ = :gen_tcp.send(socket, @probe_data)
213+
accept_loop(listen_socket)
214+
215+
# eintr is "Interrupted system call".
216+
{:error, :eintr} ->
217+
accept_loop(listen_socket)
218+
219+
{:error, reason} when reason in [:closed, :einval] ->
220+
:ok
221+
end
222+
end
223+
224+
defp probe(port_path) do
225+
with {:ok, port, os_pid} <- fetch_probe_port(port_path),
226+
{:ok, socket} <- connect(port),
227+
{:ok, socket} <- await_probe_data(socket) do
228+
{:ok, socket, os_pid}
229+
end
230+
end
231+
232+
defp fetch_probe_port(port_path) do
233+
case File.read(port_path) do
234+
{:ok, <<0::unsigned-integer-32>>} -> {:error, :ignore}
235+
{:ok, <<port::unsigned-integer-32, os_pid::binary>>} -> {:ok, port, os_pid}
236+
{:error, reason} -> {:error, reason}
237+
end
238+
end
239+
240+
defp connect(port) do
241+
# On Windows connecting to an unbound port takes a few seconds to
242+
# fail, so instead we shortcut the check by attempting a listen,
243+
# which succeeds or fails immediately. Note that `reuseaddr` here
244+
# ensures that if the listening socket closed recently, we can
245+
# immediately reclaim the same port.
246+
case :gen_tcp.listen(port, [reuseaddr: true] ++ @listen_opts) do
247+
{:ok, socket} ->
248+
:gen_tcp.close(socket)
249+
# The port is free, so connecting would fail
250+
{:error, :econnrefused}
251+
252+
{:error, _reason} ->
253+
:gen_tcp.connect(@loopback, port, @connect_opts)
254+
end
255+
end
256+
257+
defp await_probe_data(socket) do
258+
case :gen_tcp.recv(socket, @probe_data_size, @probe_timeout_ms) do
259+
{:ok, @probe_data} ->
260+
{:ok, socket}
261+
262+
{:ok, _data} ->
263+
:gen_tcp.close(socket)
264+
{:error, :unexpected_port_owner}
265+
266+
{:error, :eintr} ->
267+
await_probe_data(socket)
268+
269+
{:error, reason} ->
270+
:gen_tcp.close(socket)
271+
{:error, reason}
272+
end
273+
end
274+
275+
defp take_over(path, port_path) do
276+
# The operations here must happen in precise order, so if anything
277+
# fails, we keep the files as is and the next process that grabs
278+
# the lock will do the cleanup
279+
280+
lock_path = Path.join(path, "lock_0")
281+
282+
# We linked to lock_N successfully, so port_path should exist
283+
File.rename!(port_path, lock_path)
284+
285+
names = File.ls!(path)
286+
287+
for "port_" <> _ = name <- names do
288+
File.rm!(Path.join(path, name))
289+
end
290+
291+
for "lock_" <> _ = name <- names, name != "lock_0" do
292+
File.rm!(Path.join(path, name))
293+
end
294+
end
295+
296+
defp await_close(socket) do
297+
case :gen_tcp.recv(socket, 0) do
298+
{:error, :closed} ->
299+
:ok
300+
301+
{:error, :eintr} ->
302+
await_close(socket)
303+
304+
{:error, _other} ->
305+
# In case of an unexpected error, we close the socket ourselves
306+
# to retry
307+
:gen_tcp.close(socket)
308+
end
309+
end
310+
311+
defp unlock(lock) do
312+
port_path = Path.join(lock.path, "port_0")
313+
lock_path = Path.join(lock.path, "lock_0")
314+
315+
File.write!(port_path, <<0::unsigned-integer-32>>, [:raw])
316+
File.rename!(port_path, lock_path)
317+
after
318+
# Closing the socket will cause the accepting process to finish
319+
# and all accepted sockets (tied to that process) will get closed
320+
:gen_tcp.close(lock.socket)
321+
end
322+
end

lib/mix/lib/mix/project.ex

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,23 @@ defmodule Mix.Project do
882882
end
883883
end
884884

885+
@doc false
886+
def with_build_lock(config \\ config(), fun) do
887+
# To avoid duplicated compilation, we wrap compilation tasks, such
888+
# as compile.all, deps.compile, compile.elixir, compile.erlang in
889+
# a lock. Note that compile.all covers compile.elixir, but the
890+
# latter can still be invoked directly, so we put the lock over
891+
# each individual task.
892+
893+
build_path = build_path(config)
894+
895+
on_taken = fn os_pid ->
896+
Mix.shell().info("Waiting for lock on the build directory (held by process #{os_pid})")
897+
end
898+
899+
Mix.Lock.with_lock(build_path, fun, on_taken: on_taken)
900+
end
901+
885902
# Loads mix.exs in the current directory or loads the project from the
886903
# mixfile cache and pushes the project onto the project stack.
887904
defp load_project(app, post_config) do

0 commit comments

Comments
 (0)