|
| 1 | +defmodule Mix.Lock do |
| 2 | + @moduledoc false |
| 3 | + |
| 4 | + # Lock implementation working across multiple OS processes. |
| 5 | + # |
| 6 | + # The lock is implemented using TCP sockets and hard links. |
| 7 | + # |
| 8 | + # A process holds the lock if it owns a TCP socket, whose port is |
| 9 | + # written in the lock_0 file. We need to create such lock files |
| 10 | + # atomically, so the process first writes its port to a port_P |
| 11 | + # file and then attempts to create a hard link to it at lock_0. |
| 12 | + # |
| 13 | + # An inherent problem with lock files is that the lock owner may |
| 14 | + # terminate abruptly, leaving a "stale" file. Other processes can |
| 15 | + # detect a stale file by reading the port written in that file, |
| 16 | + # trying to connect to that port and failing. In order for another |
| 17 | + # process to link to the same path, the file needs to be replaced. |
| 18 | + # However, we need to guarantee that only a single process can |
| 19 | + # remove or replace the file, otherwise a concurrent process may |
| 20 | + # end up removing a newly linked file. |
| 21 | + # |
| 22 | + # To address this problem we employ a chained locking procedure. |
| 23 | + # Specifically, we attempt to link our port to lock_0, if that |
| 24 | + # fails, we try to connect to the lock_0 port. If we manage to |
| 25 | + # connect, it means the lock is taken, so we wait for it to close |
| 26 | + # and start over. If we fail to connect, it means the lock is stale, |
| 27 | + # so we want to replace it. In order to do that, we try to obtain |
| 28 | + # lock_1. Again, we try to link and connect. Eventually, we should |
| 29 | + # successfully link to lock_N. At that point we can clean up all |
| 30 | + # the files, so we perform these steps: |
| 31 | + # |
| 32 | + # * move our port_P to lock_0 |
| 33 | + # * remove all the other port_P files |
| 34 | + # * remove all lock_1+ files |
| 35 | + # |
| 36 | + # It is important to perform these steps in this order, to avoid |
| 37 | + # race conditions. By moving to lock_0, we make sure that all new |
| 38 | + # processes trying to lock will connect to our port. By removing |
| 39 | + # all port_P files we make sure that currently paused processes |
| 40 | + # that are about to link port_P at lock_N will fail to link, since |
| 41 | + # the port_P file will no longer exist (once lock_N is removed). |
| 42 | + # |
| 43 | + # Finally, note that we do not remove the lock file in `unlock/1`. |
| 44 | + # If we did that, another process could try to connect and fail |
| 45 | + # because the file would not exist, in such case the process would |
| 46 | + # assume the file is stale and needs to be replaced, therefore |
| 47 | + # possibly replacing another process who successfully links at the |
| 48 | + # empty spot. This means we effectively always leave a stale file, |
| 49 | + # however, in order to shortcut the port check for future processes, |
| 50 | + # we atomically replace the file content with port 0, to indicate |
| 51 | + # the file is stale. |
| 52 | + # |
| 53 | + # The main caveat of using ephemeral TCP ports is that they are not |
| 54 | + # unique. This creates a theoretical scenario where the lock holder |
| 55 | + # terminates abruptly and leaves its port in lock_0, then the port |
| 56 | + # is assigned to a unrelated process (unaware of the locking). To |
| 57 | + # handle this scenario, when we connect to a lock_N port, we expect |
| 58 | + # it to immediately send us `@probe_data`. If this does not happen |
| 59 | + # within `@probe_timeout_ms`, we assume the port is taken by an |
| 60 | + # unrelated process and the lock file is stale. Note that it is ok |
| 61 | + # to use a long timeout, because this scenario is very unlikely. |
| 62 | + # Theoretically, if an actual lock owner is not able to send the |
| 63 | + # probe data within the timeout, the lock will fail, however with |
| 64 | + # a high enough timeout, this should not be a problem in practice. |
| 65 | + |
| 66 | + @loopback {127, 0, 0, 1} |
| 67 | + @listen_opts [:binary, ip: @loopback, packet: :raw, nodelay: true, backlog: 128, active: false] |
| 68 | + @connect_opts [:binary, packet: :raw, nodelay: true, active: false] |
| 69 | + @probe_data "elixirlock" |
| 70 | + @probe_data_size byte_size(@probe_data) |
| 71 | + @probe_timeout_ms 5_000 |
| 72 | + |
| 73 | + @doc """ |
| 74 | + Acquires a lock identified by the given key. |
| 75 | +
|
| 76 | + This function blocks until the lock is acquired by this process, |
| 77 | + and then executes `fun`, returning its return value. |
| 78 | +
|
| 79 | + This function can also be called if this process already has the |
| 80 | + lock. In such case the function is executed immediately. |
| 81 | +
|
| 82 | + ## Options |
| 83 | +
|
| 84 | + * `:on_taken` - a one-arity function called if the lock is held |
| 85 | + by a different process. The operating system PID of that process |
| 86 | + is given as the first argument (as a string). This function may |
| 87 | + be called multiple times, if the lock owner changes, until it |
| 88 | + is successfully acquired by this process. |
| 89 | +
|
| 90 | + """ |
| 91 | + @spec with_lock(iodata(), (-> term()), keyword()) :: term() |
| 92 | + def with_lock(key, fun, opts \\ []) do |
| 93 | + opts = Keyword.validate!(opts, [:on_taken]) |
| 94 | + |
| 95 | + key = key |> :erlang.md5() |> Base.url_encode64(padding: false) |
| 96 | + path = Path.join([System.tmp_dir!(), "mix_lock", key]) |
| 97 | + |
| 98 | + pdict_key = {__MODULE__, path} |
| 99 | + has_lock? = Process.get(pdict_key) |
| 100 | + |
| 101 | + if has_lock? do |
| 102 | + fun.() |
| 103 | + else |
| 104 | + lock = lock(path, opts[:on_taken]) |
| 105 | + Process.put(pdict_key, true) |
| 106 | + |
| 107 | + try do |
| 108 | + fun.() |
| 109 | + after |
| 110 | + # Unlocking will always close the socket, but it may raise, |
| 111 | + # so we remove key from the dictionary first |
| 112 | + Process.delete(pdict_key) |
| 113 | + unlock(lock) |
| 114 | + end |
| 115 | + end |
| 116 | + end |
| 117 | + |
| 118 | + defp lock(path, on_taken) do |
| 119 | + File.mkdir_p!(path) |
| 120 | + |
| 121 | + case listen() do |
| 122 | + {:ok, socket, port} -> |
| 123 | + spawn_link(fn -> accept_loop(socket) end) |
| 124 | + |
| 125 | + try do |
| 126 | + try_lock(path, socket, port, on_taken) |
| 127 | + rescue |
| 128 | + exception -> |
| 129 | + # Close the socket to make sure we don't block the lock |
| 130 | + :gen_tcp.close(socket) |
| 131 | + reraise exception, __STACKTRACE__ |
| 132 | + end |
| 133 | + |
| 134 | + {:error, reason} -> |
| 135 | + Mix.raise( |
| 136 | + "failed to open a TCP socket while acquiring a lock, reason: #{inspect(reason)}" |
| 137 | + ) |
| 138 | + end |
| 139 | + end |
| 140 | + |
| 141 | + defp listen do |
| 142 | + with {:ok, socket} <- :gen_tcp.listen(0, @listen_opts) do |
| 143 | + case :inet.port(socket) do |
| 144 | + {:ok, port} -> |
| 145 | + {:ok, socket, port} |
| 146 | + |
| 147 | + {:error, reason} -> |
| 148 | + :gen_tcp.close(socket) |
| 149 | + {:error, reason} |
| 150 | + end |
| 151 | + end |
| 152 | + end |
| 153 | + |
| 154 | + defp try_lock(path, socket, port, on_taken) do |
| 155 | + port_path = Path.join(path, "port_#{port}") |
| 156 | + os_pid = System.pid() |
| 157 | + |
| 158 | + File.write!(port_path, <<port::unsigned-integer-32, os_pid::binary>>, [:raw]) |
| 159 | + |
| 160 | + case grab_lock(path, port_path, 0) do |
| 161 | + {:ok, 0} -> |
| 162 | + # We grabbed lock_0, so all good |
| 163 | + %{socket: socket, path: path} |
| 164 | + |
| 165 | + {:ok, _n} -> |
| 166 | + # We grabbed lock_1+, so we need to replace lock_0 and clean up |
| 167 | + take_over(path, port_path) |
| 168 | + %{socket: socket, path: path} |
| 169 | + |
| 170 | + {:taken, probe_socket, os_pid} -> |
| 171 | + # Another process has the lock, wait for close and start over |
| 172 | + if on_taken, do: on_taken.(os_pid) |
| 173 | + await_close(probe_socket) |
| 174 | + try_lock(path, socket, port, on_taken) |
| 175 | + |
| 176 | + :invalidated -> |
| 177 | + try_lock(path, socket, port, on_taken) |
| 178 | + end |
| 179 | + end |
| 180 | + |
| 181 | + defp grab_lock(path, port_path, n) do |
| 182 | + lock_path = Path.join(path, "lock_#{n}") |
| 183 | + |
| 184 | + case File.ln(port_path, lock_path) do |
| 185 | + :ok -> |
| 186 | + {:ok, n} |
| 187 | + |
| 188 | + {:error, :eexist} -> |
| 189 | + case probe(lock_path) do |
| 190 | + {:ok, probe_socket, os_pid} -> |
| 191 | + {:taken, probe_socket, os_pid} |
| 192 | + |
| 193 | + {:error, _reason} -> |
| 194 | + grab_lock(path, port_path, n + 1) |
| 195 | + end |
| 196 | + |
| 197 | + {:error, :enoent} -> |
| 198 | + :invalidated |
| 199 | + |
| 200 | + {:error, reason} -> |
| 201 | + raise File.LinkError, |
| 202 | + reason: reason, |
| 203 | + action: "create hard link", |
| 204 | + existing: port_path, |
| 205 | + new: lock_path |
| 206 | + end |
| 207 | + end |
| 208 | + |
| 209 | + defp accept_loop(listen_socket) do |
| 210 | + case :gen_tcp.accept(listen_socket) do |
| 211 | + {:ok, socket} -> |
| 212 | + _ = :gen_tcp.send(socket, @probe_data) |
| 213 | + accept_loop(listen_socket) |
| 214 | + |
| 215 | + # eintr is "Interrupted system call". |
| 216 | + {:error, :eintr} -> |
| 217 | + accept_loop(listen_socket) |
| 218 | + |
| 219 | + {:error, reason} when reason in [:closed, :einval] -> |
| 220 | + :ok |
| 221 | + end |
| 222 | + end |
| 223 | + |
| 224 | + defp probe(port_path) do |
| 225 | + with {:ok, port, os_pid} <- fetch_probe_port(port_path), |
| 226 | + {:ok, socket} <- connect(port), |
| 227 | + {:ok, socket} <- await_probe_data(socket) do |
| 228 | + {:ok, socket, os_pid} |
| 229 | + end |
| 230 | + end |
| 231 | + |
| 232 | + defp fetch_probe_port(port_path) do |
| 233 | + case File.read(port_path) do |
| 234 | + {:ok, <<0::unsigned-integer-32>>} -> {:error, :ignore} |
| 235 | + {:ok, <<port::unsigned-integer-32, os_pid::binary>>} -> {:ok, port, os_pid} |
| 236 | + {:error, reason} -> {:error, reason} |
| 237 | + end |
| 238 | + end |
| 239 | + |
| 240 | + defp connect(port) do |
| 241 | + # On Windows connecting to an unbound port takes a few seconds to |
| 242 | + # fail, so instead we shortcut the check by attempting a listen, |
| 243 | + # which succeeds or fails immediately. Note that `reuseaddr` here |
| 244 | + # ensures that if the listening socket closed recently, we can |
| 245 | + # immediately reclaim the same port. |
| 246 | + case :gen_tcp.listen(port, [reuseaddr: true] ++ @listen_opts) do |
| 247 | + {:ok, socket} -> |
| 248 | + :gen_tcp.close(socket) |
| 249 | + # The port is free, so connecting would fail |
| 250 | + {:error, :econnrefused} |
| 251 | + |
| 252 | + {:error, _reason} -> |
| 253 | + :gen_tcp.connect(@loopback, port, @connect_opts) |
| 254 | + end |
| 255 | + end |
| 256 | + |
| 257 | + defp await_probe_data(socket) do |
| 258 | + case :gen_tcp.recv(socket, @probe_data_size, @probe_timeout_ms) do |
| 259 | + {:ok, @probe_data} -> |
| 260 | + {:ok, socket} |
| 261 | + |
| 262 | + {:ok, _data} -> |
| 263 | + :gen_tcp.close(socket) |
| 264 | + {:error, :unexpected_port_owner} |
| 265 | + |
| 266 | + {:error, :eintr} -> |
| 267 | + await_probe_data(socket) |
| 268 | + |
| 269 | + {:error, reason} -> |
| 270 | + :gen_tcp.close(socket) |
| 271 | + {:error, reason} |
| 272 | + end |
| 273 | + end |
| 274 | + |
| 275 | + defp take_over(path, port_path) do |
| 276 | + # The operations here must happen in precise order, so if anything |
| 277 | + # fails, we keep the files as is and the next process that grabs |
| 278 | + # the lock will do the cleanup |
| 279 | + |
| 280 | + lock_path = Path.join(path, "lock_0") |
| 281 | + |
| 282 | + # We linked to lock_N successfully, so port_path should exist |
| 283 | + File.rename!(port_path, lock_path) |
| 284 | + |
| 285 | + names = File.ls!(path) |
| 286 | + |
| 287 | + for "port_" <> _ = name <- names do |
| 288 | + File.rm!(Path.join(path, name)) |
| 289 | + end |
| 290 | + |
| 291 | + for "lock_" <> _ = name <- names, name != "lock_0" do |
| 292 | + File.rm!(Path.join(path, name)) |
| 293 | + end |
| 294 | + end |
| 295 | + |
| 296 | + defp await_close(socket) do |
| 297 | + case :gen_tcp.recv(socket, 0) do |
| 298 | + {:error, :closed} -> |
| 299 | + :ok |
| 300 | + |
| 301 | + {:error, :eintr} -> |
| 302 | + await_close(socket) |
| 303 | + |
| 304 | + {:error, _other} -> |
| 305 | + # In case of an unexpected error, we close the socket ourselves |
| 306 | + # to retry |
| 307 | + :gen_tcp.close(socket) |
| 308 | + end |
| 309 | + end |
| 310 | + |
| 311 | + defp unlock(lock) do |
| 312 | + port_path = Path.join(lock.path, "port_0") |
| 313 | + lock_path = Path.join(lock.path, "lock_0") |
| 314 | + |
| 315 | + File.write!(port_path, <<0::unsigned-integer-32>>, [:raw]) |
| 316 | + File.rename!(port_path, lock_path) |
| 317 | + after |
| 318 | + # Closing the socket will cause the accepting process to finish |
| 319 | + # and all accepted sockets (tied to that process) will get closed |
| 320 | + :gen_tcp.close(lock.socket) |
| 321 | + end |
| 322 | +end |
0 commit comments