feat: add host-side automation socket for programmatic VM control

Add a Unix domain socket server (vm/vphone.sock) that accepts JSON
commands from external processes, enabling programmatic E2E testing
of iOS apps running in the VM.

Supported commands:
- screenshot: capture VM display to file (PNG/JPEG by extension)
- tap: inject touch at pixel coordinates (matching screenshot dims)
- swipe: inject swipe gesture between two points
- key: send hardware keys (home/power/volup/voldown) via HID

The socket uses a simple one-line JSON protocol: connect, send request,
receive response, disconnect.  Example usage from CLI:

  echo '{"t":"screenshot","path":"/tmp/s.png"}' | nc -U vm/vphone.sock
  echo '{"t":"tap","x":500,"y":1900}' | nc -U vm/vphone.sock
  echo '{"t":"key","name":"home"}' | nc -U vm/vphone.sock

New files:
- VPhoneHostControl.swift: socket server, command dispatch

Modified:
- VPhoneScreenRecorder: add saveScreenshot(view:to:) with PNG support
- VPhoneVirtualMachineView: add injectTap/injectSwipe via synthetic
  NSEvents routed through the existing mouse event handlers
- VPhoneAppDelegate: wire up VPhoneHostControl lifecycle

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
pluginslab
2026-03-28 23:42:06 +00:00
committed by zqxwce
parent a36b797a6e
commit dfdec8c393
4 changed files with 421 additions and 3 deletions

View File

@@ -12,6 +12,7 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
private var keychainWindowController: VPhoneKeychainWindowController?
private var appWindowController: VPhoneAppWindowController?
private var locationProvider: VPhoneLocationProvider?
private var hostControl: VPhoneHostControl?
private var sigintSource: DispatchSourceSignal?
private var didAttemptAutoInstall = false
@@ -133,9 +134,23 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
if let provider = locationProvider {
mc.locationProvider = provider
}
mc.screenRecorder = VPhoneScreenRecorder()
let recorder = VPhoneScreenRecorder()
mc.screenRecorder = recorder
menuController = mc
let socketPath = options.configURL
.deletingLastPathComponent()
.appendingPathComponent("vphone.sock").path
let hc = VPhoneHostControl(socketPath: socketPath)
hc.start(
captureView: wc.captureView!,
screenRecorder: recorder,
control: control,
screenWidth: options.screenWidth,
screenHeight: options.screenHeight
)
hostControl = hc
// Wire location toggle through onConnect/onDisconnect
control.onConnect = { [weak mc, weak provider = locationProvider] caps in
mc?.updateConnectAvailability(available: true)
@@ -225,6 +240,10 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
}
}
/// Stops the host automation socket server, if one was created, when the app is about to exit.
func applicationWillTerminate(_: Notification) {
    if let hostControl {
        hostControl.stop()
    }
}
/// Returns `true` (quit when the last window closes) unless `cli.noGraphics` is set.
func applicationShouldTerminateAfterLastWindowClosed(_: NSApplication) -> Bool {
    let runsWithoutGraphics = cli.noGraphics
    return !runsWithoutGraphics
}

View File

@@ -0,0 +1,310 @@
import AppKit
import Foundation
// MARK: - Host Control Socket
/// Lightweight Unix domain socket server that accepts automation commands from
/// local processes (e.g. Claude Code via `nc -U`). One JSON line in, one JSON
/// line out, then the connection closes.
///
/// Supported commands:
/// {"t":"screenshot"} save to default Desktop path
/// {"t":"screenshot","path":"/tmp/shot.png"} save to explicit path (PNG/JPEG by extension)
/// {"t":"tap","x":645,"y":1398} tap at pixel coordinates (matching screenshot)
/// {"t":"swipe","x1":645,"y1":2600,"x2":645,"y2":1400} swipe between points
/// {"t":"swipe","x1":645,"y1":2600,"x2":645,"y2":1400,"ms":300} swipe with duration
/// {"t":"key","name":"home"} hardware key (home/power/volup/voldown)
@MainActor
class VPhoneHostControl {
private let socketPath: String
private var listenFD: Int32 = -1
private let acceptQueue = DispatchQueue(label: "vphone.hostcontrol.accept")
private weak var captureView: VPhoneVirtualMachineView?
private var screenRecorder: VPhoneScreenRecorder?
private weak var control: VPhoneControl?
/// Mutable box used to hand a command's outcome from the main-actor task back
/// to the accept-queue thread that is servicing the client.
///
/// The class has no locking of its own; it is declared `@unchecked Sendable`.
/// In `handleClient` every field is written inside the main-actor `Task` before
/// the semaphore is signalled and is read only after `semaphore.wait()` returns,
/// so that handshake is what orders the accesses.
private final class ResultBox: @unchecked Sendable {
// Path of the written screenshot (set by the "screenshot" command on success).
var path: String?
// Human-readable failure description, if the command failed.
var error: String?
// Success flag used by the commands that have no payload (tap/swipe/key).
var ok = false
}
/// Screen pixel dimensions for coordinate mapping.
private var screenWidth: Int = 1290
private var screenHeight: Int = 2796
/// Creates a controller for the given Unix domain socket path.
///
/// Only records the path; the socket itself is created later by `start(...)`.
/// - Parameter socketPath: Filesystem path at which the listening socket will be bound.
init(socketPath: String) {
self.socketPath = socketPath
}
/// Records the automation targets and starts listening on `socketPath`.
///
/// On success a background accept loop is started on `acceptQueue`. Every failure
/// is logged with a `[hostctl]` prefix and leaves the controller not listening.
///
/// Calling `start` again while already listening only refreshes the stored
/// targets; it does not create a second socket or a second accept loop.
///
/// - Parameters:
///   - captureView: View used for screenshots and touch injection (held weakly).
///   - screenRecorder: Recorder used to save screenshots.
///   - control: Guest control channel used for hardware keys (held weakly).
///   - screenWidth: Width, in pixels, of the coordinate space used by tap/swipe.
///   - screenHeight: Height, in pixels, of the coordinate space used by tap/swipe.
func start(
    captureView: VPhoneVirtualMachineView,
    screenRecorder: VPhoneScreenRecorder,
    control: VPhoneControl,
    screenWidth: Int,
    screenHeight: Int
) {
    self.captureView = captureView
    self.screenRecorder = screenRecorder
    self.control = control
    self.screenWidth = screenWidth
    self.screenHeight = screenHeight

    // A second start() without stop() would overwrite listenFD, leaking the
    // first descriptor and leaving two accept loops running.
    guard listenFD < 0 else {
        print("[hostctl] already listening on \(socketPath)")
        return
    }

    // Build the address first so an over-long path is rejected before any
    // filesystem or descriptor side effect happens.
    var addr = sockaddr_un()
    addr.sun_family = sa_family_t(AF_UNIX)
    let pathBytes = socketPath.utf8CString // includes the trailing NUL
    guard pathBytes.count <= MemoryLayout.size(ofValue: addr.sun_path) else {
        print("[hostctl] socket path too long")
        return
    }
    withUnsafeMutablePointer(to: &addr.sun_path) { ptr in
        ptr.withMemoryRebound(to: CChar.self, capacity: pathBytes.count) { dst in
            for (i, byte) in pathBytes.enumerated() {
                dst[i] = byte
            }
        }
    }

    // Clean up stale socket from previous run
    unlink(socketPath)

    let fd = socket(AF_UNIX, SOCK_STREAM, 0)
    guard fd >= 0 else {
        print("[hostctl] failed to create socket: \(String(cString: strerror(errno)))")
        return
    }

    let addrLen = socklen_t(MemoryLayout<sockaddr_un>.size)
    let bindResult = withUnsafePointer(to: &addr) { ptr in
        ptr.withMemoryRebound(to: sockaddr.self, capacity: 1) { sockPtr in
            bind(fd, sockPtr, addrLen)
        }
    }
    guard bindResult == 0 else {
        print("[hostctl] bind failed: \(String(cString: strerror(errno)))")
        close(fd)
        return
    }
    guard listen(fd, 4) == 0 else {
        print("[hostctl] listen failed: \(String(cString: strerror(errno)))")
        close(fd)
        return
    }

    listenFD = fd
    print("[hostctl] listening on \(socketPath)")

    // The loop gets its own copy of the descriptor value so it never has to
    // read main-actor state.
    let capturedFD = fd
    acceptQueue.async { [weak self] in
        Self.acceptLoop(listenFD: capturedFD, controller: self)
    }
}
/// Closes the listening socket (if open) and removes the socket file.
func stop() {
    let openFD = listenFD
    if openFD >= 0 {
        close(openFD)
        listenFD = -1
    }
    unlink(socketPath)
}
// MARK: - Accept Loop
/// Accepts clients one at a time and services each one synchronously.
///
/// Runs until `accept` fails with anything other than `EINTR`; closing the
/// listening descriptor in `stop()` is what makes it exit.
///
/// - Parameters:
///   - listenFD: Listening socket descriptor to accept on.
///   - controller: Controller whose view/recorder/control the commands act on.
private nonisolated static func acceptLoop(listenFD: Int32, controller: VPhoneHostControl?) {
    while true {
        let clientFD = accept(listenFD, nil, nil)
        if clientFD < 0 {
            // A signal interrupting accept() is not a reason to shut the
            // server down; every other error ends the loop as before.
            if errno == EINTR { continue }
            break
        }
        handleClient(clientFD, controller: controller)
    }
}
/// Services a single client connection: reads one line, parses it as a JSON
/// object, dispatches on its `"t"` field and writes one JSON response.
///
/// The descriptor is closed when this function returns. Each command runs its
/// work in a `Task { @MainActor in ... }` and blocks the calling thread on a
/// semaphore until that task signals, so the response is written only after
/// the main-actor work has finished.
///
/// - Parameters:
///   - fd: Connected client socket; closed on return.
///   - controller: Source of the view, recorder, control channel and screen size.
private nonisolated static func handleClient(_ fd: Int32, controller: VPhoneHostControl?) {
defer { close(fd) }
// Nothing readable (nil from readLine): drop the connection without a reply.
guard let line = readLine(from: fd) else { return }
guard let data = line.data(using: .utf8),
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
let type = json["t"] as? String
else {
writeResponse(fd, ok: false, error: "invalid JSON")
return
}
switch type {
case "screenshot":
// "path" is optional; without it the recorder's default-location overload is used.
let outputPath = json["path"] as? String
let semaphore = DispatchSemaphore(value: 0)
let result = ResultBox()
Task { @MainActor in
// Signal on every exit path so the waiting thread is always released.
defer { semaphore.signal() }
guard let controller,
let recorder = controller.screenRecorder,
let view = controller.captureView,
view.window != nil
else {
result.error = "no active VM view"
return
}
do {
let url: URL
if let outputPath {
url = try await recorder.saveScreenshot(view: view, to: URL(fileURLWithPath: outputPath))
} else {
url = try await recorder.saveScreenshot(view: view)
}
result.path = url.path
} catch {
result.error = "\(error)"
}
}
// Block this (accept-queue) thread until the main-actor task is done.
semaphore.wait()
if let path = result.path {
writeResponse(fd, ok: true, path: path)
} else {
writeResponse(fd, ok: false, error: result.error ?? "unknown error")
}
case "tap":
guard let x = json["x"] as? Double, let y = json["y"] as? Double else {
writeResponse(fd, ok: false, error: "tap requires x and y (pixel coordinates)")
return
}
let semaphore = DispatchSemaphore(value: 0)
let result = ResultBox()
Task { @MainActor in
defer { semaphore.signal() }
guard let controller, let view = controller.captureView, view.window != nil else {
result.error = "no active VM view"
return
}
view.injectTap(
pixelX: x, pixelY: y,
screenWidth: controller.screenWidth, screenHeight: controller.screenHeight
)
// Success is recorded as soon as injectTap returns.
result.ok = true
}
semaphore.wait()
if result.ok {
writeResponse(fd, ok: true)
} else {
writeResponse(fd, ok: false, error: result.error ?? "tap failed")
}
case "swipe":
guard let x1 = json["x1"] as? Double, let y1 = json["y1"] as? Double,
let x2 = json["x2"] as? Double, let y2 = json["y2"] as? Double
else {
writeResponse(fd, ok: false, error: "swipe requires x1, y1, x2, y2")
return
}
// Falls back to 300 when "ms" is missing or cannot be cast to Int.
let durationMs = json["ms"] as? Int ?? 300
let semaphore = DispatchSemaphore(value: 0)
let result = ResultBox()
Task { @MainActor in
defer { semaphore.signal() }
guard let controller, let view = controller.captureView, view.window != nil else {
result.error = "no active VM view"
return
}
view.injectSwipe(
fromX: x1, fromY: y1, toX: x2, toY: y2,
screenWidth: controller.screenWidth, screenHeight: controller.screenHeight,
durationMs: durationMs
)
// Success is recorded as soon as injectSwipe returns.
result.ok = true
}
semaphore.wait()
if result.ok {
writeResponse(fd, ok: true)
} else {
writeResponse(fd, ok: false, error: result.error ?? "swipe failed")
}
case "key":
guard let name = json["name"] as? String else {
writeResponse(fd, ok: false, error: "key requires name (home/power/volup/voldown)")
return
}
// Map the key name to the (page, usage) pair passed to sendHIDPress.
// NOTE(review): 0x0C is presumably the HID Consumer usage page — confirm against the guest side.
let hidKey: (page: UInt32, usage: UInt32)? = switch name {
case "home": (0x0C, 0x40)
case "power": (0x0C, 0x30)
case "volup": (0x0C, 0xE9)
case "voldown": (0x0C, 0xEA)
default: nil
}
guard let key = hidKey else {
writeResponse(fd, ok: false, error: "unknown key: \(name)")
return
}
let semaphore = DispatchSemaphore(value: 0)
let result = ResultBox()
Task { @MainActor in
defer { semaphore.signal() }
// Keys go through the guest control channel, so it must be connected.
guard let controller, let ctl = controller.control, ctl.isConnected else {
result.error = "guest not connected"
return
}
ctl.sendHIDPress(page: key.page, usage: key.usage)
result.ok = true
}
semaphore.wait()
if result.ok {
writeResponse(fd, ok: true)
} else {
writeResponse(fd, ok: false, error: result.error ?? "key failed")
}
default:
writeResponse(fd, ok: false, error: "unknown command: \(type)")
}
}
// MARK: - Socket I/O
/// Reads one request line from `fd`.
///
/// Reading stops at the first newline, at end of stream, on a read error, or
/// once at least 4096 bytes have been collected.
///
/// - Parameter fd: Descriptor to read from.
/// - Returns: The text before the first newline (or everything read when no
///   newline arrived); `nil` when nothing was read or the bytes are not valid UTF-8.
private nonisolated static func readLine(from fd: Int32) -> String? {
    let limit = 4096
    var chunk = [UInt8](repeating: 0, count: limit)
    var received = Data()

    while received.count < limit {
        let n = read(fd, &chunk, chunk.count)
        // A signal interrupting read() is not end-of-stream; try again.
        if n < 0, errno == EINTR { continue }
        guard n > 0 else { break }
        let fresh = chunk[..<n]
        received.append(contentsOf: fresh)
        // Earlier chunks are known to be newline-free, so only the new bytes
        // need to be scanned.
        if fresh.contains(0x0A) { break }
    }

    guard !received.isEmpty else { return nil }
    let line = received.firstIndex(of: 0x0A).map { received[..<$0] } ?? received
    return String(data: line, encoding: .utf8)
}
/// Writes a single-line JSON response to the client.
///
/// The object always contains `"ok"`; `"path"` and `"error"` are included only
/// when supplied. A trailing newline terminates the line. Write failures are
/// ignored because the connection is closed right after the response.
///
/// - Parameters:
///   - fd: Connected client descriptor.
///   - ok: Whether the command succeeded.
///   - path: Optional output path to report.
///   - error: Optional error description to report.
private nonisolated static func writeResponse(_ fd: Int32, ok: Bool, path: String? = nil, error: String? = nil) {
    var payload: [String: Any] = ["ok": ok]
    if let path { payload["path"] = path }
    if let error { payload["error"] = error }
    guard var bytes = try? JSONSerialization.data(withJSONObject: payload) else { return }
    bytes.append(0x0A)

    // A client that has already disconnected (e.g. `nc` timed out) would
    // otherwise turn this write into SIGPIPE, which terminates the whole app
    // by default. With SO_NOSIGPIPE the write fails with EPIPE instead.
    // The call is harmless (and its error ignored) on non-socket descriptors.
    var enabled: Int32 = 1
    _ = setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &enabled, socklen_t(MemoryLayout<Int32>.size))

    bytes.withUnsafeBytes { raw in
        guard let base = raw.baseAddress else { return }
        var sent = 0
        // write() may accept fewer bytes than requested; keep going until the
        // whole line is out or a real error occurs.
        while sent < raw.count {
            let n = write(fd, base + sent, raw.count - sent)
            if n > 0 {
                sent += n
            } else if n < 0, errno == EINTR {
                continue
            } else {
                break
            }
        }
    }
}
}

View File

@@ -149,11 +149,15 @@ class VPhoneScreenRecorder {
}
/// Captures `view` and saves it at the location returned by `screenshotOutputURL()`.
///
/// Convenience form of `saveScreenshot(view:to:)`; returns whatever that call returns
/// and rethrows its errors.
func saveScreenshot(view: NSView) async throws -> URL {
    let defaultDestination = screenshotOutputURL()
    return try await saveScreenshot(view: view, to: defaultDestination)
}
func saveScreenshot(view: NSView, to url: URL) async throws -> URL {
let cgImage = try await captureStillImage(from: view)
let url = screenshotOutputURL()
let utType = url.pathExtension.lowercased() == "png" ? "public.png" : "public.jpeg"
guard let dest = CGImageDestinationCreateWithURL(
url as CFURL, "public.jpeg" as CFString, 1, nil
url as CFURL, utType as CFString, 1, nil
) else {
throw CaptureError.encodingFailed
}

View File

@@ -164,6 +164,91 @@ class VPhoneVirtualMachineView: VZVirtualMachineView {
}
}
// MARK: - Programmatic Touch (for automation)
/// Maps a point in screenshot pixel space onto this view's local coordinates.
///
/// Each axis is scaled by the ratio of the view's `bounds` to the given screen
/// size; the y axis is additionally inverted.
private func pixelToLocal(pixelX: Double, pixelY: Double, screenWidth: Int, screenHeight: Int) -> NSPoint {
    let fractionX = pixelX / Double(screenWidth)
    let fractionY = pixelY / Double(screenHeight)
    // Screenshot rows count down from the top, while a non-flipped NSView
    // measures y up from the bottom edge.
    return NSPoint(
        x: fractionX * bounds.width,
        y: (1.0 - fractionY) * bounds.height
    )
}
/// Builds a synthetic mouse `NSEvent` of the given type at a window-space point.
///
/// `.leftMouseUp` events carry a click count of 0 and zero pressure; every other
/// type carries a click count of 1 and full pressure. The event is stamped with
/// the current system uptime and this view's window number (0 when detached).
private func synthesizeMouseEvent(type: NSEvent.EventType, at windowPoint: NSPoint) -> NSEvent? {
    let isRelease = type == .leftMouseUp
    let hostWindowNumber = window?.windowNumber ?? 0
    return NSEvent.mouseEvent(
        with: type,
        location: windowPoint,
        modifierFlags: [],
        timestamp: ProcessInfo.processInfo.systemUptime,
        windowNumber: hostWindowNumber,
        context: nil,
        eventNumber: 0,
        clickCount: isRelease ? 0 : 1,
        pressure: isRelease ? 0.0 : 1.0
    )
}
/// Simulates a tap at the given screenshot pixel position.
///
/// A synthetic mouse-down is delivered immediately through `mouseDown(with:)`;
/// the matching mouse-up is scheduled on the main queue 0.08 s later, so this
/// method returns before the tap has completed.
func injectTap(pixelX: Double, pixelY: Double, screenWidth: Int, screenHeight: Int) {
    let target = convert(
        pixelToLocal(pixelX: pixelX, pixelY: pixelY, screenWidth: screenWidth, screenHeight: screenHeight),
        to: nil
    )

    if let press = synthesizeMouseEvent(type: .leftMouseDown, at: target) {
        mouseDown(with: press)
    }

    let holdDuration = 0.08
    DispatchQueue.main.asyncAfter(deadline: .now() + holdDuration) { [weak self] in
        guard let self,
              let release = self.synthesizeMouseEvent(type: .leftMouseUp, at: target)
        else { return }
        self.mouseUp(with: release)
    }
}
/// Simulates a straight-line swipe between two screenshot pixel positions.
///
/// A mouse-down is delivered immediately at the start point. The path is then
/// split into `max(10, durationMs / 16)` evenly spaced steps, each scheduled on
/// the main queue: every step but the last delivers a drag event, and the last
/// delivers the mouse-up at the end point. The method returns before the
/// scheduled events have run.
func injectSwipe(
    fromX: Double, fromY: Double, toX: Double, toY: Double,
    screenWidth: Int, screenHeight: Int, durationMs: Int = 300
) {
    let origin = convert(
        pixelToLocal(pixelX: fromX, pixelY: fromY, screenWidth: screenWidth, screenHeight: screenHeight),
        to: nil
    )
    let destination = convert(
        pixelToLocal(pixelX: toX, pixelY: toY, screenWidth: screenWidth, screenHeight: screenHeight),
        to: nil
    )

    let stepCount = max(10, durationMs / 16)
    let secondsPerStep = Double(durationMs) / Double(stepCount) / 1000.0

    if let press = synthesizeMouseEvent(type: .leftMouseDown, at: origin) {
        mouseDown(with: press)
    }

    for step in 1...stepCount {
        let progress = Double(step) / Double(stepCount)
        let waypoint = NSPoint(
            x: origin.x + (destination.x - origin.x) * progress,
            y: origin.y + (destination.y - origin.y) * progress
        )
        let delay = secondsPerStep * Double(step)
        let isFinalStep = step == stepCount

        DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
            guard let self else { return }
            if isFinalStep {
                // The last waypoint lifts the finger instead of dragging.
                if let release = self.synthesizeMouseEvent(type: .leftMouseUp, at: waypoint) {
                    self.mouseUp(with: release)
                }
            } else if let drag = self.synthesizeMouseEvent(type: .leftMouseDragged, at: waypoint) {
                self.mouseDragged(with: drag)
            }
        }
    }
}
// MARK: - Legacy Touch Injection (macOS 15)
@discardableResult