mirror of
https://github.com/Lakr233/vphone-cli.git
synced 2026-04-05 13:09:06 +08:00
feat: add host-side automation socket for programmatic VM control
Add a Unix domain socket server (vm/vphone.sock) that accepts JSON
commands from external processes, enabling programmatic E2E testing
of iOS apps running in the VM.
Supported commands:
- screenshot: capture VM display to file (PNG/JPEG by extension)
- tap: inject touch at pixel coordinates (matching screenshot dims)
- swipe: inject swipe gesture between two points
- key: send hardware keys (home/power/volup/voldown) via HID
The socket uses a simple one-line JSON protocol: connect, send request,
receive response, disconnect. Example usage from CLI:
echo '{"t":"screenshot","path":"/tmp/s.png"}' | nc -U vm/vphone.sock
echo '{"t":"tap","x":500,"y":1900}' | nc -U vm/vphone.sock
echo '{"t":"key","name":"home"}' | nc -U vm/vphone.sock
New files:
- VPhoneHostControl.swift: socket server, command dispatch
Modified:
- VPhoneScreenRecorder: add saveScreenshot(view:to:) with PNG support
- VPhoneVirtualMachineView: add injectTap/injectSwipe via synthetic
NSEvents routed through the existing mouse event handlers
- VPhoneAppDelegate: wire up VPhoneHostControl lifecycle
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
|
||||
private var keychainWindowController: VPhoneKeychainWindowController?
|
||||
private var appWindowController: VPhoneAppWindowController?
|
||||
private var locationProvider: VPhoneLocationProvider?
|
||||
private var hostControl: VPhoneHostControl?
|
||||
private var sigintSource: DispatchSourceSignal?
|
||||
private var didAttemptAutoInstall = false
|
||||
|
||||
@@ -133,9 +134,23 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
|
||||
if let provider = locationProvider {
|
||||
mc.locationProvider = provider
|
||||
}
|
||||
mc.screenRecorder = VPhoneScreenRecorder()
|
||||
let recorder = VPhoneScreenRecorder()
|
||||
mc.screenRecorder = recorder
|
||||
menuController = mc
|
||||
|
||||
let socketPath = options.configURL
|
||||
.deletingLastPathComponent()
|
||||
.appendingPathComponent("vphone.sock").path
|
||||
let hc = VPhoneHostControl(socketPath: socketPath)
|
||||
hc.start(
|
||||
captureView: wc.captureView!,
|
||||
screenRecorder: recorder,
|
||||
control: control,
|
||||
screenWidth: options.screenWidth,
|
||||
screenHeight: options.screenHeight
|
||||
)
|
||||
hostControl = hc
|
||||
|
||||
// Wire location toggle through onConnect/onDisconnect
|
||||
control.onConnect = { [weak mc, weak provider = locationProvider] caps in
|
||||
mc?.updateConnectAvailability(available: true)
|
||||
@@ -225,6 +240,10 @@ class VPhoneAppDelegate: NSObject, NSApplicationDelegate {
|
||||
}
|
||||
}
|
||||
|
||||
func applicationWillTerminate(_: Notification) {
|
||||
hostControl?.stop()
|
||||
}
|
||||
|
||||
func applicationShouldTerminateAfterLastWindowClosed(_: NSApplication) -> Bool {
|
||||
!cli.noGraphics
|
||||
}
|
||||
|
||||
310
sources/vphone-cli/VPhoneHostControl.swift
Normal file
310
sources/vphone-cli/VPhoneHostControl.swift
Normal file
@@ -0,0 +1,310 @@
|
||||
import AppKit
|
||||
import Foundation
|
||||
|
||||
// MARK: - Host Control Socket
|
||||
|
||||
/// Lightweight Unix domain socket server that accepts automation commands from
|
||||
/// local processes (e.g. Claude Code via `nc -U`). One JSON line in, one JSON
|
||||
/// line out, then the connection closes.
|
||||
///
|
||||
/// Supported commands:
|
||||
/// {"t":"screenshot"} → save to default Desktop path
|
||||
/// {"t":"screenshot","path":"/tmp/shot.png"} → save to explicit path (PNG/JPEG by extension)
|
||||
/// {"t":"tap","x":645,"y":1398} → tap at pixel coordinates (matching screenshot)
|
||||
/// {"t":"swipe","x1":645,"y1":2600,"x2":645,"y2":1400} → swipe between points
|
||||
/// {"t":"swipe","x1":645,"y1":2600,"x2":645,"y2":1400,"ms":300} → swipe with duration
|
||||
/// {"t":"key","name":"home"} → hardware key (home/power/volup/voldown)
|
||||
@MainActor
|
||||
class VPhoneHostControl {
|
||||
private let socketPath: String
|
||||
private var listenFD: Int32 = -1
|
||||
private let acceptQueue = DispatchQueue(label: "vphone.hostcontrol.accept")
|
||||
|
||||
private weak var captureView: VPhoneVirtualMachineView?
|
||||
private var screenRecorder: VPhoneScreenRecorder?
|
||||
private weak var control: VPhoneControl?
|
||||
|
||||
/// Thread-safe box for passing results between main actor and accept queue.
|
||||
private final class ResultBox: @unchecked Sendable {
|
||||
var path: String?
|
||||
var error: String?
|
||||
var ok = false
|
||||
}
|
||||
|
||||
/// Screen pixel dimensions for coordinate mapping.
|
||||
private var screenWidth: Int = 1290
|
||||
private var screenHeight: Int = 2796
|
||||
|
||||
init(socketPath: String) {
|
||||
self.socketPath = socketPath
|
||||
}
|
||||
|
||||
func start(
|
||||
captureView: VPhoneVirtualMachineView,
|
||||
screenRecorder: VPhoneScreenRecorder,
|
||||
control: VPhoneControl,
|
||||
screenWidth: Int,
|
||||
screenHeight: Int
|
||||
) {
|
||||
self.captureView = captureView
|
||||
self.screenRecorder = screenRecorder
|
||||
self.control = control
|
||||
self.screenWidth = screenWidth
|
||||
self.screenHeight = screenHeight
|
||||
|
||||
// Clean up stale socket from previous run
|
||||
unlink(socketPath)
|
||||
|
||||
let fd = socket(AF_UNIX, SOCK_STREAM, 0)
|
||||
guard fd >= 0 else {
|
||||
print("[hostctl] failed to create socket: \(String(cString: strerror(errno)))")
|
||||
return
|
||||
}
|
||||
|
||||
var addr = sockaddr_un()
|
||||
addr.sun_family = sa_family_t(AF_UNIX)
|
||||
let pathBytes = socketPath.utf8CString
|
||||
guard pathBytes.count <= MemoryLayout.size(ofValue: addr.sun_path) else {
|
||||
print("[hostctl] socket path too long")
|
||||
close(fd)
|
||||
return
|
||||
}
|
||||
withUnsafeMutablePointer(to: &addr.sun_path) { ptr in
|
||||
ptr.withMemoryRebound(to: CChar.self, capacity: pathBytes.count) { dst in
|
||||
for (i, byte) in pathBytes.enumerated() {
|
||||
dst[i] = byte
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let addrLen = socklen_t(MemoryLayout<sockaddr_un>.size)
|
||||
let bindResult = withUnsafePointer(to: &addr) { ptr in
|
||||
ptr.withMemoryRebound(to: sockaddr.self, capacity: 1) { sockPtr in
|
||||
bind(fd, sockPtr, addrLen)
|
||||
}
|
||||
}
|
||||
guard bindResult == 0 else {
|
||||
print("[hostctl] bind failed: \(String(cString: strerror(errno)))")
|
||||
close(fd)
|
||||
return
|
||||
}
|
||||
|
||||
guard listen(fd, 4) == 0 else {
|
||||
print("[hostctl] listen failed: \(String(cString: strerror(errno)))")
|
||||
close(fd)
|
||||
return
|
||||
}
|
||||
|
||||
listenFD = fd
|
||||
|
||||
print("[hostctl] listening on \(socketPath)")
|
||||
|
||||
let capturedFD = fd
|
||||
acceptQueue.async { [weak self] in
|
||||
Self.acceptLoop(listenFD: capturedFD, controller: self)
|
||||
}
|
||||
}
|
||||
|
||||
func stop() {
|
||||
if listenFD >= 0 {
|
||||
close(listenFD)
|
||||
listenFD = -1
|
||||
}
|
||||
unlink(socketPath)
|
||||
}
|
||||
|
||||
// MARK: - Accept Loop
|
||||
|
||||
private nonisolated static func acceptLoop(listenFD: Int32, controller: VPhoneHostControl?) {
|
||||
while true {
|
||||
let clientFD = accept(listenFD, nil, nil)
|
||||
guard clientFD >= 0 else { break }
|
||||
handleClient(clientFD, controller: controller)
|
||||
}
|
||||
}
|
||||
|
||||
private nonisolated static func handleClient(_ fd: Int32, controller: VPhoneHostControl?) {
|
||||
defer { close(fd) }
|
||||
|
||||
guard let line = readLine(from: fd) else { return }
|
||||
|
||||
guard let data = line.data(using: .utf8),
|
||||
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
|
||||
let type = json["t"] as? String
|
||||
else {
|
||||
writeResponse(fd, ok: false, error: "invalid JSON")
|
||||
return
|
||||
}
|
||||
|
||||
switch type {
|
||||
case "screenshot":
|
||||
let outputPath = json["path"] as? String
|
||||
let semaphore = DispatchSemaphore(value: 0)
|
||||
let result = ResultBox()
|
||||
|
||||
Task { @MainActor in
|
||||
defer { semaphore.signal() }
|
||||
guard let controller,
|
||||
let recorder = controller.screenRecorder,
|
||||
let view = controller.captureView,
|
||||
view.window != nil
|
||||
else {
|
||||
result.error = "no active VM view"
|
||||
return
|
||||
}
|
||||
do {
|
||||
let url: URL
|
||||
if let outputPath {
|
||||
url = try await recorder.saveScreenshot(view: view, to: URL(fileURLWithPath: outputPath))
|
||||
} else {
|
||||
url = try await recorder.saveScreenshot(view: view)
|
||||
}
|
||||
result.path = url.path
|
||||
} catch {
|
||||
result.error = "\(error)"
|
||||
}
|
||||
}
|
||||
|
||||
semaphore.wait()
|
||||
|
||||
if let path = result.path {
|
||||
writeResponse(fd, ok: true, path: path)
|
||||
} else {
|
||||
writeResponse(fd, ok: false, error: result.error ?? "unknown error")
|
||||
}
|
||||
|
||||
case "tap":
|
||||
guard let x = json["x"] as? Double, let y = json["y"] as? Double else {
|
||||
writeResponse(fd, ok: false, error: "tap requires x and y (pixel coordinates)")
|
||||
return
|
||||
}
|
||||
let semaphore = DispatchSemaphore(value: 0)
|
||||
let result = ResultBox()
|
||||
|
||||
Task { @MainActor in
|
||||
defer { semaphore.signal() }
|
||||
guard let controller, let view = controller.captureView, view.window != nil else {
|
||||
result.error = "no active VM view"
|
||||
return
|
||||
}
|
||||
view.injectTap(
|
||||
pixelX: x, pixelY: y,
|
||||
screenWidth: controller.screenWidth, screenHeight: controller.screenHeight
|
||||
)
|
||||
result.ok = true
|
||||
}
|
||||
|
||||
semaphore.wait()
|
||||
if result.ok {
|
||||
writeResponse(fd, ok: true)
|
||||
} else {
|
||||
writeResponse(fd, ok: false, error: result.error ?? "tap failed")
|
||||
}
|
||||
|
||||
case "swipe":
|
||||
guard let x1 = json["x1"] as? Double, let y1 = json["y1"] as? Double,
|
||||
let x2 = json["x2"] as? Double, let y2 = json["y2"] as? Double
|
||||
else {
|
||||
writeResponse(fd, ok: false, error: "swipe requires x1, y1, x2, y2")
|
||||
return
|
||||
}
|
||||
let durationMs = json["ms"] as? Int ?? 300
|
||||
let semaphore = DispatchSemaphore(value: 0)
|
||||
let result = ResultBox()
|
||||
|
||||
Task { @MainActor in
|
||||
defer { semaphore.signal() }
|
||||
guard let controller, let view = controller.captureView, view.window != nil else {
|
||||
result.error = "no active VM view"
|
||||
return
|
||||
}
|
||||
view.injectSwipe(
|
||||
fromX: x1, fromY: y1, toX: x2, toY: y2,
|
||||
screenWidth: controller.screenWidth, screenHeight: controller.screenHeight,
|
||||
durationMs: durationMs
|
||||
)
|
||||
result.ok = true
|
||||
}
|
||||
|
||||
semaphore.wait()
|
||||
if result.ok {
|
||||
writeResponse(fd, ok: true)
|
||||
} else {
|
||||
writeResponse(fd, ok: false, error: result.error ?? "swipe failed")
|
||||
}
|
||||
|
||||
case "key":
|
||||
guard let name = json["name"] as? String else {
|
||||
writeResponse(fd, ok: false, error: "key requires name (home/power/volup/voldown)")
|
||||
return
|
||||
}
|
||||
let hidKey: (page: UInt32, usage: UInt32)? = switch name {
|
||||
case "home": (0x0C, 0x40)
|
||||
case "power": (0x0C, 0x30)
|
||||
case "volup": (0x0C, 0xE9)
|
||||
case "voldown": (0x0C, 0xEA)
|
||||
default: nil
|
||||
}
|
||||
guard let key = hidKey else {
|
||||
writeResponse(fd, ok: false, error: "unknown key: \(name)")
|
||||
return
|
||||
}
|
||||
let semaphore = DispatchSemaphore(value: 0)
|
||||
let result = ResultBox()
|
||||
|
||||
Task { @MainActor in
|
||||
defer { semaphore.signal() }
|
||||
guard let controller, let ctl = controller.control, ctl.isConnected else {
|
||||
result.error = "guest not connected"
|
||||
return
|
||||
}
|
||||
ctl.sendHIDPress(page: key.page, usage: key.usage)
|
||||
result.ok = true
|
||||
}
|
||||
|
||||
semaphore.wait()
|
||||
if result.ok {
|
||||
writeResponse(fd, ok: true)
|
||||
} else {
|
||||
writeResponse(fd, ok: false, error: result.error ?? "key failed")
|
||||
}
|
||||
|
||||
default:
|
||||
writeResponse(fd, ok: false, error: "unknown command: \(type)")
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Socket I/O
|
||||
|
||||
private nonisolated static func readLine(from fd: Int32) -> String? {
|
||||
var buffer = [UInt8](repeating: 0, count: 4096)
|
||||
var accumulated = Data()
|
||||
|
||||
while accumulated.count < 4096 {
|
||||
let n = read(fd, &buffer, buffer.count)
|
||||
guard n > 0 else { break }
|
||||
accumulated.append(contentsOf: buffer[..<n])
|
||||
if accumulated.contains(0x0A) { break }
|
||||
}
|
||||
|
||||
if let nlRange = accumulated.firstIndex(of: 0x0A) {
|
||||
return String(data: accumulated[..<nlRange], encoding: .utf8)
|
||||
}
|
||||
return accumulated.isEmpty ? nil : String(data: accumulated, encoding: .utf8)
|
||||
}
|
||||
|
||||
private nonisolated static func writeResponse(_ fd: Int32, ok: Bool, path: String? = nil, error: String? = nil) {
|
||||
var dict: [String: Any] = ["ok": ok]
|
||||
if let path { dict["path"] = path }
|
||||
if let error { dict["error"] = error }
|
||||
|
||||
guard let data = try? JSONSerialization.data(withJSONObject: dict),
|
||||
var json = String(data: data, encoding: .utf8)
|
||||
else { return }
|
||||
|
||||
json += "\n"
|
||||
_ = json.withCString { ptr in
|
||||
write(fd, ptr, strlen(ptr))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -149,11 +149,15 @@ class VPhoneScreenRecorder {
|
||||
}
|
||||
|
||||
func saveScreenshot(view: NSView) async throws -> URL {
|
||||
try await saveScreenshot(view: view, to: screenshotOutputURL())
|
||||
}
|
||||
|
||||
func saveScreenshot(view: NSView, to url: URL) async throws -> URL {
|
||||
let cgImage = try await captureStillImage(from: view)
|
||||
let url = screenshotOutputURL()
|
||||
let utType = url.pathExtension.lowercased() == "png" ? "public.png" : "public.jpeg"
|
||||
|
||||
guard let dest = CGImageDestinationCreateWithURL(
|
||||
url as CFURL, "public.jpeg" as CFString, 1, nil
|
||||
url as CFURL, utType as CFString, 1, nil
|
||||
) else {
|
||||
throw CaptureError.encodingFailed
|
||||
}
|
||||
|
||||
@@ -164,6 +164,91 @@ class VPhoneVirtualMachineView: VZVirtualMachineView {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Programmatic Touch (for automation)
|
||||
|
||||
/// Convert screenshot pixel coordinates to NSView local coordinates.
|
||||
private func pixelToLocal(pixelX: Double, pixelY: Double, screenWidth: Int, screenHeight: Int) -> NSPoint {
|
||||
let w = bounds.width
|
||||
let h = bounds.height
|
||||
let localX = pixelX / Double(screenWidth) * w
|
||||
// Screenshot y=0 is top, NSView y=0 is bottom (non-flipped)
|
||||
let localY = (1.0 - pixelY / Double(screenHeight)) * h
|
||||
return NSPoint(x: localX, y: localY)
|
||||
}
|
||||
|
||||
/// Synthesize an NSEvent at a given window point.
|
||||
private func synthesizeMouseEvent(type: NSEvent.EventType, at windowPoint: NSPoint) -> NSEvent? {
|
||||
NSEvent.mouseEvent(
|
||||
with: type,
|
||||
location: windowPoint,
|
||||
modifierFlags: [],
|
||||
timestamp: ProcessInfo.processInfo.systemUptime,
|
||||
windowNumber: window?.windowNumber ?? 0,
|
||||
context: nil,
|
||||
eventNumber: 0,
|
||||
clickCount: type == .leftMouseUp ? 0 : 1,
|
||||
pressure: type == .leftMouseUp ? 0.0 : 1.0
|
||||
)
|
||||
}
|
||||
|
||||
/// Inject a tap at pixel coordinates (matching screenshot image dimensions).
|
||||
func injectTap(pixelX: Double, pixelY: Double, screenWidth: Int, screenHeight: Int) {
|
||||
let localPoint = pixelToLocal(pixelX: pixelX, pixelY: pixelY, screenWidth: screenWidth, screenHeight: screenHeight)
|
||||
let windowPoint = convert(localPoint, to: nil)
|
||||
|
||||
if let downEvent = synthesizeMouseEvent(type: .leftMouseDown, at: windowPoint) {
|
||||
mouseDown(with: downEvent)
|
||||
}
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + 0.08) { [weak self] in
|
||||
guard let self else { return }
|
||||
if let upEvent = self.synthesizeMouseEvent(type: .leftMouseUp, at: windowPoint) {
|
||||
self.mouseUp(with: upEvent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject a swipe from one pixel coordinate to another.
|
||||
func injectSwipe(
|
||||
fromX: Double, fromY: Double, toX: Double, toY: Double,
|
||||
screenWidth: Int, screenHeight: Int, durationMs: Int = 300
|
||||
) {
|
||||
let startLocal = pixelToLocal(pixelX: fromX, pixelY: fromY, screenWidth: screenWidth, screenHeight: screenHeight)
|
||||
let endLocal = pixelToLocal(pixelX: toX, pixelY: toY, screenWidth: screenWidth, screenHeight: screenHeight)
|
||||
let startWindow = convert(startLocal, to: nil)
|
||||
let endWindow = convert(endLocal, to: nil)
|
||||
|
||||
let steps = max(10, durationMs / 16)
|
||||
let stepInterval = Double(durationMs) / Double(steps) / 1000.0
|
||||
|
||||
if let downEvent = synthesizeMouseEvent(type: .leftMouseDown, at: startWindow) {
|
||||
mouseDown(with: downEvent)
|
||||
}
|
||||
|
||||
for i in 1...steps {
|
||||
let t = Double(i) / Double(steps)
|
||||
let x = startWindow.x + (endWindow.x - startWindow.x) * t
|
||||
let y = startWindow.y + (endWindow.y - startWindow.y) * t
|
||||
let pt = NSPoint(x: x, y: y)
|
||||
let delay = stepInterval * Double(i)
|
||||
|
||||
if i < steps {
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
|
||||
guard let self else { return }
|
||||
if let dragEvent = self.synthesizeMouseEvent(type: .leftMouseDragged, at: pt) {
|
||||
self.mouseDragged(with: dragEvent)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
|
||||
guard let self else { return }
|
||||
if let upEvent = self.synthesizeMouseEvent(type: .leftMouseUp, at: pt) {
|
||||
self.mouseUp(with: upEvent)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Legacy Touch Injection (macOS 15)
|
||||
|
||||
@discardableResult
|
||||
|
||||
Reference in New Issue
Block a user