Download input data from the course website. The file hw3_points3D.txt contains the coordinates of 60 scene points in 3D (each line of the file gives the $\tilde{X}_i$, $\tilde{Y}_i$, and $\tilde{Z}_i$ inhomogeneous coordinates of a point). The file hw3_points2D.txt contains the coordinates of the 60 corresponding image points in 2D (each line of the file gives the $\tilde{x}_i$ and $\tilde{y}_i$ inhomogeneous coordinates of a point). The corresponding 3D scene and 2D image points contain both inlier and outlier correspondences. For the inlier correspondences, the scene points have been randomly generated and projected to image points under a camera projection matrix (i.e., $\boldsymbol{x}_i = \boldsymbol{P} \boldsymbol{X}_i$), then noise has been added to the image point coordinates.
The camera calibration matrix was calculated for a $1280 \times 720$ sensor and $45\,^\circ$ horizontal field of view lens. The resulting camera calibration matrix is given by
$\boldsymbol{K} = \left[ \begin{array}{c c c} 1545.0966799187809 & 0 & 639.5\\ 0 & 1545.0966799187809 & 359.5\\ 0 & 0 & 1 \end{array}\right]$
For each image point $\boldsymbol{x} = (x, y, w)^\top = (\tilde{x}, \tilde{y}, 1)^\top$, calculate the point in normalized coordinates $\hat{\boldsymbol{x}} = \boldsymbol{K}^{-1} \boldsymbol{x}$.
Determine the set of inlier point correspondences using the M-estimator Sample Consensus (MSAC) algorithm, where the maximum number of attempts to find a consensus set is determined adaptively. For each trial, use the 3-point algorithm of Finsterwalder (as described in the paper by Haralick et al.) to estimate the camera pose (i.e., the rotation $\boldsymbol{R}$ and translation $\boldsymbol{t}$ from the world coordinate frame to the camera coordinate frame), resulting in up to 4 solutions, and calculate the error and cost for each solution. Note that the 3-point algorithm requires the 2D points in normalized coordinates, not in image coordinates. Calculate the projection error, which is the (squared) distance between projected points (the points in 3D projected under the normalized camera projection matrix $\hat{\boldsymbol{P}} = [\boldsymbol{R} | \boldsymbol{t}]$) and the measured points in normalized coordinates (hint: the error tolerance is simpler to calculate in image coordinates using $\boldsymbol{P} = \boldsymbol{K} [\boldsymbol{R} | \boldsymbol{t}]$ than in normalized coordinates using $\hat{\boldsymbol{P}} = [\boldsymbol{R} | \boldsymbol{t}]$).
Hint: this problem has codimension 2.
import numpy as np
import time
def Homogenize(x):
# converts points from inhomogeneous to homogeneous coordinates
return np.vstack((x,np.ones((1,x.shape[1]))))
def Dehomogenize(x):
# converts points from homogeneous to inhomogeneous coordinates
return x[:-1]/x[-1]
# load data
x0=np.loadtxt('hw3_points2D.txt').T
X0=np.loadtxt('hw3_points3D.txt').T
print('x is', x0.shape)
print('X is', X0.shape)
K = np.array([[1545.0966799187809, 0, 639.5],
[0, 1545.0966799187809, 359.5],
[0, 0, 1]])
print('K =')
print(K)
def ComputeCost(P, x, X, K):
# Inputs:
# P - camera projection matrix
# x - 2D groundtruth image points
# X - 3D groundtruth scene points
# K - camera calibration matrix
#
# Output:
# cost - total projection error
n = x.shape[1]
covarx = np.eye(2*n) # covariance propagation
"""your code here"""
homo_X = Homogenize(X)
proj_x = P @ homo_X
dehomo_x = Dehomogenize(proj_x)
norm_x = np.linalg.inv(K) @ Homogenize(x)
e = (Dehomogenize(norm_x) - dehomo_x).reshape(-1,1)
cost = e.T @ covarx @ e
return cost
def Finsterwalder(x1,x2,x3,X1,X2,X3):
# calculate the X_cam using 3 points Finsterwalder algorithm
# input: x are nomalized camera coordinates
# X are world frame coordinates
# output: X in cammare frame coordinates
#
p1 = np.array([])
p2 = np.array([])
p3 = np.array([])
j1 = x1/np.linalg.norm(x1)
j2 = x2/np.linalg.norm(x2)
j3 = x3/np.linalg.norm(x3)
a = np.linalg.norm(X2-X3)
b = np.linalg.norm(X1-X3)
c = np.linalg.norm(X1-X2)
#print(j1,j2,j3)
cos_alpha = j2 @ j3
cos_beta = j1 @ j3
cos_gamma = j1 @ j2
alpha = np.arccos(cos_alpha)
beta = np.arccos(cos_beta)
gamma = np.arccos(cos_gamma)
G = (c**2)*((c**2)*(np.sin(beta)**2) - (b**2)*(np.sin(gamma)**2))
#print(G)
H = ((b**2)*(b**2 - a**2)*(np.sin(gamma)**2)
+ (c**2)*(c**2 + 2*a**2)*(np.sin(beta)**2)
+ 2*(b**2)*(c**2)*(-1 + cos_alpha*cos_beta*cos_gamma))
#print(H)
I = ((b**2)*(b**2 - c**2)*(np.sin(alpha)**2)
+ (a**2)*(a**2 + 2*c**2)*(np.sin(beta)**2)
+ 2*(a**2)*(b**2)*(-1 + cos_alpha*cos_beta*cos_gamma))
#print(I)
J = (a**2)*((a**2)*(np.sin(beta)**2) - (b**2)*(np.sin(alpha)**2))
#print(J)
lambda_0 = np.roots([G, H, I, J])
lam = lambda_0[np.isreal(lambda_0)]
#print(lam)
for i in range(lam.shape[0]):
A = 1 + lam[i]
B = -cos_alpha;
C = (b**2 - a**2)/(b**2) - lam[i]*(c**2)/(b**2)
D = -lam[i]*cos_gamma
E = ((a**2)/(b**2) + lam[i]*(c**2)/(b**2))*cos_beta
F = -(a**2)/(b**2) + lam[i]*(b**2-c**2)/(b**2)
p = np.sqrt(B**2 - A*C)
q = np.sign(B*E - C*D)*np.sqrt(E**2 - C*F)
#print(E**2 - C*F)
m = np.array([ (-B + p)/C ,(-B - p)/C])
n = np.array([ -(E - q)/C, -(E + q)/C])
A = b**2 - (m**2)*(c**2)
B = (c**2)*(cos_beta - n) * m - (b**2)*cos_gamma
C = -(c**2)*(n**2) + 2*(c**2)*n*cos_beta + b**2 - c**2
u_large = (-np.sign(B) / A) * (np.abs(B) + np.sqrt(B**2 - A*C))
#print(B**2 - A*C)
u_small = C / (A * u_large)
u = np.array([u_large,u_small])
#v = u*m.reshape(-1,1)+n.reshape(-1,1)
v = u*m+n
#print(m,n)
u = u.reshape((-1))
v = v.reshape((-1))
u = u[np.isreal(u)]
v = v[np.isreal(v)]
u = u[~np.isnan(u)]
v = v[~np.isnan(v)]
if u.shape[0] != 0:
s1 = np.sqrt(c**2 / (1 + u**2 - 2*u*cos_gamma))
s2 = u*s1
s3 = v*s1
p1 = j1.reshape(-1,1)@s1.reshape(1,-1)
p2 = j2.reshape(-1,1)@s2.reshape(1,-1)
p3 = j3.reshape(-1,1)@s3.reshape(1,-1)
#print(p1)
break
return p1,p2,p3
def CalModel(X1_cam,X2_cam,X3_cam,X1,X2,X3):
#
# Estimate camera pose using 3 points
#
P = np.array([])
n = 3
if X1_cam.shape[0] == 0:
n_sol = 0
else:
n_sol = X1_cam.shape[1]
X = np.array([X1,X2,X3]).T
mu_X = np.mean(X,axis=1).reshape(-1,1)
sigma_x = np.linalg.norm(X - mu_X,'fro')**2/n
err = np.inf
for i in range(n_sol):
Y = np.array([X1_cam[:,i],X2_cam[:,i],X3_cam[:,i]]).T
mu_Y = np.mean(Y,axis=1).reshape(-1,1)
sigma_xy = ((Y-mu_Y) @ (X-mu_X).T)/n
[U,D,V] = np.linalg.svd(sigma_xy)
V = V.T
m = 3
S = np.eye(m)
if np.linalg.matrix_rank(sigma_xy) < m-1:
continue
elif np.linalg.matrix_rank(sigma_xy) == m-1:
if np.round(np.linalg.det(U)*np.linalg.det(V)) == -1:
S[-1,-1] = -1
else:
if np.linalg.det(sigma_xy) < 0:
S[-1,-1] = -1
R = U @ S @ V.T
#print(R)
c = np.trace(np.diag(D) @ S)/sigma_x
t = mu_Y - c*R@mu_X
#print(t)
if np.linalg.norm(Y - c*R@X - t)**2 < err:
err = np.linalg.norm(Y - c*R@X - t)**2
P = np.hstack((R,t))
return P
from scipy.stats import chi2
import random
def MSAC(x, X, K, thresh, tol, p):
# Inputs:
# x - 2D inhomogeneous image points
# X - 3D inhomogeneous scene points
# K - camera calibration matrix
# thresh - cost threshold
# tol - reprojection error tolerance
# p - probability that as least one of the random samples does not contain any outliers
#
# Output:
# consensus_min_cost - final cost from MSAC
# consensus_min_cost_model - camera projection matrix P
# inliers - list of indices of the inliers corresponding to input data
# trials - number of attempts taken to find consensus set
"""your code here"""
x0 = x
x = np.linalg.inv(K) @ Homogenize(x) # homo normalized
X_homo = Homogenize(X)
#x = Dehomogenize(x)
tol = chi2.ppf(0.95,2)
p = 0.99
trials = 0
max_trials = np.inf
consensus_min_cost = np.inf
consensus_min_cost_model = np.zeros((3,4))
inliers = np.random.randint(0, 59, size=10)
while (trials < max_trials and consensus_min_cost > thresh):
trials = trials + 1
num = random.sample(range(0, 60), 3)
x1 = x[:,num[0]]
x2 = x[:,num[1]]
x3 = x[:,num[2]]
X1 = X[:,num[0]]
X2 = X[:,num[1]]
X3 = X[:,num[2]]
[X1_cam,X2_cam,X3_cam] = Finsterwalder(x1,x2,x3,X1,X2,X3)
# model
P_hat = CalModel(X1_cam,X2_cam,X3_cam,X1,X2,X3)
if P_hat.shape[0] == 0:
continue
trials = trials + 1
x_projected = K @ P_hat @ X_homo
#x_projected = P_hat @ X_homo
# error
error = (Dehomogenize(x_projected) - x0)**2
#error = (Dehomogenize(x_projected) - Dehomogenize(x))**2
error = np.sum(error,axis=0)
# cost
cost = 0
for i in range(x.shape[1]):
if error[i] <= tol:
cost = cost + error[i]
else:
cost = cost + tol
if cost < consensus_min_cost:
consensus_min_cost = cost
consensus_min_cost_model = P_hat
n_inliers = (error <= tol)
w = np.sum(n_inliers)/x.shape[1]
max_trials = np.log(1-p) / np.log(1-w**3)
inliers = np.where(n_inliers == True)[0]
print("p = " + str(p))
print("tolerance = " + str(tol))
print("num_inliers = " + str(inliers.shape[0]))
print("num_attempts = " + str(trials))
return consensus_min_cost, consensus_min_cost_model, inliers, trials
# MSAC parameters
thresh = 100
tol = 0
p = 0
alpha = 0
tic=time.time()
cost_MSAC, P_MSAC, inliers, trials = MSAC(x0, X0, K, thresh, tol, p)
# choose just the inliers
x = x0[:,inliers]
X = X0[:,inliers]
toc=time.time()
time_total=toc-tic
# display the results
print('took %f secs'%time_total)
# print('%d iterations'%trials)
# print('inlier count: ',len(inliers))
print('MSAC Cost=%.9f'%cost_MSAC)
print('P = ')
print(P_MSAC)
print('inliers: ',inliers)
Estimate the normalized camera projection matrix $\hat{\boldsymbol{P}}_\text{linear} = [\boldsymbol{R}_\text{linear} | \boldsymbol{t}_\text{linear}]$ from the resulting set of inlier correspondences using the linear estimation method (based on the EPnP method) described in lecture. Report the resulting $\boldsymbol{R}_\text{linear}$ and $\boldsymbol{t}_\text{linear}$.
def EPnP(x, X, K):
# Inputs:
# x - 2D inlier points
# X - 3D inlier points
# Output:
# P - normalized camera projection matrix
"""your code here"""
x = np.linalg.inv(K) @ Homogenize(x)
x = Dehomogenize(x)
n = x.shape[1]
mu_X = np.mean(X,axis=1)
sigma_X = np.cov(X)
[U,D,VT] = np.linalg.svd(sigma_X)
V = VT.T
var_X = np.sum(D)
s = np.sqrt(var_X / 3)
C1 = (mu_X).reshape(-1,1)
C2 = (s*V[:,0] + mu_X).reshape(-1,1)
C3 = (s*V[:,1] + mu_X).reshape(-1,1)
C4 = (s*V[:,2] + mu_X).reshape(-1,1)
A = np.hstack((C2-C1,C3-C1,C4-C1))
b = X - C1
alpha234 = np.linalg.solve(A,b)
alpha1 = (1 - np.sum(alpha234,axis=0)).reshape(1,-1)
alpha = np.vstack((alpha1,alpha234))
m = np.zeros((2*n,12))
for i in range(n):
m[2*i,:] = np.array([alpha[0,i],0, -alpha[0,i]*x[0,i],
alpha[1,i], 0, -alpha[1,i]*x[0,i],
alpha[2,i], 0, -alpha[2,i]*x[0,i],
alpha[3,i], 0, -alpha[3,i]*x[0,i]])
m[2*i+1,:] = np.array([0, alpha[0,i], -alpha[0,i]*x[1,i],
0, alpha[1,i], -alpha[1,i]*x[1,i],
0, alpha[2,i], -alpha[2,i]*x[1,i],
0, alpha[3,i], -alpha[3,i]*x[1,i]])
[U,D,VT] = np.linalg.svd(m)
V = VT.T
C1_cam = V[0:3,-1].reshape(-1,1)
C2_cam = V[3:6,-1].reshape(-1,1)
C3_cam = V[6:9,-1].reshape(-1,1)
C4_cam = V[9:12,-1].reshape(-1,1)
X_cam = np.hstack([C1_cam,C2_cam,C3_cam,C4_cam]) @ alpha
mu_X_cam = np.mean(X_cam,axis=1)
sigma_X_cam = np.cov(X_cam)
var_X_cam = np.trace(sigma_X_cam)
if mu_X_cam[2] < 0:
beta = - np.sqrt(var_X/var_X_cam)
else:
beta = np.sqrt(var_X/var_X_cam)
#print(beta)
X_cam = beta*X_cam
mu_X = np.mean(X,axis=1).reshape(-1,1)
mu_Y = np.mean(X_cam, axis=1).reshape(-1,1)
#print(mu_X,mu_Y)
sigma_xy = ((X_cam-mu_Y)@(X-mu_X).T)/n
[U,D,VT] = np.linalg.svd(sigma_xy)
S = np.eye(3)
if np.linalg.matrix_rank(sigma_xy) >= 2:
if np.linalg.det(sigma_xy) < 0:
S[-1,-1] = -1
R = U @ S @ VT
c = 1
t = mu_Y - c*R @ mu_X
P = np.hstack((R,t))
else:
P = np.array([])
return P
tic=time.time()
P_linear = EPnP(x, X, K)
toc=time.time()
time_total=toc-tic
# display the results
print('took %f secs'%time_total)
print('R_linear = ')
print(P_linear[:,0:3])
print('t_linear = ')
print(P_linear[:,-1])
Use $\boldsymbol{R}_\text{linear}$ and $\boldsymbol{t}_\text{linear}$ as an initial estimate to an iterative estimation method, specifically the Levenberg-Marquardt algorithm, to determine the Maximum Likelihood estimate of the camera pose that minimizes the projection error under the normalized camera projection matrix $\hat{\boldsymbol{P}} = [\boldsymbol{R} | \boldsymbol{t}]$. You must parameterize the camera rotation using the angle-axis representation $\boldsymbol{\omega}$ (where $[\boldsymbol{\omega}]_\times = \ln \boldsymbol{R}$) of a 3D rotation, which is a 3-vector.
Report the initial cost (i.e. cost at iteration 0) and the cost at the end of each successive iteration. Show the numerical values for the final estimate of the camera rotation $\boldsymbol{\omega}_\text{LM}$ and $\boldsymbol{R}_\text{LM}$, and the camera translation $\boldsymbol{t}_\text{LM}$.
from scipy.linalg import block_diag
# Note that np.sinc is different than defined in class
def Sinc(x):
"""your code here"""
if x == 0:
y = 1
else :
y = np.sin(x)/x
return y
def d_sinc(x):
if x == 0:
return 0
else:
return (np.cos(x)/(x)-np.sin(x)/(x**2))
def skew(w):
# Returns the skew-symmetrix represenation of a vector
"""your code here"""
w_skew = np.array([[0, -w[2], w[1]],
[w[2],0,-w[0]],
[-w[1],w[0],0]])
return w_skew
def Parameterize(R):
# Parameterizes rotation matrix into its axis-angle representation
"""your code here"""
[U,D,VT] = np.linalg.svd(R-np.eye(3))
v = VT[-1,:].reshape(-1,1)
v_hat = np.array([R[2,1] - R[1,2],
R[0,2] - R[2,0],
R[1,0] - R[0,1]]).reshape(-1,1)
sin_theta = (v.T@v_hat)/2
cos_theta = (np.trace(R) - 1)/2
theta = np.arctan2(sin_theta,cos_theta)
w = theta * v / np.linalg.norm(v)
if theta > np.pi:
w = w*( 1 - 2*np.pi/theta * np.ceil((theta-np.pi)/(2*np.pi)))
return w, theta
def Deparameterize(w):
# Deparameterizes to get rotation matrix
"""your code here"""
w = w.reshape(-1,1)
theta = np.linalg.norm(w)
Wx = skew(w)
R = (np.cos(theta)*np.eye(3) + Sinc(theta)*Wx
+ (1-np.cos(theta))/(theta**2)* w @ w.T)
return R
def Jacobian(R, w, t, X):
# compute the jacobian matrix
# Inputs:
# R - 3x3 rotation matrix
# w - 3x1 axis-angle parameterization of R
# t - 3x1 translation vector
# X - 3D inlier points
#
# Output:
# J - Jacobian matrix of size 2*nx6
"""your code here"""
A = np.zeros((1,6))
t = t.reshape(-1,1)
w = w.reshape(-1,1)
for i in range(X.shape[1]):
Xi = X[:,i].reshape(-1,1)
P = np.hstack([R,t])
x_hat = P @ Homogenize(Xi)
x_hat = Dehomogenize(x_hat)
X_rotated = R @ Xi
w_hat = X_rotated[2,0] + t[2,0]
d_x_hat_X_rotated = np.array([[1/w_hat,0,-x_hat[0]/w_hat],
[0,1/w_hat,-x_hat[1]/w_hat]])
theta = np.linalg.norm(w)
if theta < np.pi/36:
d_X_rotated_w = skew(-X_rotated)
else:
d_X_rotated_w = (Sinc(theta)*skew(-X_rotated)
+ (skew(w)@X_rotated)*d_sinc(theta)@(1/theta*w.T)
+ skew(w) @ (skew(w)@X_rotated)
*((theta*np.sin(theta)
- 2*(1-np.cos(theta)))/(theta**3))@(1/theta*w.T)
+ ((1-np.cos(theta))/(theta**2))
*(skew(w)@skew(-X_rotated)+skew(-skew(w)@X_rotated)) )
d_x_hat_w = d_x_hat_X_rotated @ d_X_rotated_w
w_hat = (R[2,:]@X + t[2,0])[0]
d_x_hat_t = np.array([[1/w_hat,0,-x_hat[0]/w_hat],
[0,1/w_hat,-x_hat[1]/w_hat]])
Ai = np.hstack([d_x_hat_w,d_x_hat_t])
A = np.vstack((A,Ai))
J = A[1:,:]
return J
def LM(P, x, X, K, max_iters, lam):
# Inputs:
# P - initial estimate of camera pose
# x - 2D inliers
# X - 3D inliers
# K - camera calibration matrix
# max_iters - maximum number of iterations
# lam - lambda parameter
#
# Output:
# P - Final camera pose obtained after convergence
"""your code here"""
n = X.shape[1]
x_normalized = np.linalg.inv(K) @ Homogenize(x)
x_projected_normalized = P @ Homogenize(X)
R_l = P[:,0:3]
t_l = P[:,-1].reshape(-1,1)
w_l,theta = Parameterize(R_l)
p = np.vstack([w_l,t_l])
previous_cost = np.inf
# step 1
K_inv = np.linalg.inv(K)
sigma = np.eye(2*n)*(K_inv[0,0]**2)
err = Dehomogenize(x_normalized) - Dehomogenize(x_projected_normalized)
err = err.reshape(-1,1)
iters = 0
for i in range(max_iters):
#step 2
R = P[:,0:3]
t = P[:,-1].reshape(-1,1)
w,theta = Parameterize(R)
J = Jacobian(R, w, t, X)
# step 3
norm_U = J.T @ np.linalg.inv(sigma) @ J
E = J.T @ np.linalg.inv(sigma) @ err
# step 4
flag = 1
while flag:
delta = np.linalg.solve((norm_U + lam*np.eye(6)),E)
# step 5
p0 = p + delta
# step 6
w0 = p0[:3]
t0 = p0[3:].reshape(-1,1)
R0 = Deparameterize(w0)
P0 = np.hstack((R0,t0))
x_projected_normalized = P0 @ Homogenize(X)
err0 = Dehomogenize(x_normalized) - Dehomogenize(x_projected_normalized)
err0 = err0.reshape(-1,1)
# step 7
cost = err.T @ np.linalg.inv(sigma) @ err
cost0 = err0.T @ np.linalg.inv(sigma) @ err0
if cost0 < cost:
P = P0
err = err0
lam = 0.1 * lam
iters = iters + 1
flag = 0
else :
lam = 10 * lam
print('iter %03d Cost %.9f'%(i+1, cost))
# terminate condition
if 1e-7 > (1 - (cost0 / cost)):
break
return P
# LM hyperparameters
lam = .001
max_iters = 100
tic = time.time()
P_LM = LM(P_linear, x, X, K, max_iters, lam)
w_LM,_ = Parameterize(P_LM[:,0:3])
toc = time.time()
time_total = toc-tic
# display the results
print('took %f secs'%time_total)
print('w_LM = ')
print(w_LM)
print('R_LM = ')
print(P_LM[:,0:3])
print('t_LM = ')
print(P_LM[:,-1])