Example Multiview Scene Reconstruction

From BoofCV
Revision as of 19:28, 18 June 2014 by Peter (talk | contribs) (Created page with "<center> <gallery widths=640px heights=260px> file:Example_sfm_chair.jpg | 3D point cloud created by combining 20 images of a chair. </gallery> </center> Example showing how...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

Example showing how multiple views from a calibrated camera can be combined together to estimate the camera location and the 3D location of interest points. As discussed in the source code this is still a work in progress and the output is very noisy still.

Example Code:

Concepts:

  • Structure from Motion
  • Multiple Views
  • Single camera
  • calibrated camera

Relevant Applets:

Example Code

/**
 * Demonstration on how to do 3D reconstruction from a set of unordered photos with known intrinsic camera calibration.
 * The code below is still a work in process and is very basic, but still require a solid understanding of
 * structure from motion to understand.  In other words, this is not for beginners and requires good clean set of
 * images to work.
 *
 * One key element it is missing is bundle adjustment to improve the estimated camera location and 3D points.  The
 * current bundle adjustment in BoofCV is too inefficient.   Better noise removal and numerous other improvements
 * are needed before it can compete with commercial equivalents.
 *
 * @author Peter Abeles
 */
public class ExampleStructureFromMotion {

	// Converts a point from pixel to normalized image coordinates
	PointTransform_F64 pixelToNorm;

	// ratio of matching features to unmatched features for two images to be considered connected
	double connectThreshold = 0.3;

	// tolerance for inliers in pixels
	double inlierTol = 1.5;

	// Detects and describes image interest points
	DetectDescribePoint<ImageFloat32, SurfFeature> detDesc = FactoryDetectDescribe.surfStable(null, null, null, ImageFloat32.class);
	// score ans association algorithm
	ScoreAssociation<SurfFeature> scorer = FactoryAssociation.scoreEuclidean(SurfFeature.class, true);
	AssociateDescription<SurfFeature> associate = FactoryAssociation.greedy(scorer, 1, true);

	// Triangulates the 3D coordinate of a point from two observations
	TriangulateTwoViewsCalibrated triangulate = FactoryTriangulate.twoGeometric();

	// List of visual features (e.g. SURF) descriptions in each image
	List<FastQueue<SurfFeature>> imageVisualFeatures = new ArrayList<FastQueue<SurfFeature>>();
	// List of visual feature locations as normalized image coordinates in each image
	List<FastQueue<Point2D_F64>> imagePixels = new ArrayList<FastQueue<Point2D_F64>>();
	// Color of the pixel at each feature location
	List<GrowQueue_I32> imageColors = new ArrayList<GrowQueue_I32>();
	// List of 3D features in each image
	List<List<Feature3D>> imageFeature3D = new ArrayList<List<Feature3D>>();

	// Transform from world to each camera image
	Se3_F64 motionWorldToCamera[];

	// indicates if an image has had its motion estimated yet
	boolean estimatedImage[];
	// if true the image has been processed.  Estimation could have failed. so this can be true but estimated false
	boolean processedImage[];

	// List of all 3D features
	List<Feature3D> featuresAll = new ArrayList<Feature3D>();

	// used to provide initial estimate of the 3D scene
	ModelMatcher<Se3_F64, AssociatedPair> estimateEssential;
	ModelMatcher<Se3_F64, Point2D3D> estimatePnP;
	ModelFitter<Se3_F64, Point2D3D> refinePnP = FactoryMultiView.refinePnP(1e-12,40);

	/**
	 * Process the images and reconstructor the scene as a point cloud using matching interest points between
	 * images.
	 */
	public void process(IntrinsicParameters intrinsic , List<BufferedImage> colorImages ) {

		pixelToNorm = LensDistortionOps.transformRadialToNorm_F64(intrinsic);

		setupEssential(intrinsic);
		setupPnP(intrinsic);

		// find features in each image
		detectImageFeatures(colorImages);

		// see which images are the most similar to each o ther
		double[][] matrix = computeConnections();

		printConnectionMatrix(matrix);

		// find the image which is connected to the most other images.  Use that as the origin of the arbitrary
		// coordinate system
		int bestImage = selectMostConnectFrame(colorImages, matrix);

		// Use two images to initialize the scene reconstruction
		initializeReconstruction(colorImages, matrix, bestImage);

		// Process rest of the images and compute 3D coordinates
		List<Integer> seed = new ArrayList<Integer>();
		seed.add(bestImage);
		performReconstruction(seed, -1, matrix);

		// Bundle adjustment would normally be done at this point, but has been omitted since the current
		// implementation is too slow for a large number of points

		// display a point cloud from the 3D features
		PointCloudViewer gui = new PointCloudViewer(intrinsic,1);

		for( Feature3D t : featuresAll) {
			gui.addPoint(t.worldPt.x,t.worldPt.y,t.worldPt.z,t.color);
		}

		gui.setPreferredSize(new Dimension(500,500));
		ShowImages.showWindow(gui, "Points");
	}

	/**
	 * Initialize the reconstruction by finding the image which is most similar to the "best" image.  Estimate
	 * its pose up to a scale factor and create the initial set of 3D features
	 */
	private void initializeReconstruction(List<BufferedImage> colorImages, double[][] matrix, int bestImage) {
		// Set all images, but the best one, as not having been estimated yet
		estimatedImage = new boolean[colorImages.size()];
		processedImage = new boolean[colorImages.size()];
		estimatedImage[bestImage] = true;
		processedImage[bestImage] = true;

		// declare stored for found motion of each image
		motionWorldToCamera = new Se3_F64[colorImages.size()];
		for (int i = 0; i < colorImages.size(); i++) {
			motionWorldToCamera[i] = new Se3_F64();
			imageFeature3D.add(new ArrayList<Feature3D>());
		}

		// pick the image most similar to the original image to initialize pose estimation
		int firstChild = findBestFit(matrix, bestImage);
		initialize(bestImage, firstChild);
	}

	/**
	 * Select the frame which has the most connections to all other frames.  The is probably a good location
	 * to start since it will require fewer hops to estimate the motion of other frames
	 */
	private int selectMostConnectFrame(List<BufferedImage> colorImages, double[][] matrix) {
		int bestImage = -1;
		int bestCount = 0;
		for (int i = 0; i < colorImages.size(); i++) {
			int count = 0;
			for (int j = 0; j < colorImages.size(); j++) {
				if( matrix[i][j] > connectThreshold ) {
					count++;
				}
			}
			System.out.println(i+"  count "+count);
			if( count > bestCount ) {
				bestCount = count;
				bestImage = i;
			}
		}
		return bestImage;
	}

	/**
	 * Detect image features in all the images.  Save location, description, and color
	 */
	private void detectImageFeatures(List<BufferedImage> colorImages) {
		System.out.println("Detecting Features in each image.  Total "+colorImages.size());
		for (int i = 0; i < colorImages.size(); i++) {
			System.out.print("*");
			BufferedImage colorImage = colorImages.get(i);
			FastQueue<SurfFeature> features = new SurfFeatureQueue(64);
			FastQueue<Point2D_F64> pixels = new FastQueue<Point2D_F64>(Point2D_F64.class, true);
			GrowQueue_I32 colors = new GrowQueue_I32();
			detectFeatures(colorImage, features, pixels, colors);

			imageVisualFeatures.add(features);
			imagePixels.add(pixels);
			imageColors.add(colors);
		}
		System.out.println();
	}

	/**
	 * Compute connectivity matrix based on fraction of matching image features
	 */
	private double[][] computeConnections() {
		double matrix[][] = new double[imageVisualFeatures.size()][imageVisualFeatures.size()];

		for (int i = 0; i < imageVisualFeatures.size(); i++) {
			for (int j = i+1; j < imageVisualFeatures.size(); j++) {
				System.out.printf("Associated %02d %02d ",i,j);
				associate.setSource(imageVisualFeatures.get(i));
				associate.setDestination(imageVisualFeatures.get(j));
				associate.associate();

				matrix[i][j] = associate.getMatches().size()/(double) imageVisualFeatures.get(i).size();
				matrix[j][i] = associate.getMatches().size()/(double) imageVisualFeatures.get(j).size();

				System.out.println(" = "+matrix[i][j]);
			}
		}
		return matrix;
	}

	/**
	 * Prints out which frames are connected to each other
	 */
	private void printConnectionMatrix( double[][] matrix) {
		for (int i = 0; i < matrix.length; i++) {
			for (int j = 0; j < matrix.length; j++) {
				if( matrix[i][j] >= connectThreshold )
					System.out.print("#");
				else
					System.out.print(".");
			}
			System.out.println();
		}
	}

	/**
	 * Configure robust algorithm for estimating pose from 3D features
	 */
	private void setupPnP( IntrinsicParameters intrinsic ) {
		Estimate1ofPnP estimator = FactoryMultiView.computePnP_1(EnumPNP.P3P_FINSTERWALDER, -1, 2);
		final DistanceModelMonoPixels<Se3_F64,Point2D3D> distance = new PnPDistanceReprojectionSq();
		distance.setIntrinsic(intrinsic.fx, intrinsic.fy, intrinsic.skew);

		ModelManagerSe3_F64 manager = new ModelManagerSe3_F64();
		EstimatorToGenerator<Se3_F64,Point2D3D> generator = new EstimatorToGenerator<Se3_F64,Point2D3D>(estimator);

		// 1/2 a pixel tolerance for RANSAC inliers
		double ransacTOL = inlierTol * inlierTol;

		estimatePnP = new Ransac<Se3_F64, Point2D3D>(2323, manager, generator, distance, 4000, ransacTOL);
	}

	/**
	 * Configure robust algorithm for estimating pose from essential matrix
	 */
	private void setupEssential( IntrinsicParameters intrinsic ) {
		// Since this is the first frame estimate the camera motion up to a translational scale factor
		Estimate1ofEpipolar essentialAlg = FactoryMultiView.computeFundamental_1(EnumEpipolar.ESSENTIAL_5_NISTER, 5);

		ModelManager<Se3_F64> manager = new ModelManagerSe3_F64();
		ModelGenerator<Se3_F64, AssociatedPair> generateEpipolarMotion =
				new Se3FromEssentialGenerator(essentialAlg, triangulate);

		DistanceFromModel<Se3_F64, AssociatedPair> distanceSe3 =
				new DistanceSe3SymmetricSq(triangulate,
						intrinsic.fx, intrinsic.fy, intrinsic.skew,
						intrinsic.fx, intrinsic.fy, intrinsic.skew);

		// tolerance for RANSAC inliers
		double ransacTOL = inlierTol * inlierTol * 2.0;

		estimateEssential = new Ransac<Se3_F64, AssociatedPair>(2323, manager, generateEpipolarMotion, distanceSe3,
				4000, ransacTOL);
	}

	/**
	 * Detects image features.  Saves their location, description, and pixel color
	 */
	private void detectFeatures(BufferedImage colorImage,
								FastQueue<SurfFeature> features, FastQueue<Point2D_F64> pixels,
								GrowQueue_I32 colors ) {

		ImageFloat32 image = ConvertBufferedImage.convertFrom(colorImage, (ImageFloat32) null);

		features.reset();
		pixels.reset();
		colors.reset();
		detDesc.detect(image);
		for (int i = 0; i < detDesc.getNumberOfFeatures(); i++) {
			Point2D_F64 p = detDesc.getLocation(i);

			features.grow().set(detDesc.getDescription(i));
			// store pixels are normalized image coordinates
			pixelToNorm.compute(p.x, p.y, pixels.grow());

			colors.add( colorImage.getRGB((int)p.x,(int)p.y) );
		}
	}

	/**
	 * Finds the frame which is the best match for the given target frame
	 */
	private int findBestFit( double matrix[][] , int target ) {

		// find the image which is the closest fit
		int bestIndex = -1;
		double bestRatio = 0;

		for (int i = 0; i < estimatedImage.length; i++) {
			double ratio = matrix[target][i];
			if( ratio > bestRatio ) {
				bestRatio = ratio;
				bestIndex = i;
			}
		}

		return bestIndex;
	}

	/**
	 * Initialize the 3D world given these two images.  imageA is assumed to be the origin of the world.
	 */
	private void initialize( int imageA , int imageB ) {
		System.out.println("Initializing 3D world using "+imageA+" and "+imageB);

		// Compute the 3D pose and find valid image features
		Se3_F64 motionAtoB = new Se3_F64();
		List<AssociatedIndex> inliers = new ArrayList<AssociatedIndex>();

		if( !estimateStereoPose(imageA, imageB, motionAtoB, inliers))
			throw new RuntimeException("The first image pair is a bad keyframe!");

		motionWorldToCamera[imageB].set(motionAtoB);
		estimatedImage[imageB] = true;
		processedImage[imageB] = true;

		// create tracks for only those features in the inlier list
		FastQueue<Point2D_F64> pixelsA = imagePixels.get(imageA);
		FastQueue<Point2D_F64> pixelsB = imagePixels.get(imageB);
		List<Feature3D> tracksA = imageFeature3D.get(imageA);
		List<Feature3D> tracksB = imageFeature3D.get(imageB);

		GrowQueue_I32 colorsA = imageColors.get(imageA);

		for (int i = 0; i < inliers.size(); i++) {
			AssociatedIndex a = inliers.get(i);

			Feature3D t = new Feature3D();
			t.color = colorsA.get(a.src);
			t.obs.grow().set( pixelsA.get(a.src) );
			t.obs.grow().set( pixelsB.get(a.dst) );
			t.frame.add(imageA);
			t.frame.add(imageB);
			// compute the 3D coordinate of the feature
			Point2D_F64 pa = pixelsA.get(a.src);
			Point2D_F64 pb = pixelsB.get(a.dst);

			if( !triangulate.triangulate(pa, pb, motionAtoB, t.worldPt) )
				continue;
			// the feature has to be in front of the camera
			if (t.worldPt.z > 0) {
				featuresAll.add(t);
				tracksA.add(t);
				tracksB.add(t);
			}
		}

		// adjust the scale so that it's not excessively large or small
		normalizeScale(motionWorldToCamera[imageB],tracksA);
	}

	/**
	 * Perform a breadth first search through connection graph until the motion to all images has been found
	 */
	private void performReconstruction(List<Integer> parents, int childAdd, double matrix[][]) {

		System.out.println("--------- Total Parents "+parents.size());

		List<Integer> children = new ArrayList<Integer>();

		if( childAdd != -1 ) {
			children.add(childAdd);
		}

		for( int parent : parents ) {
			for (int i = 0; i < estimatedImage.length; i++) {
				// see if it is connected to the target and has not had its motion estimated
				if( matrix[parent][i] > connectThreshold && !processedImage[i] ) {
					estimateMotionPnP(parent,i);
					children.add(i);
				}
			}
		}

		if( !children.isEmpty() )
			performReconstruction(children, -1, matrix);
	}

	/**
	 * Estimate the motion between two images.  Image A is assumed to have known features with 3D coordinates already
	 * and image B is an unprocessed image with no 3D features yet.
	 */
	private void estimateMotionPnP( int imageA , int imageB ) {
		// Mark image B as processed so that it isn't processed a second time.
		processedImage[imageB] = true;

		System.out.println("Estimating PnP motion between "+imageA+" and "+imageB);

		// initially prune features using essential matrix
		Se3_F64 dummy = new Se3_F64();
		List<AssociatedIndex> inliers = new ArrayList<AssociatedIndex>();

		if( !estimateStereoPose(imageA, imageB, dummy, inliers))
			throw new RuntimeException("The first image pair is a bad keyframe!");

		FastQueue<Point2D_F64> pixelsA = imagePixels.get(imageA);
		FastQueue<Point2D_F64> pixelsB = imagePixels.get(imageB);
		List<Feature3D> featuresA = imageFeature3D.get(imageA);
		List<Feature3D> featuresB = imageFeature3D.get(imageB); // this should be empty

		// create the associated pair for motion estimation
		List<Point2D3D> features = new ArrayList<Point2D3D>();
		List<AssociatedIndex> inputRansac = new ArrayList<AssociatedIndex>();
		List<AssociatedIndex> unmatched = new ArrayList<AssociatedIndex>();
		for (int i = 0; i < inliers.size(); i++) {
			AssociatedIndex a = inliers.get(i);
			Feature3D t = lookupFeature(featuresA, imageA, pixelsA.get(a.src));
			if( t != null ) {
				Point2D_F64 p = pixelsB.get(a.dst);
				features.add(new Point2D3D(p, t.worldPt));
				inputRansac.add(a);
			} else {
				unmatched.add(a);
			}
		}

		// make sure there are enough features to estimate motion
		if( features.size() < 15 ) {
			System.out.println("  Too few features for PnP!!  "+features.size());
			return;
		}

		// estimate the motion between the two images
		if( !estimatePnP.process(features))
			throw new RuntimeException("Motion estimation failed");

		// refine the motion estimate using non-linear optimization
		Se3_F64 motionWorldToB = new Se3_F64();
		if( !refinePnP.fitModel(estimatePnP.getMatchSet(), estimatePnP.getModelParameters(), motionWorldToB) )
			throw new RuntimeException("Refine failed!?!?");

		motionWorldToCamera[imageB].set(motionWorldToB);
		estimatedImage[imageB] = true;

		// Add all tracks in the inlier list to the B's list of 3D features
		int N = estimatePnP.getMatchSet().size();
		boolean inlierPnP[] = new boolean[features.size()];
		for (int i = 0; i < N; i++) {
			int index = estimatePnP.getInputIndex(i);
			AssociatedIndex a = inputRansac.get(index);

			// find the track that this was associated with and add it to B
			Feature3D t = lookupFeature(featuresA, imageA, pixelsA.get(a.src));
			featuresB.add(t);
			t.frame.add(imageB);
			t.obs.grow().set( pixelsB.get(a.dst) );
			inlierPnP[index] = true;
		}

		// Create new tracks for all features which were a member of essential matrix but not used to estimate
		// the motion using PnP.
		Se3_F64 motionBtoWorld = motionWorldToB.invert(null);
		Se3_F64 motionWorldToA = motionWorldToCamera[imageA];
		Se3_F64 motionBtoA =  motionBtoWorld.concat(motionWorldToA, null);
		Point3D_F64 pt_in_b = new Point3D_F64();

		int totalAdded = 0;
		GrowQueue_I32 colorsA = imageColors.get(imageA);
		for( AssociatedIndex a : unmatched ) {

			if( !triangulate.triangulate(pixelsB.get(a.dst),pixelsA.get(a.src),motionBtoA,pt_in_b) )
				continue;

			// the feature has to be in front of the camera
			if( pt_in_b.z > 0 ) {
				Feature3D t = new Feature3D();

				// transform from B back to world frame
				SePointOps_F64.transform(motionBtoWorld, pt_in_b, t.worldPt);

				t.color = colorsA.get(a.src);
				t.obs.grow().set( pixelsA.get(a.src) );
				t.obs.grow().set( pixelsB.get(a.dst) );
				t.frame.add(imageA);
				t.frame.add(imageB);

				featuresAll.add(t);
				featuresA.add(t);
				featuresB.add(t);

				totalAdded++;
			}
		}

		// create new tracks for existing tracks which were not in the inlier set.  Maybe things will work
		// out better if the 3D coordinate is re-triangulated as a new feature
		for (int i = 0; i < features.size(); i++) {
			if( inlierPnP[i] )
				continue;

			AssociatedIndex a = inputRansac.get(i);

			if( !triangulate.triangulate(pixelsB.get(a.dst),pixelsA.get(a.src),motionBtoA,pt_in_b) )
				continue;

			// the feature has to be in front of the camera
			if( pt_in_b.z > 0 ) {
				Feature3D t = new Feature3D();

				// transform from B back to world frame
				SePointOps_F64.transform(motionBtoWorld, pt_in_b, t.worldPt);

				// only add this feature to image B since a similar one already exists in A.
				t.color = colorsA.get(a.src);
				t.obs.grow().set(pixelsB.get(a.dst));
				t.frame.add(imageB);

				featuresAll.add(t);
				featuresB.add(t);

				totalAdded++;
			}
		}

		System.out.println("  New added " + totalAdded + "  tracksA.size = " + featuresA.size() + "  tracksB.size = " + featuresB.size());
	}

	/**
	 * Given a list of 3D features, find the feature which was observed at the specified frame at the
	 * specified location.  If no feature is found return null.
	 */
	private Feature3D lookupFeature(List<Feature3D> features, int frameIndex, Point2D_F64 pixel) {
		for (int i = 0; i < features.size(); i++) {
			Feature3D t = features.get(i);
			for (int j = 0; j < t.frame.size(); j++) {
				if( t.frame.get(j) == frameIndex ) {
					Point2D_F64 o = t.obs.get(j);
					if( o.x == pixel.x && o.y == pixel.y ) {
						return t;
					} else {
						break;
					}
				}
			}
		}
		return null;
	}

	/**
	 * Given two images compute the relative location of each image using the essential matrix.
	 */
	protected boolean estimateStereoPose(int imageA, int imageB, Se3_F64 motionAtoB,
										 List<AssociatedIndex> inliers)
	{
		// associate the features together
		associate.setSource(imageVisualFeatures.get(imageA));
		associate.setDestination(imageVisualFeatures.get(imageB));
		associate.associate();

		FastQueue<AssociatedIndex> matches = associate.getMatches();

		// create the associated pair for motion estimation
		FastQueue<Point2D_F64> pixelsA = imagePixels.get(imageA);
		FastQueue<Point2D_F64> pixelsB = imagePixels.get(imageB);
		List<AssociatedPair> pairs = new ArrayList<AssociatedPair>();
		for (int i = 0; i < matches.size(); i++) {
			AssociatedIndex a = matches.get(i);
			pairs.add(new AssociatedPair(pixelsA.get(a.src), pixelsB.get(a.dst)));
		}

		if( !estimateEssential.process(pairs) )
			throw new RuntimeException("Motion estimation failed");

		List<AssociatedPair> inliersEssential = estimateEssential.getMatchSet();

		motionAtoB.set(estimateEssential.getModelParameters());

		for (int i = 0; i < inliersEssential.size(); i++) {
			int index = estimateEssential.getInputIndex(i);

			inliers.add( matches.get(index));
		}

		return true;
	}

	/**
	 * Scale can only be estimated up to a scale factor.  Might as well set the distance to 1 since it is
	 * less likely to have overflow/underflow issues.  This step is not strictly necessary.
	 */
	public void normalizeScale( Se3_F64 transform , List<Feature3D> features) {

		double T = transform.T.norm();
		double scale = 1.0/T;

		for( Se3_F64 m : motionWorldToCamera) {
			m.T.scale(scale);
		}

		for( Feature3D t : features) {
			t.worldPt.scale(scale);
		}
	}

	public static class Feature3D {
		// color of the pixel first found int
		int color;
		// estimate 3D position of the feature
		Point3D_F64 worldPt = new Point3D_F64();
		// observations in each frame that it's visible
		FastQueue<Point2D_F64> obs = new FastQueue<Point2D_F64>(Point2D_F64.class, true);
		// index of each frame its visible in
		GrowQueue_I32 frame = new GrowQueue_I32();
	}

	public static void main(String[] args) {

		String directory = "../data/applet/sfm/chair";

		IntrinsicParameters intrinsic = UtilIO.loadXML(directory+"/intrinsic_DSC-HX5_3648x2736_to_640x480.xml");

		List<BufferedImage> images = UtilImageIO.loadImages(directory,".*jpg");

		ExampleStructureFromMotion example = new ExampleStructureFromMotion();

		long before = System.currentTimeMillis();
		example.process(intrinsic,images);
		long after = System.currentTimeMillis();

		System.out.println("Elapsed time "+(after-before)/1000.0+" (s)");
	}
}