vivo 互联网服务器团队- Shuai Guangying
一、业务背景
GET /my_locations/_search{ "query": { "bool": { "must": { "match_all": {} }, "filter": { "geo_distance": { "distance": "1km", "pin.location": { "lat": 40, "lon": 116 } } } } }}
二、背景知识
-
1.如何精确定位一个地址?
-
2.如何计算两个地址距离?
这个公式非常简单,只需用到arcsin和cos两个高中数学公式。其中φ和λ表示两个点纬度和经度的弧度制度量。其中d即为所求两个点的距离,对应的数学公式如下(参考维基百科):
// 代码摘自lucene-core-8.2.0, SloppyMath工具类 /** * Returns the Haversine distance in meters between two points * given the previous result from {@link #haversinSortKey(double, double, double, double)} * @return distance in meters. */ public static double haversinMeters(double sortKey) { return TO_METERS * 2 * asin(Math.min(1, Math.sqrt(sortKey * 0.5))); } /** * Returns a sort key for distance. This is less expensive to compute than * {@link #haversinMeters(double, double, double, double)}, but it always compares the same. * This can be converted into an actual distance with {@link #haversinMeters(double)}, which * effectively does the second half of the computation. */ public static double haversinSortKey(double lat1, double lon1, double lat2, double lon2) { double x1 = lat1 * TO_RADIANS; double x2 = lat2 * TO_RADIANS; double h1 = 1 - cos(x1 - x2); double h2 = 1 - cos((lon1 - lon2) * TO_RADIANS); double h = h1 + cos(x1) * cos(x2) * h2; // clobber crazy precision so subsequent rounding does not create ties. return Double.longBitsToDouble(Double.doubleToRawLongBits(h) & 0xFFFFFFFFFFFFFFF8L); } // haversin // TODO: remove these for java 9, they fixed Math.toDegrees()/toRadians() to work just like this. public static final double TO_RADIANS = Math.PI / 180D; public static final double TO_DEGREES = 180D / Math.PI; // Earth's mean radius, in meters and kilometers; see private static final double TO_METERS = 6_371_008.7714D; // equatorial radius private static final double TO_KILOMETERS = 6_371.0087714D; // equatorial radius /** * Returns the Haversine distance in meters between two points * specified in decimal degrees (latitude/longitude). This works correctly * even if the dateline is between the two points. ** Error is at most 4E-1 (40cm) from the actual haversine distance, but is typically * much smaller for reasonable distances: around 1E-5 (0.01mm) for distances less than * 1000km. * * @param lat1 Latitude of the first point. * @param lon1 Longitude of the first point. * @param lat2 Latitude of the second point. * @param lon2 Longitude of the second point. * @return distance in meters. */ public static double haversinMeters(double lat1, double lon1, double lat2, double lon2) { return haversinMeters(haversinSortKey(lat1, lon1, lat2, lon2)); }
-
3.如何方便在互联网分享经纬度坐标?
-
Geohash给地图上每个坐标提供了独一无二的ID,这个唯一ID就相当于给每个地理位置提供了一个身份证。唯一ID在数据库中应用场景非常丰富。
-
在数据库中给坐标点提供了另一种存储方式,将二维的坐标点转化成为一维的字符串,对于一维数据就可以借助B树等索引来加速查询。
-
Geohash是一种前缀编码,位置相近的坐标点前缀相同。通过前缀提供了高性能的邻近位置POI查询,而邻近位置POI查询是LBS服务的核心能力。 关于Geohash的编码规则,这里不展开。这里最关键的点在于:
Geohash是一种前缀编码,位置相近的坐标点前缀相同。Geohash编码长度不同,所覆盖区域范围不同。
-
暴力算法
中心坐标点依次跟集合中每个坐标点计算距离,筛选出符合半径条件的坐标点。
-
二次筛选
-
基于坐标中心点计算出geohash, 基于半径确定geohash前缀。
-
通过Geohash前缀初筛出大致符合要求的坐标点(需要将中心点所在Geohash块周围8个Geohash块纳入初筛范围)。
-
对于初筛结果使用Haversine公式进行二次筛选。
三、方案演进
从2015年至今已经经历了6年的发展, 建设了如下的能力:
技术迭代大致可以分为3个阶段:
发展的成效显著,从性能测试的结果可以略窥一二:
总的来说,资源消耗降低的前提下搜索和写入数据效率都有大幅度提升。下面就详细介绍Elasticsearch对地理信息索引的思路。
3.1 史前时代
第一步: 通过关键词找到对应的倒排表。这一步简单来说就是查词典。例如:TermQuery.TermWeight 获取该term的倒排表,读取docId+freq信息。 第二步: 根据倒排表得到的docId和词频信息对文档进行打分,返回给用户分值最高的TopN结果。例如:TopScoreDocCollector -- collect()方法,基于小顶堆,保留分数最大的TopN文档。 第三步: 基于docId查询正排表获取文档字段明细信息。
例如:查词典可以用很多数据结构实现,比如跳跃表,平衡树、HashMap等,而Lucene的核心工程师Mike McCandless实现了一个只有他自己能懂的FST, 是综合了有限自动机和前缀树的一种数据结构,用来平衡查询复杂度和存储空间,比HashMap慢,但是空间消耗低。文档打分通常用小顶堆来维护分值最高的N个结果,如果有新的文档打分超过堆顶,则替换堆顶元素即可。
Added NumericRangeQuery and NumericRangeFilter, a fast alternative to RangeQuery/RangeFilter for numeric searches. They depend on a specific structure of terms in the index that can be created by indexing using the new NumericField or NumericTokenStream classes. NumericField can only be used for indexing and optionally stores the values as string representation in the doc store. Documents returned from IndexReader/IndexSearcher will return only the String value using the standard Fieldable interface. NumericFields can be sorted on and loaded into the FieldCache. (Uwe Schindler, Yonik Seeley, Mike McCandless)
如下图所示:
例如:如果precisionStep=8,则意味前缀树叶子节点的上层控制着255个叶子。那么,当查询范围在1~511时,由于跨了相邻的2个非叶子节点,所以需要遍历511个term。但是假如查询范围在0~512,又只需遍历2个term即可。这样的实现用起来真的有过山车的感觉。
3.2 Elasticsearch 2.0 版本
public static final class GeoPointFieldType extends MappedFieldType { private MappedFieldType geohashFieldType; private int geohashPrecision; private boolean geohashPrefixEnabled; private MappedFieldType latFieldType; private MappedFieldType lonFieldType; public GeoPointFieldType() {}}
// 计算经纬度坐标+距离得到的矩形区域// GeoDistance类public static DistanceBoundingCheck distanceBoundingCheck(double sourceLatitude, double sourceLongitude, double distance, DistanceUnit unit) { // angular distance in radians on a great circle // assume worst-case: use the minor axis double radDist = unit.toMeters(distance) / GeoUtils.EARTH_SEMI_MINOR_AXIS; double radLat = Math.toRadians(sourceLatitude); double radLon = Math.toRadians(sourceLongitude); double minLat = radLat - radDist; double maxLat = radLat + radDist; double minLon, maxLon; if (minLat > MIN_LAT && maxLat < MAX_LAT) { double deltaLon = Math.asin(Math.sin(radDist) / Math.cos(radLat)); minLon = radLon - deltaLon; if (minLon < MIN_LON) minLon += 2d * Math.PI; maxLon = radLon + deltaLon; if (maxLon > MAX_LON) maxLon -= 2d * Math.PI; } else { // a pole is within the distance minLat = Math.max(minLat, MIN_LAT); maxLat = Math.min(maxLat, MAX_LAT); minLon = MIN_LON; maxLon = MAX_LON; } GeoPoint topLeft = new GeoPoint(Math.toDegrees(maxLat), Math.toDegrees(minLon)); GeoPoint bottomRight = new GeoPoint(Math.toDegrees(minLat), Math.toDegrees(maxLon)); if (minLon > maxLon) { return new Meridian180DistanceBoundingCheck(topLeft, bottomRight); } return new SimpleDistanceBoundingCheck(topLeft, bottomRight); }
public class IndexedGeoBoundingBoxQuery {public static Query create(GeoPoint topLeft, GeoPoint bottomRight, GeoPointFieldMapper.GeoPointFieldType fieldType) { if (!fieldType.isLatLonEnabled()) { throw new IllegalArgumentException("lat/lon is not enabled (indexed) for field [" + fieldType.names().fullName() + "], can't use indexed filter on it"); } //checks to see if bounding box crosses 180 degrees if (topLeft.lon() > bottomRight.lon()) { return westGeoBoundingBoxFilter(topLeft, bottomRight, fieldType); } else { return eastGeoBoundingBoxFilter(topLeft, bottomRight, fieldType); }}private static Query westGeoBoundingBoxFilter(GeoPoint topLeft, GeoPoint bottomRight, GeoPointFieldMapper.GeoPointFieldType fieldType) { BooleanQuery.Builder filter = new BooleanQuery.Builder(); filter.setMinimumNumberShouldMatch(1); filter.add(fieldType.lonFieldType().rangeQuery(null, bottomRight.lon(), true, true), Occur.SHOULD); filter.add(fieldType.lonFieldType().rangeQuery(topLeft.lon(), null, true, true), Occur.SHOULD); filter.add(fieldType.latFieldType().rangeQuery(bottomRight.lat(), topLeft.lat(), true, true), Occur.MUST); return new ConstantScoreQuery(filter.build());}private static Query eastGeoBoundingBoxFilter(GeoPoint topLeft, GeoPoint bottomRight, GeoPointFieldMapper.GeoPointFieldType fieldType) { BooleanQuery.Builder filter = new BooleanQuery.Builder(); filter.add(fieldType.lonFieldType().rangeQuery(topLeft.lon(), bottomRight.lon(), true, true), Occur.MUST); filter.add(fieldType.latFieldType().rangeQuery(bottomRight.lat(), topLeft.lat(), true, true), Occur.MUST); return new ConstantScoreQuery(filter.build());}}
// GeoDistanceRangeQuery类的实现 @Override public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException { final Weight boundingBoxWeight; if (boundingBoxFilter != null) { boundingBoxWeight = searcher.createNormalizedWeight(boundingBoxFilter, false); } else { boundingBoxWeight = null; } return new ConstantScoreWeight(this) { @Override public Scorer scorer(LeafReaderContext context) throws IOException { final DocIdSetIterator approximation; if (boundingBoxWeight != null) { approximation = boundingBoxWeight.scorer(context); } else { approximation = DocIdSetIterator.all(context.reader().maxDoc()); } if (approximation == null) { // if the approximation does not match anything, we're done return null; } final MultiGeoPointValues values = indexFieldData.load(context).getGeoPointValues(); final TwoPhaseIterator twoPhaseIterator = new TwoPhaseIterator(approximation) { @Override public boolean matches() throws IOException { final int doc = approximation.docID(); values.setDocument(doc); final int length = values.count(); for (int i = 0; i < length; i++) { GeoPoint point = values.valueAt(i); if (distanceBoundingCheck.isWithin(point.lat(), point.lon())) { double d = fixedSourceDistance.calculate(point.lat(), point.lon()); if (d >= inclusiveLowerPoint && d <= inclusiveUpperPoint) { return true; } } } return false; } }; return new ConstantScoreScorer(this, score(), twoPhaseIterator); } }; }