Java 版中文地址识别与切分源码
原文地址:https://www.vbox.top/38.html
几乎没接触过 C#,不过它跟 Java 很相似,所以花了一天时间,将原作者的代码改写成了 Java 版。
Splitter.java文件如下
import java.util.regex.Pattern;

/**
 * Pairs a "cut" pattern with the candidate patterns used to label the text
 * that ends at each cut.
 *
 * <p>{@code ChineseAddressParser.split} searches the input with
 * {@link #pattern}; when {@link #flag} is {@code true} it stops after the
 * first hit, otherwise it records every hit. {@code ChineseAddressParser.recognize}
 * then tries each entry of {@link #patterns} in order against the cut-out text.
 *
 * Created by ajtdnyy on 13-9-3.
 */
public class Splitter {

    /** Pattern whose match end marks a cut position. */
    Pattern pattern;

    /** Candidate patterns for labelling a cut-out piece; stays {@code null} with the one-argument constructor. */
    Pattern[] patterns;

    /** {@code true} (the default) means only the first match of {@link #pattern} is used. */
    boolean flag = true;

    /**
     * Creates a splitter with no candidate patterns and the default flag.
     *
     * @param pattern the cut pattern
     */
    public Splitter(Pattern pattern) {
        this(pattern, null, true);
    }

    /**
     * Creates a splitter with the default flag ({@code true}).
     *
     * @param pattern  the cut pattern
     * @param patterns candidate patterns, tried in array order
     */
    public Splitter(Pattern pattern, Pattern[] patterns) {
        this(pattern, patterns, true);
    }

    /**
     * Creates a fully specified splitter.
     *
     * @param pattern  the cut pattern
     * @param patterns candidate patterns, tried in array order
     * @param flag     whether to stop after the first match of {@code pattern}
     */
    public Splitter(Pattern pattern, Pattern[] patterns, boolean flag) {
        this.pattern = pattern;
        this.patterns = patterns;
        this.flag = flag;
    }
}
Segment.java类如下
import java.util.regex.Pattern;

/**
 * A piece of text together with the pattern that recognised it.
 *
 * Created by ajtdnyy on 13-9-3.
 */
public class Segment {

    /** The pattern instance this segment was matched with. */
    Pattern pattern;

    /** The matched text. */
    String value;

    /**
     * @param value   the matched text
     * @param pattern the pattern that matched {@code value}
     */
    public Segment(String value, Pattern pattern) {
        this.pattern = pattern;
        this.value = value;
    }
}
ChineseAddress.java类如下
import java.util.List; /** * Created by ajtdnyy on 13-9-3. */ public class ChineseAddress { public String source; public String nation; public String province; public String city; public String county; public String district; public String street; public List roads; public String number; public String plaza; public String ip; public String town; public String village; public String zone; public String underground; public List notes; public List noises; private static final String SEPARATOR = System.getProperty("line.separator"); public String toString() { String s = "src: " + source + SEPARATOR; if (nation != null) { s = s + "nat: " + nation + SEPARATOR; } if (province != null) { s = s + "pro: " + province + SEPARATOR; } if (city != null) { s = s + "cit: " + city + SEPARATOR; } if (county != null) { s = s + "cou: " + county + SEPARATOR; } if (district != null) { s = s + "dis: " + district + SEPARATOR; } if (street != null) { s = s + "str: " + street + SEPARATOR; } if (number != null) { s = s + "num: " + number + SEPARATOR; } if (plaza != null) { s = s + "pla: " + plaza + SEPARATOR; } if (ip != null) { s = s + "idp: " + ip + SEPARATOR; } if (town != null) { s = s + "twn: " + town + SEPARATOR; } if (village != null) { s = s + "vil: " + village + SEPARATOR; } if (zone != null) { s = s + "zon: " + zone + SEPARATOR; } if (underground != null) { s = s + "udg: " + underground + SEPARATOR; } if (roads != null) { s = s + "rod: "; for (int i = 0; i < roads.size(); i++) { String r = roads.get(i); if (r == roads.get(0)) { s = s + r; } else { s = s + " / " + r; } } s = s + SEPARATOR; } if (notes != null) { s = s + "not: "; for (int i = 0; i < notes.size(); i++) { String n = notes.get(i); if (n == roads.get(0)) { s = s + n; } else { s = s + " / " + n; } } s = s + SEPARATOR; } if (noises != null) { s = s + "noi: "; for (int i = 0; i < noises.size(); i++) { s = s + noises.get(i) + " / "; } s = s + SEPARATOR; } return s; } }
ChineseAddressParser.java类如下
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based splitter for Chinese address strings.
 *
 * <p>Parsing runs in two passes: {@link #split} collects cut positions (the
 * end offset of every splitter match), then {@link #recognize} walks those
 * positions, cuts the text between consecutive cuts and labels each piece
 * with the first candidate pattern that matches it completely.
 * {@link #parse} finally copies the labelled pieces into a
 * {@code ChineseAddress}.
 *
 * Created by ajtdnyy on 13-9-3.
 */
public class ChineseAddressParser {

    /** Character class for one CJK ideograph in the U+4E00..U+9FA5 range. */
    static final String reg = "[\u4e00-\u9fa5]";

    // ---- Cut patterns: searched with find() to locate where a piece ends ----
    static final Pattern ms_Pattern_guo = Pattern.compile("中国");
    static final Pattern ms_Pattern_jinjiao = Pattern.compile("近郊");
    static final Pattern ms_Pattern_sheng = Pattern.compile(reg + "+?省");
    static final Pattern ms_Pattern_shi = Pattern.compile(reg + "+?市(?!场)");
    static final Pattern ms_Pattern_qu = Pattern.compile(reg + "+?区");
    static final Pattern ms_Pattern_xiang = Pattern.compile(reg + "+?乡");
    static final Pattern ms_Pattern_xian = Pattern.compile(reg + "+?县");
    static final Pattern ms_Pattern_dao = Pattern.compile(reg + "+?道");
    static final Pattern ms_Pattern_hutong = Pattern.compile(reg + "+?胡同");
    static final Pattern ms_Pattern_nongtang = Pattern.compile(reg + "+?弄堂");
    static final Pattern ms_Pattern_jie = Pattern.compile(reg + "+?街");
    static final Pattern ms_Pattern_xiangg = Pattern.compile(reg + "+?巷");
    static final Pattern ms_Pattern_lu = Pattern.compile(reg + "+?路");
    static final Pattern ms_Pattern_cun = Pattern.compile(reg + "+?村");
    static final Pattern ms_Pattern_zhen = Pattern.compile(reg + "+?镇");
    static final Pattern ms_Pattern_hao = Pattern.compile("[甲_乙_丙_0-9_-]+?号");
    static final Pattern ms_Pattern_point = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
    static final Pattern ms_Pattern_ditie = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");

    // ---- Label patterns: must match a whole piece (matches()) to label it. ----
    // Segments are later looked up by Pattern identity (==), so two textually
    // equal patterns such as ms_Pattern_cun / ms_Pattern_village stay distinct.
    static final Pattern ms_Pattern_province = Pattern.compile(reg + "{2,10}?(?:省|特区|自治区|特别行政区)");
    static final Pattern ms_Pattern_city = Pattern.compile(reg + "+?(?:市|地区|自治州)");
    static final Pattern ms_Pattern_county = Pattern.compile(reg + "+?(?:乡|县)");
    static final Pattern ms_Pattern_street = Pattern.compile(reg + "+?街道");
    static final Pattern ms_Pattern_road = Pattern.compile(reg + "+?(?:胡同|弄堂|街|巷|路|道)");
    static final Pattern ms_Pattern_roadnear = Pattern.compile("(?<=近)" + reg + "+?(?:胡同|弄堂|街|巷|路|道)");
    static final Pattern ms_Pattern_ip = Pattern.compile(reg + "+?(?:开发区|科技区|园区)");
    static final Pattern ms_Pattern_zone = Pattern.compile(reg + "+?(?:小区|社区|新村)");
    static final Pattern ms_Pattern_village = Pattern.compile(reg + "+?村");
    static final Pattern ms_Pattern_town = Pattern.compile(reg + "+?镇");
    static final Pattern ms_Pattern_number = Pattern.compile("[甲_乙_丙_0-9_-]+号");
    static final Pattern ms_Pattern_plaza = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
    static final Pattern ms_Pattern_underground = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");

    // ---- Splitters: cut pattern + ordered label candidates (+ optional flag) ----
    // A trailing `false` makes split() record every match instead of only the first.
    static final Splitter ms_splitter_guo = new Splitter(ms_Pattern_guo, new Pattern[]{ms_Pattern_guo});
    static final Splitter ms_splitter_sheng = new Splitter(ms_Pattern_sheng, new Pattern[]{ms_Pattern_province});
    static final Splitter ms_splitter_shi = new Splitter(ms_Pattern_shi, new Pattern[]{ms_Pattern_city}, false);
    static final Splitter ms_splitter_jinjiao = new Splitter(ms_Pattern_jinjiao, new Pattern[]{ms_Pattern_jinjiao});
    static final Splitter ms_splitter_qu = new Splitter(ms_Pattern_qu, new Pattern[]{ms_Pattern_province, ms_Pattern_city, ms_Pattern_zone, ms_Pattern_ip, ms_Pattern_qu}, false);
    static final Splitter ms_splitter_xiang = new Splitter(ms_Pattern_xiang, new Pattern[]{ms_Pattern_county});
    static final Splitter ms_splitter_xian = new Splitter(ms_Pattern_xian, new Pattern[]{ms_Pattern_county});
    static final Splitter ms_splitter_dao = new Splitter(ms_Pattern_dao, new Pattern[]{ms_Pattern_street, ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_hutong = new Splitter(ms_Pattern_hutong, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_nongtang = new Splitter(ms_Pattern_nongtang, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_jie = new Splitter(ms_Pattern_jie, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_lu = new Splitter(ms_Pattern_lu, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_xiangg = new Splitter(ms_Pattern_xiangg, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_cun = new Splitter(ms_Pattern_cun, new Pattern[]{ms_Pattern_zone, ms_Pattern_village});
    static final Splitter ms_splitter_zhen = new Splitter(ms_Pattern_zhen, new Pattern[]{ms_Pattern_town});
    static final Splitter ms_splitter_hao = new Splitter(ms_Pattern_hao, new Pattern[]{ms_Pattern_number});
    static final Splitter ms_splitter_point = new Splitter(ms_Pattern_point, new Pattern[]{ms_Pattern_plaza});
    static final Splitter ms_splitter_ditie = new Splitter(ms_Pattern_ditie, new Pattern[]{ms_Pattern_underground});

    /** Splitters applied by {@link #parse}, in this order. */
    static final Splitter[] ms_defaultsplitters = new Splitter[]{
        ms_splitter_guo,
        ms_splitter_sheng,
        ms_splitter_shi,
        ms_splitter_qu,
        ms_splitter_xiang,
        ms_splitter_xian,
        ms_splitter_dao,
        ms_splitter_hutong,
        ms_splitter_nongtang,
        ms_splitter_jie,
        ms_splitter_xiangg,
        ms_splitter_lu,
        ms_splitter_cun,
        ms_splitter_zhen,
        ms_splitter_hao,
        ms_splitter_point,
        ms_splitter_ditie,
        ms_splitter_jinjiao
    };

    /**
     * Collects cut positions for {@code src}.
     *
     * @param src       the text to scan
     * @param splitters splitters to apply, in order
     * @return map from exclusive end offset of a match to the splitter that
     *         produced it, in insertion order; a later splitter hitting the
     *         same offset replaces the value but keeps the original position
     */
    private static LinkedHashMap<Integer, Splitter> split(String src, Splitter[] splitters) {
        // NOTE(review): iteration order is splitter order, not offset order, so a
        // cut found by an earlier splitter at a later offset hides cuts in between.
        // Confirm whether offset-sorted order was intended before changing this.
        LinkedHashMap<Integer, Splitter> splitterdic = new LinkedHashMap<Integer, Splitter>();
        for (Splitter s : splitters) {
            Matcher m = s.pattern.matcher(src);
            while (m.find()) {
                splitterdic.put(m.end(), s);
                if (s.flag) {
                    break; // this splitter only contributes its first match
                }
            }
        }
        return splitterdic;
    }

    /**
     * Cuts {@code src} at the collected positions and labels each piece.
     *
     * @param src         the text that was scanned
     * @param splitterdic cut positions as produced by {@link #split}
     * @return the recognised segments in the order they were cut; pieces that
     *         none of the splitter's candidate patterns match are dropped
     */
    private static List<Segment> recognize(String src, LinkedHashMap<Integer, Splitter> splitterdic) {
        List<Segment> segments = new ArrayList<Segment>();
        int index = 0;
        for (Map.Entry<Integer, Splitter> entry : splitterdic.entrySet()) {
            int key = entry.getKey();
            // Keys are exclusive end offsets, so key == src.length() is a valid
            // cut: it is the piece that ends exactly at the end of the input.
            if (key > index && key <= src.length()) {
                String piece = src.substring(index, key);
                for (Pattern r : entry.getValue().patterns) {
                    Segment s = segmentRecognize(piece, r);
                    if (s != null) {
                        segments.add(s);
                        break; // first matching candidate wins
                    }
                }
                index = key;
            }
        }
        return segments;
    }

    /**
     * @return a segment for {@code src} when {@code r} matches it entirely,
     *         otherwise {@code null}
     */
    private static Segment segmentRecognize(String src, Pattern r) {
        Matcher m = r.matcher(src);
        return m.matches() ? new Segment(m.group(), r) : null;
    }

    /** Returns the values of all segments labelled with exactly the pattern instance {@code r}. */
    private static List<String> segmentsGetStringListForPattern(List<Segment> segments, Pattern r) {
        List<String> ss = new ArrayList<String>();
        for (Segment s : segments) {
            if (s.pattern == r) { // identity on purpose, see label-pattern note above
                ss.add(s.value);
            }
        }
        return ss;
    }

    /** Returns the value of the first segment labelled with the pattern instance {@code r}, or {@code null}. */
    private static String segmentsGetStringForPattern(List<Segment> segments, Pattern r) {
        for (Segment s : segments) {
            if (s.pattern == r) {
                return s.value;
            }
        }
        return null;
    }

    /** Prints the parse result of three sample addresses. */
    public static void main(String[] args) {
        System.out.println(ChineseAddressParser.parse("北京市海淀区中关村北大街37号天龙大厦3层"));
        System.out.println(ChineseAddressParser.parse("福州市台江区群众路278号源利明珠大厦6楼"));
        System.out.println(ChineseAddressParser.parse("北京西城区百万庄大街68号6楼"));
    }

    /**
     * Parses an address string into its parts.
     *
     * @param source the raw address; ASCII '.' and ',' are removed before parsing
     * @return a new {@code ChineseAddress}; parts that were not recognised stay
     *         {@code null}, {@code roads} is always a (possibly empty) list
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static ChineseAddress parse(String source) {
        // NOTE(review): the second comma replace was an exact duplicate and has been
        // dropped; it may originally have targeted a different character — confirm.
        source = source.replace(".", "").replace(",", "");
        List<Segment> segments = recognize(source, split(source, ms_defaultsplitters));

        ChineseAddress ca = new ChineseAddress();
        ca.source = source;
        ca.nation = segmentsGetStringForPattern(segments, ms_Pattern_guo);
        ca.province = segmentsGetStringForPattern(segments, ms_Pattern_province);
        ca.city = segmentsGetStringForPattern(segments, ms_Pattern_city);
        ca.district = segmentsGetStringForPattern(segments, ms_Pattern_qu);
        ca.county = segmentsGetStringForPattern(segments, ms_Pattern_county);
        ca.street = segmentsGetStringForPattern(segments, ms_Pattern_street);

        // Plain roads first, then the ones recognised by the "near" pattern.
        List<String> roads = segmentsGetStringListForPattern(segments, ms_Pattern_road);
        roads.addAll(segmentsGetStringListForPattern(segments, ms_Pattern_roadnear));
        ca.roads = roads;

        ca.underground = segmentsGetStringForPattern(segments, ms_Pattern_underground);
        ca.number = segmentsGetStringForPattern(segments, ms_Pattern_number);
        ca.plaza = segmentsGetStringForPattern(segments, ms_Pattern_plaza);
        ca.ip = segmentsGetStringForPattern(segments, ms_Pattern_ip);
        ca.town = segmentsGetStringForPattern(segments, ms_Pattern_town);
        ca.village = segmentsGetStringForPattern(segments, ms_Pattern_village);
        // Zone segments are labelled by the qu/cun splitters; copy them out so
        // they are not silently lost.
        ca.zone = segmentsGetStringForPattern(segments, ms_Pattern_zone);
        return ca;
    }
}
原作者C#博客地址:http://blog.csdn.net/helanmouse/article/details/4096933