Changeset 1427
- Timestamp: 09/12/08 15:45:31 (2 months ago)
- Files:
- BADataMunger/trunk (modified) (1 prop)
- BADataMunger/trunk/batlaspipe.py (modified) (3 diffs)
- BADataMunger/trunk/bidmaker.py (modified) (1 diff)
- BADataMunger/trunk/config/BATL001_config.xml (added)
- BADataMunger/trunk/config/BATL001aconfig.xml (added)
- BADataMunger/trunk/config/BATL002_config.xml (added)
- BADataMunger/trunk/config/BATL003_config.xml (added)
- BADataMunger/trunk/config/BATL004_config.xml (added)
- BADataMunger/trunk/config/BATL005_config.xml (added)
- BADataMunger/trunk/config/BATL006_config.xml (added)
- BADataMunger/trunk/config/BATL064_config.xml (added)
- BADataMunger/trunk/config/BATL089_config.xml (added)
- BADataMunger/trunk/config/BATL090_config.xml (added)
- BADataMunger/trunk/config/BATL091_config.xml (added)
- BADataMunger/trunk/config/BATL092_config.xml (added)
- BADataMunger/trunk/config/BATL093_config.xml (added)
- BADataMunger/trunk/config/BATL094_config.xml (added)
- BADataMunger/trunk/config/BATL095_config.xml (added)
- BADataMunger/trunk/config/BATL096_config.xml (added)
- BADataMunger/trunk/config/BATL097_config.xml (added)
- BADataMunger/trunk/config/BATL098_config.xml (added)
- BADataMunger/trunk/config/BATL099_config.xml (added)
- BADataMunger/trunk/config/BATL100_config.xml (added)
- BADataMunger/trunk/config/BATL101_config.xml (added)
- BADataMunger/trunk/config/BATL102_config.xml (added)
- BADataMunger/trunk/etc/wordhtml (modified) (1 prop)
- BADataMunger/trunk/etc/wordhtml/BATL001A.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL001_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL002_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL003_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL004_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL005_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL006_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL064_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL089_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL090_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL091_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL092_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL093_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL094_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL095_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL096_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL097_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL098_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL099_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL100-orig.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL100-tidied.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL100_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL101-orig.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL101-tidied.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL101_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL102-orig.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL102-tidied-2_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL102-tidied_.htm (added)
- BADataMunger/trunk/etc/wordhtml/BATL102_.htm (added)
- BADataMunger/trunk/tableparser.py (modified) (16 diffs)
- BADataMunger/trunk/wordnormalizer.py (modified) (1 diff)
- BADataMunger/trunk/wordstripper.xsl (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk
- Property svn:ignore set to:
    pleiades
    scratch
- Property svn:ignore set to
BADataMunger/trunk/batlaspipe.py
r1371 r1427 178 178 if len(bid) > 0: 179 179 place.batlasids.append(bid) 180 altlabel = maker.buildAltLabel(label, place.locdesc, placetypes) 180 if len(place.placenames) > 0: 181 altlabel = maker.buildAltLabel(label, place.placenames[0].name, placetypes) 182 else: 183 altlabel = maker.buildAltLabel(label, place.locdesc, placetypes) 181 184 bid = maker.makeID(altlabel, mapnum, grid, phraseprefix=label) 182 185 bid = self.vetid(bid, priorids) … … 195 198 196 199 if len(place.placenames) == 0: 197 if place.dirtype == 'numbered' :200 if place.dirtype == 'numbered' and len(place.placenames) == 0: 198 201 names = place.locdesc.split('/') 199 202 prefix = label … … 292 295 citname = SLASH_REGEX.sub('/',p.namestring.strip()) 293 296 citcontent = "BAtlas %s false name %s" % (self.map_number, citname) 294 elif p.dirtype == 'numbered' :297 elif p.dirtype == 'numbered' and len(p.placenames) == 0: 295 298 citname = SLASH_REGEX.sub('/',p.locdesc.strip()) 299 citcontent = "BAtlas %s %s no. %s (%s)" % (self.map_number, p.grid, p.namestring, citname) 300 elif p.dirtype == 'numbered' and len(p.placenames) > 0: 301 citname = SLASH_REGEX.sub('/',p.placenames[0].name.strip()) 296 302 citcontent = "BAtlas %s %s no. %s (%s)" % (self.map_number, p.grid, p.namestring, citname) 297 303 elif p.dirtype == 'name' and len(p.namestring) == 0: BADataMunger/trunk/bidmaker.py
r1370 r1427 66 66 67 67 68 elif dirtype in ['aqueduct', 'aqueduct-group', 'bath', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'centuriation', 'centuriation-group', 'church-group', 'dam', 'dam-group', 'dike', 'dike-group', 'feature', 'feature-group', 'fort', 'fort-group', 'levee', 'lighthouse', 'lighthouse-group', 'mine', 'm ole', 'monastery', 'monument', 'monument-group', 'pass', 'quarry', 'quarry-group', 'road-station', 'salt-pans', 'spring', 'tumulus', 'tunnel', 'villa', 'villa-group', 'wall', 'wall-group', 'waterwheel', 'well']:68 elif dirtype in ['aqueduct', 'aqueduct-group', 'bath', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'centuriation', 'centuriation-group', 'church-group', 'dam', 'dam-group', 'dike', 'dike-group', 'feature', 'feature-group', 'fort', 'fort-group', 'levee', 'lighthouse', 'lighthouse-group', 'mine', 'mine-group', 'mole', 'monastery', 'monument', 'monument-group', 'pass', 'quarry', 'quarry-group', 'reservoir', 'road-station', 'salt-pans', 'spring', 'tumulus', 'tunnel', 'villa', 'villa-group', 'wall', 'wall-group', 'waterwheel', 'well']: 69 69 # second most common case: more-or-less normal features without names 70 70 phrase = "%s %s %s" % (dirtype, phrase, locdesc) BADataMunger/trunk/etc/wordhtml
- Property svn:ignore set to:
    Shorcut*
    FinalDirVersions*
- Property svn:ignore set to
BADataMunger/trunk/tableparser.py
r1369 r1427 26 26 SERIAL_ITALICS_SPACES_REGEX = re.compile('</i>(\s+)<i>') 27 27 SERIAL_ITALICS_COMMA_REGEX = re.compile('</i>(,\s+)<i>') 28 EMPTY_ITALICS_REGEX = re.compile('<i> </i>') 28 29 29 30 tabletypes = { … … 73 74 "Mines / Quarries":"quarry", 74 75 "Mines":"mine", 76 "Mines GLPMR":"mine_glpmr", 75 77 "Mine GLPMR":"mine_glpmr", 76 78 "Mines (Named)":"mine_glpmr_named", … … 83 85 "Names":"name", 84 86 "Names found only in Avienus":"names_avienus", 87 "Named Dioceses":"named_diocese", 88 "Named Provinces":"named_province", 89 "Named Provinces With Loc":"named_province_loc", 90 "Numbered Provinces":"numbered_province", 91 "Numbered Provinces With Loc":"numbered_province_loc", 85 92 "Numbered Bridges":"numbered_bridge", 86 93 "Numbered Features":"numbered", … … 91 98 "Numbered Villas":"numbered_villas", 92 99 "Pass":"pass", 100 "Pass (dated)":"pass_dated", 93 101 "Passes (dated)":"pass_dated", 94 102 "Quarry":"quarry", 95 103 "Quarries":"quarry", 96 104 "Quarry (dateless)":"quarry_dateless", 105 "Reservoir":"reservoir", 97 106 "Roads":"road", 98 107 "Road":"road", … … 107 116 "Unlocated Toponyms":"unlocated", 108 117 "Unlocated Toponym":"unlocated", 118 "Unlocated Toponyms (dateless)":"unlocated_dateless", 119 "Unlocated Toponym (dateless)":"unlocated_dateless", 109 120 "Unlocated Coin Names":"unlocated_coin", 121 "Unlocated Provinces":"unlocated_province", 110 122 "Unnamed Sites":"unnamed_site", 111 123 "Villas":"villa", … … 113 125 "Walls":"wall", 114 126 "Wall":"wall", 127 "Walls (named)":"wall", 115 128 "Wall (Named No Loc)" : "wall_named_noloc", 116 129 "Wall (Unnamed)" : "wall_unnamed", 130 "Wall (unnamed)" : "wall_unnamed", 117 131 "Walls (Unnamed)" : "wall_unnamed", 132 "Walls (unnamed)" : "wall_unnamed", 118 133 "Walls (Dateless)":"wall_dateless", 119 134 "Walls / Fortifications":"wall", … … 165 180 # ignore empty rows 166 181 rows_blank+=1 167 elif rowtext.find(u' =') != -1:182 elif rowtext.find(u'=') != -1: 168 183 # ignore internal cross references 
169 184 rows_xref+=1 … … 176 191 p.rowi = ri 177 192 rowcells = row.xpath("*[local-name()='td']") 193 #rowlen = 0 194 #rowlens = [len(''.join(cell.text.split())) for cell in rowcells] 195 #for l in rowlens: 196 # rowlen = rowlen + l 197 #if rowlen < 5: 198 # logging.critical("There's no content in here!") 178 199 configs = pparser_config = context['config'].xpath('//%s' % tabletype) 179 200 if len(configs) == 1: … … 614 635 615 636 def add_name(self, cells): 637 logging.debug("adding names for:\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s" % tuple([etree.tostring(cell).replace('\n', ' ') for cell in cells])) 616 638 self.dirtype = 'name' 617 639 cmap = CMAP_TMPL_NAMED_5 … … 700 722 self.addinator(cells, cmap) 701 723 724 def add_reservoir(self, cells): 725 self.dirtype = 'reservoir' 726 cmap = CMAP_TMPL_UN_4 727 self.addinator(cells, cmap) 728 702 729 def add_road(self, cells): 703 730 self.dirtype = 'road' … … 745 772 self.addinator(cells, cmap) 746 773 774 def add_unlocated_dateless(self, cells): 775 self.dirtype = 'unlocated' 776 cmap = {'placenames':0, 'namestring':0, 'locdesc':1, 'references':2} 777 self.addinator(cells, cmap) 778 747 779 def add_unlocated_coin(self, cells): 748 780 self.dirtype = 'unlocated_coin' … … 750 782 self.addinator(cells, cmap) 751 783 784 def add_unlocated_province(self, cells): 785 self.dirtype = 'unlocated' 786 self.types.append('province') 787 cmap = {'placenames':0, 'namestring':0, 'locdesc':1} 788 self.addinator(cells, cmap) 789 752 790 def add_unnamed_site(self, cells): 753 791 self.dirtype = 'feature' … … 758 796 self.dirtype = 'unlocated' 759 797 cmap = {'placenames':0, 'namestring':0, 'avienus':1, 'locdesc':2, 'references':3} 798 self.addinator(cells, cmap) 799 800 def add_named_diocese(self, cells): 801 self.dirtype = 'name' 802 self.types.append('diocese') 803 cmap = {'grid':0, 'namestring':1, 'placenames':1} 804 self.addinator(cells, cmap) 805 806 def add_named_province(self, cells): 807 self.dirtype = 'name' 808 
self.types.append('province') 809 cmap = {'grid':0, 'placenames':1, 'namestring':1} 810 self.addinator(cells, cmap) 811 812 def add_named_province_loc(self, cells): 813 self.dirtype = 'name' 814 self.types.append('province') 815 cmap = {'grid':0, 'placenames':1, 'namestring':1, 'locdesc':2} 816 self.addinator(cells, cmap) 817 818 def add_numbered_province_loc(self, cells): 819 self.dirtype = 'numbered' 820 self.types.append('province') 821 cmap = {'number':0, 'grid':1, 'namestring':0, 'placenames':2, 'locdesc':3} 822 self.addinator(cells, cmap) 823 824 def add_numbered_province(self, cells): 825 self.dirtype = 'numbered' 826 self.types.append('province') 827 cmap = {'number':0, 'grid':1, 'namestring':0, 'placenames':2} 760 828 self.addinator(cells, cmap) 761 829 … … 846 914 # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 847 915 try: 848 diamond_expr = u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 916 #diamond_expr = u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 917 diamond_expr = u".+\u00A7\s+%s" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 918 logging.debug("diamond expression is: '%s'" % diamond_expr) 849 919 diamond_regex = re.compile(diamond_expr) 850 920 except sre_constants.error, detail: … … 852 922 raise 853 923 diamond_m = diamond_regex.search(celltext) 924 if diamond_m: 925 logging.debug("nameish (%s) follows a diamond: %s" % (nameish.strip(), diamond_m.group())) 854 926 855 927 # test for italic tag (and deal with MSWord formatting goofiness along the way) 928 thexpath = 'descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip() 856 929 try: 857 nameishmatches = namecell.xpath( 'descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip())930 nameishmatches = namecell.xpath(thexpath) 858 931 except etree.XPathSyntaxError, detail: 859 932 logging.critical ("incorporating nameish (%s) into an xpath 
expression caused a fatal error: %s" % (nameish.strip(), detail)) 860 933 raise 861 934 935 if len(nameishmatches) == 0: 936 logging.debug("there were %s nameishmatches for nameish = '%s'" % (len(nameishmatches), nameish.strip())) 937 logging.debug("xpath was '%s'" % thexpath) 938 862 939 try: 863 940 deepestnameishmatch = nameishmatches[-1] … … 866 943 m_postfix_numeral = POSTFIX_NUMERAL_REGEX.match(nameish.strip()) 867 944 m_postfix_parenthetical = POSTFIX_PARENTHETICAL_REGEX.match(nameish.strip()) 868 newcellxml = SERIAL_ITALICS_REGEX.sub('', cellxml) 945 newcellxml = cellxml 946 newcellxml = EMPTY_ITALICS_REGEX.sub('', newcellxml) 947 newcellxml = SERIAL_ITALICS_REGEX.sub('', newcellxml) 869 948 newcellxml = SERIAL_ITALICS_SPACES_REGEX.sub(' ', newcellxml) 870 949 newcellxml = SERIAL_ITALICS_COMMA_REGEX.sub(', ', newcellxml) BADataMunger/trunk/wordnormalizer.py
r1302 r1427 36 36 (' ',' '), # non-breaking space 37 37 (' ',' '), # non-breaking space bis 38 ('ߪ','...') # horizontal ellipsis 38 ('ߪ','...') , # horizontal ellipsis 39 ('\xc2\xa0;', ' ') # non-breaking space in utf-8 39 40 ] 40 41 BADataMunger/trunk/wordstripper.xsl
r1304 r1427 49 49 50 50 <!-- suppress all style and br elements --> 51 <xsl:template match="*[local-name()='style' or local-name()='br']"/> 51 <xsl:template match="*[local-name()='style' or local-name()='br' or local-name()='sup']"/> 52 53 <xsl:template match="*[local-name()='i' and normalize-space(.)='̣']">̣</xsl:template> 52 54 53 55 <!-- strip span tags, but pass through their sub nodes, note that line breaks may matter :| --> … … 58 60 </xsl:choose> 59 61 </xsl:template> 62 63 <xsl:template match="*[local-name()='i' and normalize-space(text()) = ' ']"/> 60 64 61 65 <xsl:template match="*"><xsl:call-template name="elepassthrough"/></xsl:template>
