= Lc@s8dZddkZeidjo"dGHdeiGHeidnddkZddkZddkZddkZddk Z ddk Z ddk Z ddk Z ddk Z ddkZddkZddkZy eZ[Wnej odZdZnXdZd Zd Zdd d d ddddddg ZdddgZdZdZe idZee iddddddd gZ d!d"d#d$d%d&d'gZ!d(Z"d)Z#d*Z$d+Z%d,Z&d-Z'd.Z(d/d0d1hd2d3fgZ)d4e*fd5YZ+d6e+fd7YZ,d8fd9YZ-e-Z.d:fd;YZ/e/Z0d<e1fd=YZ2d>fd?YZ3d@fdAYZ4dBfdCYZ5dDfdEYZ6dFfdGYZ7dHei8i9i:fdIYZ;dJfdKYZ<dLfdMYZ=dNei8i9i:fdOYZ>dPZ?dQZ@dRZAdSZBdTZCdUZDeEdVjoeDeiFdZGeG peGiHdW p eGiHdXoe0iIednweGiHdYZJeCeGdWeJZKeK oe0iIdZdn9eKiLe0iId[e0iMde0iId\e0iNdndS(]sWA simple script to automatically produce sitemaps for a webserver, in the Google Sitemap Protocol (GSP). Usage: python sitemap_gen.py --config=config.xml [--help] [--testing] --config=config.xml, specifies config file location --help, displays usage message --testing, specified when user is experimenting iNis)This script requires Python 2.2 or later.sCurrently run with version: %siitASCIIsUTF-8tIDNAsUS-ASCIItUStIBM367tCP367sISO646-USISO_646.IRV:1991sISO-IR-6sANSI_X3.4-1968sANSI_X3.4-1986tCPASCIIs ISO-8859-1s ISO-8859-2s ISO-8859-5iPs _index.xmls5.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*s ^\d\d\d\d$s^\d\d\d\d-\d\d$s^\d\d\d\d-\d\d-\d\d$s^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$s+^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$s,^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$s8^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$talwaysthourlytdailytweeklytmonthlytyearlytnevers4 s sM %(loc)s %(lastmod)s s, s s s thttpswww.google.comswebmasters/sitemaps/pingttsitemaptErrorcBseZdZRS(s Base exception class. In this module we tend not to use our own exception types for very much, but they come in very handy on XML parsing with SAX. (t__name__t __module__t__doc__(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRst SchemaErrorcBseZdZRS(s?Failure to process an XML file according to the schema we know.(RRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRstEncodercBs;eZdZdZdZdZdZdZRS(s Manages wide-character/narrow-character conversions for just about all text that flows into or out of the script. You should always use this class for string coercion, as opposed to letting Python handle coercions automatically. Reason: Python usually assumes ASCII (7-bit) as a default narrow character encoding, which is not the kind of data we generally deal with. General high-level methodologies used in sitemap_gen: [PATHS] File system paths may be wide or narrow, depending on platform. This works fine, just be aware of it and be very careful to not mix them. That is, if you have to pass several file path arguments into a library call, make sure they are all narrow or all wide. This class has MaybeNarrowPath() which should be called on every file system path you deal with. [URLS] URL locations are stored in Narrow form, already escaped. This has the benefit of keeping escaping and encoding as close as possible to the format we read them in. The downside is we may end up with URLs that have intermingled encodings -- the root path may be encoded in one way while the filename is encoded in another. This is obviously wrong, but it should hopefully be an issue hit by very few users. The workaround from the user level (assuming they notice) is to specify a default_encoding parameter in their config file. [OTHER] Other text, such as attributes of the URL class, configuration options, etc, are generally stored in Unicode for simplicity. cCs%d|_g|_t|_ytii|_WnEtj o9yt i ti j|_Wqvtj oqvXnXy:t i }|o#|i tjo|g|_nWntj onX|ip:t i}|o#|i tjo|g|_q n|ip t|_ndS(N(tNonet_usert_learnedtFalset _widefilestostpathtsupports_unicode_filenamestAttributeErrortsystgetwindowsversiontVER_PLATFORM_WIN32_NTtgetfilesystemencodingtuppertENC_ASCII_LISTtgetdefaultencodingtENC_DEFAULT_LIST(tselftencoding((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt__init__s,        cCs ||_dS(N(R(R'R(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytSetUserEncodingsc Cs|t|tijo|S|oxy;|i|}||ijo|ii|n|SWqtj oqtj oti d|qXn|i ocy|i|i SWqtj oqtj o*|i }d|_ ti d|qXnx;|io0y|i|idSWq |id=q Xq Wy|it SWntj onX|it dS(s" Narrow a piece of arbitrary text sUnknown encoding: %ssUnknown default_encoding: %sitignoreN(ttypettypest UnicodeTypetencodeRtappendt UnicodeErrort LookupErrortoutputtWarnRRtENC_UTF8t ENC_ASCII(R'ttextR(tresultttemp((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt NarrowTexts@    cCs|io|S|i|dS(s# Paths may be allowed to stay wide N(RR:R(R'R7((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytMaybeNarrowPath's c Cst|tijo|S|oxy;t||}||ijo|ii|n|SWqtj oqtj oti d|qXn|i ocyt||i SWqtj oqtj o*|i }d|_ ti d|qXnx;|io0yt||idSWq |id=q Xq Wyt|t SWntj onXti d||i pti dn|i tdS(s! Widen a piece of arbitrary text sUnknown encoding: %ssUnknown default_encoding: %sis!Unrecognized encoding in text: %ssBYou may need to set a default_encoding in your configuration file.R+N(R,R-t StringTypetunicodeRR0R1R2R3R4RRR5tdecodeR6(R'R7R(R8R9((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt WidenText.sF     (RRRR)R*R:R;R?(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs !  - tOutputcBsDeZdZdZdZdZdZdZdZRS(s Exposes logging functionality, and tracks how many errors we have thus output. Logging levels should be used as thus: Fatal -- extremely sparingly Error -- config errors, entire blocks of user 'intention' lost Warn -- individual URLs lost Log(,0) -- Un-suppressable text that's not an error Log(,1) -- touched files, major actions Log(,2) -- parsing notes, filtered or duplicated URLs Log(,3) -- each accepted URL cCs1d|_d|_h|_h|_d|_dS(Ni(t num_errorst num_warnst _errors_shownt _warns_shownt_verbose(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)rs     cCs:|o/ti|d}|i|jo |GHq6ndS(sC Output a blurb of diagnostic text, if the verbose level allows it N(tencoderR:RRE(R'R7tlevel((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytLog{scCs|o|ti|d}ti|i}|ii|pd|i|%s (RUtSITEURL_XML_PREFIXRRR,R-R.RFR:RR<tstrtxmltsaxtsaxutilstescapetSITEURL_XML_SUFFIXtwrite(R'tfileRR]R^((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytWriteXMLps   (slocslastmods changefreqspriority(RRRRR)RZR_Rht staticmethodR[RRRHR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRTs    <  9  tFiltercBs eZdZdZdZRS(sQ A filter on the stream of URLs we find. A filter is, in essence, a wildcard applied to the stream. You can think of this as an operator that returns a tri-state when given a URL: True -- this URL is to be included in the sitemap None -- this URL is undecided False -- this URL is to be dropped from the sitemap cCsd|_d|_t|_td|dpdSti}|id}|idd}|idd}|o|i }n|o|i }n|pti dn_| p|djo|djoti d n,|d jo|djoti d n|djo t|_n|d jo t |_n|djo ||_nR|djoDyt i ||_Wqt ij oti d |qXn|tijo!tid |||fdndS(NtFILTERRR,tactiontwildcardtdrops1On a filter you must specify a "pattern" to matchtregexpsHOn a filter you must specify either 'type="wildcard"' or 'type="regexp"'tpasssTIf you specify a filter action, it must be either 'action="pass"' or 'action="drop"'sBad regular expression: %ss'Filter: %s any URL that matches %s "%s"i(spatternstypesaction(Rt _wildcardt_regexpRt_passtValidateAttributesR3RAtgetRRRbtretcompileterrorRH(R't attributesRARR,R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)sD      "        cCs| p |i odS|io&ti|i|io|iSdS|io#|ii|io|iSdStpt dS(s Process the URL, as above. N( RURRtfnmatcht fnmatchcaseRRtsearchRRz(R'turl((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytApplys  (RRRR)R(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs  3tInputURLcBs eZdZdZdZRS(s Each Input class knows how to yield a set of URLs from a data source. This one handles a single URL, manually specified in the config file. cCsd|_td|d pdSt}xM|iD]?}|djo|id||q7|i|||q7W|iptiddS||_ti d|iid dS( NRTthrefRVRWRXRUs(Url entries must have an href attribute.sInput: From URL "%s"i(shrefslastmods changefreqspriority( Rt_urlRRTtkeysR_RUR3RRH(R'RRtattr((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s         cCs"|io||itndS(sD Produces URLs from our data source, hands them in to the consumer. N(RRb(R'tconsumer((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt ProduceURLss (RRRR)R(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs t InputURLListcBs eZdZdZdZRS(s Each Input class knows how to yield a set of URLs from a data source. This one handles a text file with a list of URLs cCsd|_d|_td|dpdS|id|_|idt|_|iogti|i|_ti i |iot i d|idqt i d|id|_nt i ddS( NtURLLISTRR(sInput: From URLLIST "%s"isCan not locate file: %ss-Urllist entries must have a "path" attribute.(spathsencoding(Rt_patht _encodingRRR5RFR;RRtisfileR3RHR(R'R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s    c Cst|id\}}|pdSd}xj|iD]\}|d}|ioti||i}n|i}| p|ddjoq7nt}|id}x1t dt |D]}||i||        (RRRR)R(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs tInputDirectorycBs eZdZdZdZRS(s Each Input class knows how to yield a set of URLs from a data source. This one handles a directory that acts as base for walking the filesystem. cCsd|_d|_d|_td|d pdS|id}|ptiddSti |}|i t i p|t i }nt i i|ptid|dS|id}|ptiddSti|}|i dp|d}n|i|pBti||}|i|ptid||fdSn|id}|o>ti |}t i |jotid |d}qn||_||_||_|o!tid |||fd ntid ||fd dS(Nt DIRECTORYRRt default_files<Directory entries must have both "path" and "url" attributessCan not locate directory: %sRs:The directory URL "%s" is not relative to the base_url: %ss7The default_file "%s" can not include path information.s6Input: From DIRECTORY "%s" (%s) with default file "%s"is4Input: From DIRECTORY "%s" (%s) with no default file(spathsurls default_file(RRRt _default_fileRRR3RRFR;RRtsepRtisdirRTR[R|R`R{RH(R'RRRRR((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)NsX               cs|ipdS|i|i|ifdfd}tid|id|idtii|i|ddS(sD Produces URLs from our data source, hands them in to the consumer. Nc st}t}y|otii||}n|}tii|}d}|oPoItii|}yti|ti}Wqt j oqXn|pti|ti}nt ||_ Wn%t j ont j onX|t }tidjo|itid}n|o|d}n|o#||}|o|d}qxn|idti|d|o(|jo|idddddS|tdS(sn Called once per file. Note that 'name' will occasionally be None -- for a directory itself RRURsIGNORED (default file)RGiN(RTRRRtjoinRRtstattST_MTIMEtOSErrortTimestampISO8601RVRRRqRtreplaceR_RFR?RH(tdirpathtnameRRRttimeRtmiddle(troot_URLt root_filet root_pathR(sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytPerFilesF    csN|}|iptid|dSx|D]}||q3WdS(sT Called once per directory with a list of all the contained files/dirs. s8Unable to decide what the root path is for directory: %sN(R|R3R4(R+RtnamelistR(RR(sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt PerDirectorys sWalking DIRECTORY "%s"i( RRRR3RHRRRtwalk(R'RR((RRRRRsk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs    1(RRRR)R(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRGs :tInputAccessLogcBsDeZdZdZdZdZdZdZdZRS(s- Each Input class knows how to yield a set of URLs from a data source. This one handles access logs. It's non-trivial in that we want to auto-detect log files in the Common Logfile Format (as used by Apache, for instance) and the Extended Log File Format (as used by IIS, for instance). cCsd|_d|_t|_t|_d|_d|_d|_d|_ d|_ t d|d pdS|i d|_|i dt |_|iogti|i|_tii|iotid|idqtid|id|_ntiddS( Nit ACCESSLOGRR(sInput: From ACCESSLOG "%s"isCan not locate file: %ss/Accesslog entries must have a "path" attribute.(spathsencoding(RRRRt_is_elft_is_clft _elf_statust _elf_methodt_elf_urit _elf_urifrag1t _elf_urifrag2RRR5RFR;RRRR3RHR(R'R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s(           cCs|idptS|id}|d=xtdt|D]}||i}|djo ||_qA|djo ||_qA|djo ||_qA|djo ||_ qA|djo ||_ qAqAWt i d d t S( s7 Recognize the Fields directive that heads an ELF file s#Fields:Ris sc-statuss cs-methodscs-uris cs-uri-stems cs-uri-querys,Recognized an Extended Log File Format file.i(R|RRpRRqRRRRRRR3RHRb(R'RtfieldsRtfield((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytRecognizeELFLines&         cCs|id}t|}|idjo8|i|jodS||iidjpdSn|idjo8|i|jodS||iid jodSn|idjo>|i|jodS||ii}|djo|Sn|idjo|i|jp|i|jodS||ii}d}|idjo||i}n|o8|djo+|o|djo|d|}n|SndS( s* Fetch the requested URL from an ELF line Rit200tHEADtGETt-t?N(RR( RpRqRRRRRRR(R'RRtcountRturlfrag1turlfrag2((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt GetELFLines<       cCsJti|}|o|iddj}|otiddn|S(sR Try to tokenize a logfile line according to CLF pattern and see if it works. iRRs(Recognized a Common Logfile Format file.i(sHEADsGET(tACCESSLOG_CLF_PATTERNR~tgroupR3RH(R'RR~t recognize((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytRecognizeCLFLine>s cCsHti|}|o.|id}|djo|idSndS(s) Fetch the requested URL from a CLF line iRRi(sHEADsGETN(RR~RR(R'RR~trequest((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt GetCLFLineHs  cCs?t|id\}}|pdSx|iD]}|ioti||i}n|i}|i o3|i o(|i ||_|i ||_nd}|io|i |}n|io|i |}n|pq1nt}|id|||tq1W|i|o|indS(sD Produces URLs from our data source, hands them in to the consumer. RNRU(RRRRRFR?RRRRRRRRRTR_RbR(R'RRRRR~R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRRs0       ( RRRR)RRRRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs   + t InputSitemapcBseZdZdefdYZdefdYZdefdYZdefdYZd efd YZd efd YZ d Z dZ dZ dZ dZdZdZRS(s Each Input class knows how to yield a set of URLs from a data source. This one handles Sitemap files and Sitemap index files. For the sake of simplicity in design (and simplicity in interfacing with the SAX package), we do not handle these at the same time, recursively. Instead we read an index file completely and make a list of Sitemap files, then go back and process each Sitemap. t _ContextBasecBsDeZdZdZdZdZdZdZdZRS(stBase class for context handlers in our SAX processing. A context handler is a class that is responsible for understanding one level of depth in the XML schema. The class knows what sub-tags are allowed, and doing any processing specific for the tag we're in. This base class is the API filled in by specific context handlers, all defined below. cCs||_d|_dS(sUInitialize with a sequence of the sub-tags that would be valid in this context.N(t _allowed_tagsRt _last_tag(R'tsubtags((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s cCs0||ij}|o ||_n d|_|S(s<Returns True iff opening a sub-tag is valid in this context.N(RRR(R'ttagtvalid((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt AcceptTags   cCstS(s:Returns True iff a blurb of text is valid in this context.(R(R'R7((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt AcceptTextscCsdS(s+The context is opening. Do initialization.N((R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytOpenscCsdS(s3The context is closing. Return our result, if any.N((R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytClosescCs|o tndS(s We're returning to this context after handling a sub-tag. This method is called with the result data from the sub-tag that just closed. Here in _ContextBase, if we ever see a result it means the derived child class forgot to override this method.N(tNotImplementedError(R'R8((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytReturns( RRRR)RRRRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs     t_ContextUrlSetcBseZdZdZRS(s3Context handler for the document node in a Sitemap.cCstii|ddS(NR(surl(RRR)(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s(RRRR)(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRst _ContextUrlcBs2eZdZdZdZdZdZRS(s,Context handler for a URL node in a Sitemap.cCs,tii|tid|_||_dS(sUInitialize this context handler with the callable consumer that wants our URLs.N(RRR)RTRRRt _consumer(R'R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s cCs"|i ptt|_dS(sInitialize the URL.N(RRzRT(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRscCs1|ipt|i|itd|_dS(s2Pass the URL to the consumer and reset it to None.N(RRzR RR(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRscCs6|ipt|o|ii|i|ndS(s7A value context has closed, absorb the data it gave us.N(RRzR_R(R'R8((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs(RRRR)RRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR s    t_ContextSitemapIndexcBs2eZdZdZdZdZdZRS(s7Context handler for the document node in an index file.cCs tii|dg|_dS(NR(ssitemap(RRR)t_loclist(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)scCs|i ptdS(sJust a quick verify of state.N(R Rz(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRscCs%|io|i}g|_|SdS(s$Return our list of accumulated URLs.N(R (R'R9((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs   cCs|o|ii|ndS(s0Getting a new loc URL, add it to the collection.N(R R0(R'R8((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs(RRRR)RRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR s    t_ContextSitemapcBs2eZdZdZdZdZdZRS(s5Context handler for a Sitemap entry in an index file.cCs tii|dd|_dS(NRURV(slocslastmod(RRR)Rt_loc(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)scCs|i ptdS(sJust a quick verify of state.N(RRz(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR scCs2|io|i}d|_|StiddS(sReturn our URL to our parent.s:In the Sitemap index file, a "sitemap" entry had no "loc".N(RRR3R4(R'R9((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs    cCs(|o|idjo ||_ndS(s2A value has closed. If it was a 'loc', absorb it.RUN(RR(R'R8((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs(RRRR)RRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR s    t _ContextValuecBs2eZdZdZdZdZdZRS(sContext handler for a single value. We return just the value. The higher level context has to remember what tag led into us.cCs tii|dd|_dS(N((RRR)Rt_text(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)%scCs+|io|i||_n ||_tS(s(Allow all text, adding it to our buffer.(RRb(R'R7((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR*s  cCs d|_dS(sInitialize our buffer.N(RR(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR3scCs-|i}d|_|o|i}n|S(sReturn what's in our buffer.N(RRR(R'R7((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR8s   (RRRR)RRR(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR s   cCstiiii|d|_d|_d|_d|_ d|_ t d|dgpdS|i d}|o[t i|}tii|o$tid|d|g|_qtid|ntiddS( sQInitialize with a dictionary of attributes from our entry in the config file.itSITEMAPRNsInput: From SITEMAP "%s"isCan not locate file "%s"s-Sitemap entries must have a "path" attribute.(RRthandlertContentHandlerR)Rt _pathlistt_currentt _contextst _contexts_idxt _contexts_stmRRRFR;RRRR3RHR(R'RR((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)Bs      cCstititig|_titi|tig|_|ipt |id}d|_ |i ||i|_ x"|idD]}|i |qWdS(sIn general: Produces URLs from our data source, hand them to the callable consumer. In specific: Iterate over our list of paths and delegate the actual processing to helper methods. This is a complexity no other data source needs to suffer. We are unique in that we can have files that tell us to bring in other files. Note the decision to allow an index file or not is made in this method. If we call our parser with (self._contexts == None) the parser will grab whichever context stack can handle the file. IE: index is allowed. If instead we set (self._contexts = ...) before parsing, the parser will only use the stack we specify. IE: index not allowed. iiN( RR R RRRR RRRzRRt _ProcessFile(R'RR((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR\s        cCs|ptt|d\}}|pdSy d|_tii||Wntj otid|nkt j otid|nHtii i j o2}tid||i |i |ifnX|i|o|indS(sBDo per-file reading/parsing/consuming for the file path passed in.RNis8An error in file "%s" made us abort reading the Sitemap.sCannot read from file "%s"s3XML error in the file "%s" (line %d, column %d): %s(RzRRRRtparseRR3RtIOErrort _exceptionstSAXParseExceptiont_linenumt_colnumt getMessageR(R'RRRte((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRs$    % c CsD|ipt|id}tii|}tii|}t}t|ti jo t }nx|D]}t i |}t id|dti|\}}}}} tii|} ti| } |oti| } n|o|ti| } n| o(|ii| t id| dqoqoWdS(sqGiven a list of URLs, munge them into our self._pathlist property. We do this by assuming all the files live in the same directory as the first file in the existing pathlist. That is, we assume a Sitemap index points to Sitemaps only in the same directory. This is not true in general, but will be true for any output produced by this script. is#Index points to Sitemap file at: %sis%Will attempt to read Sitemap file: %siN(RRzRRtnormpathtdirnameRR,R-R.RbRTR[R3RHR`RatbasenameRktunquoteRFR?RR0( R'turllistRtdirtwideRRdReRfRgR((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt_MungeLocationListIntoFiless*  cCs"|idjo|i o||io |ipt|djo|i|_q|djo |i|_tiddqtidtn|idjo7|i|ijo$|djotidtn|od}x|i D]s}|idjo>|i d djoqn|i d djoqqLn|o|d }n||}qW|oti d ||fqn|idjp|i|ii |oH|id |_|it |ijpt|i|iintid|tdS(sSAX processing, called per node in the config stream. As long as the new tag is legal in our current context, this becomes an Open call on one context deeper. iturlsett sitemapindexsFile is a Sitemap index.isAThe document appears to be neither a Sitemap nor a Sitemap index.s7A Sitemap index can not refer to another Sitemap index.Rtxmlnstxsis, sMDid not expect any attributes on any tag, instead tag "%s" had attributes: %sis)Can not accept tag "%s" where it appears.N(RRRRRzR3RHRRRtfindR4RRqR(R'RRR7R((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt startElementsD     #      * cCs|}|idjpt|i|ii}|id|_|idjo|i|ii|n,|o$|i|ijo|i|ndS(sSAX processing, called per node in the config stream. This becomes a call to Close on one context followed by a call to Return on the previous. iiN(RRzRRRRR)(R'Rtretval((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt endElementscCs[|idjp|i|ii| o,|iotid|tqWndS(sSAX processing, called when text values are read. Important to note that one single text value may be split across multiple calls of this method. is*Can not accept text "%s" where it appears.N(RRRRR3RR(R'R7((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt characterss  (RRRtobjectRRR R R RR)RRR)R/R1R2(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRzs 5 !"  $   3 tFilePathGeneratorcBs;eZdZdZdZdZdZdZRS(s^ This class generates filenames in a series, upon request. You can request any iteration number at any time, you don't have to go in order. Example of iterations for '/path/foo.xml.gz': 0 --> /path/foo.xml.gz 1 --> /path/foo1.xml.gz 2 --> /path/foo2.xml.gz _index.xml --> /path/foo_index.xml cCs(t|_d|_d|_d|_dS(N(Rtis_gzipRRt_prefixt_suffix(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)s   cCs"ti|}tii|}tii|i}|ptid|t St |}d}dddg}x/|D]'}|i |ot |}Pq~q~W|ptid|t S|i d|_ t |}||| |_ |||||!|_||||_tS(s6 Splits up a path into forms ready for recombination. s Couldn't parse the file path: %sis.xmls.xml.gzs.gzs8The path "%s" doesn't end in a supported file extension.(RFR;RRR"R$RR3RRRqRR5RR6R7Rb(R'Rtbasetlenbaset lensuffixtcompare_suffixtsuffixtlenpath((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytPreload s0     cCsV|i|i}t|tijo(|od|||ifS||iS||S(s/ Generates the iterations, as described above. s%s%d%s(RR6R,R-RPR7(R'tinstanceR((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt GeneratePathBs  cCsr||i}d}t|tijo2|od|||if}qe||i}n ||}ti|S(s7 Generates iterations, but as a URL instead of a path. s%s%d%sN(R6RR,R-RPR7RTR[(R'R?troot_urlRR0((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyt GenerateURLLs  cCsHti||i}ti||i}|t|}|d|S(s; Generates a wildcard that should match all our iterations t*(RTR[R6R7Rq(R'RARR9R<((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytGenerateWildURLZs(RRRR)R>R@RBRD(((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR4 s    " tPerURLStatisticscBs)eZdZdZdZdZRS(sD Keep track of some simple per-URL statistics, like file extension. cCs h|_dS(N(t _extensions(R'((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyR)gsc Cs|o|ioti|i\}}}}}|pdS|idoA|iido|idd|idRDRSRBtSITEINDEX_SUFFIXRTR]R,R-R<R.R(R'tall_good((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytValidateBasicConfigsF            % &cCsx!|iD]}|i|iq Wt|io|in|iptid|in|idjo|i n|i |i i dS(s1 Run over all the Inputs and ask them to Produce s0No URLs were recorded, writing an empty sitemap.iN( RORt ConsumeURLRqRQtFlushSetRUR3R4t WriteIndext NotifySearchRWRH(R'tinput((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytGenerates    cCs|pdS|i|i|pdSd}x2|iD]'}|i|}|djoPq7q7W|p |djp|idddddSti|i|i pti|i|i o|idddddS|i }|i i |og|i |}|djo8|d}||i |<|i|jo ||_qOn|idd dSd|i |<|ii||ii||it|itjo|indS( s All per-URL processing comes together here, regardless of Input. Here we run filters, remove duplicates, spill to disk as needed, etc. NRtFILTEREDRGisIGNORED (output file)iit DUPLICATE(RR[RRNRRHRRRURSRTRRPRLRVRQR0RWRJRqtMAXURLS_PER_SITEMAPRe(R'RRtaccepttfilterRMtdup((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRdsB          c Cstidd|iixv|iD]k}|i}|i|}|djo?d|i|<|ip$dt|t|i|_qq'q'W|i i |i }|pti dn|i d|_ tid|t |ifdd}d}y|i ioCtii|}t|d}tid |d |d d }nt|d }|itx|iD]}|i|q{W|it|i|o|ind}d}Wn$tj oti d |nXti|dg|_dS(s Flush the current set of URLs to the output. This is a little slow because we like to sort them all and normalize the priorities before dumping. s'Sorting and normalizing collected URLs.iiis%.4fs.Unexpected: Couldn't generate output filename.s&Writing Sitemap file "%s" with %d URLstwbtfileobjtfilenametmodetwtsCouldn't write out to file: %siN(R3RHRQRKRRPRXRRVRRR@RURORqRR5RRR$topentgziptGzipFileRtSITEMAP_HEADERRtSITEMAP_FOOTERRRtchmod(R'RRMRoRrRRR$((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRe6sL       ,  "     cCs+|iit}|ptidntid||ifdtti}yt |d}|i t xWt d|iD]C}|ii ||i}h|d6|d6}|i t|qW|i t|id }Wn$tj otid|nXti|d d S( s- Write the master index of all Sitemap files s4Unexpected: Couldn't generate output index filename.s(Writing index file "%s" with %d SitemapsiRtiRURVsCouldn't write out to file: %siN(RRR@RaR3RORHRURRRuRtSITEINDEX_HEADERRRBR[tSITEINDEX_ENTRYtSITEINDEX_FOOTERRRRRRz(R'RrRVtfdt mapnumbertmapurlt mapattributes((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRfos(     c Cs|iotidddStidddtifdY}ti}|t_|idjo|iit |i }n|iid|i }yti |}|i Wn1t j o%tid|tid nXxtD]}|d }|d }||| ISO 8601 time string.s%Y-%m-%dT%H:%M:%SZ(Rtstrftimetgmtime(tt((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pyRTscCsti}t|}y+tid|dtii||Wntj otid|nmtii i j o/}tid|i |i |i fn*tii ij otidnX|tijo|SdS(sF Sets up a new Sitemap object from the specified configuration file. sReading configuration file: %sis"Cannot read configuration file: %ss5XML error in the config file (line %d, column %d): %ssSome installs of Python 2.2 did not include complete support for XML. Please try upgrading your version of Python and re-running the script.N(R3RARMRHRRRRRRRRRR tSAXReaderNotAvailableR(t configpathR^RARR!((sk/home.restoring/home/endymion/informationwithoutborders.endymion.com/sitemap/sitemap_gen-1.4/sitemap_gen.pytCreateSitemapFromFileYs   !cCsh}d}d}d|d|d}ti|}x|D]}ye|i|i}|ido|d||dtj od SXq>W|S( s Parse command line flags per specified usage, pick off key, value pairs All flags of type "--key=value" will be processed as __flags[key] = value, "--option" will be processed as __flags[option] = option s--(?P\S*)[=](?P\S*)s--(?P