U .e$+@sddlmZmZmZddlmZddlmZddl m Z ddl m Z ddl m Z m Z ddl mZmZmZdd l mZmZdd l mZdd lmZdd lmZee ZGd ddeZdS))absolute_importdivisionunicode_literals)unichr)deque)spaceCharacters)entities) asciiLettersasciiUpper2Lower)digits hexDigitsEOF) tokenTypes tagTokenTypes)replacementCharacters)HTMLInputStream)TriecsdeZdZdZdfdd ZddZddZdd d Zd d ZddZ ddZ ddZ ddZ ddZ ddZddZddZddZd d!Zd"d#Zd$d%Zd&d'Zd(d)Zd*d+Zd,d-Zd.d/Zd0d1Zd2d3Zd4d5Zd6d7Zd8d9Zd:d;Zdd?Z!d@dAZ"dBdCZ#dDdEZ$dFdGZ%dHdIZ&dJdKZ'dLdMZ(dNdOZ)dPdQZ*dRdSZ+dTdUZ,dVdWZ-dXdYZ.dZd[Z/d\d]Z0d^d_Z1d`daZ2dbdcZ3dddeZ4dfdgZ5dhdiZ6djdkZ7dldmZ8dndoZ9dpdqZ:drdsZ;dtduZdzd{Z?d|d}Z@d~dZAddZBddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLZMS) HTMLTokenizera  This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. Nc sFt|f||_||_d|_g|_|j|_d|_d|_t t | dS)NF) rstreamparserZ escapeFlagZ lastFourChars dataStatestateescape currentTokensuperr__init__)selfrrkwargs __class__C/usr/lib/python3.8/site-packages/pip/_vendor/html5lib/_tokenizer.pyr"szHTMLTokenizer.__init__ccsPtg|_|rL|jjr6td|jjddVq|jr |jVq6q dS)z This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. ParseErrorrtypedataN)r tokenQueuerrerrorsrpoppopleftrr!r!r"__iter__1s  zHTMLTokenizer.__iter__c %Cst}d}|rt}d}g}|j}||krH|tk rH|||j}q"td||}|tkrt|}|j t ddd|idnbd|krd ksn|d krd }|j t ddd|idn d |krd ksnd|krdksnd|krdksnd|kr,dksn|t ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d g#kr|j t ddd|idz t |}Wn>t k r|d6}t d|d?Bt d7|d8@B}YnX|d9kr|j t dd:d;|j||S)r#z'expected-tag-name-but-got-right-bracketr$rOz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerHT)rr:markupDeclarationOpenStatercloseTagOpenStater rr tagNameStater'r;rrAbogusCommentStaterpr!r!r"rcis@           zHTMLTokenizer.tagOpenStatecCs|j}|tkr0td|gdd|_|j|_n|dkrX|jtddd|j |_nn|t kr|jtddd|jtd d d|j |_n0|jtdd d |id |j ||j |_dS)NrZFr%rYr&r[rtr#z*expected-closing-tag-but-got-right-bracketr$z expected-closing-tag-but-got-eofrOr'r;rrrscriptDataDoubleEscapedStaterrr rArpr!r!r"rs    z.HTMLTokenizer.scriptDataDoubleEscapeStartStatecCs|j}|dkr2|jtddd|j|_n|dkrZ|jtddd|j|_nt|dkr|jtddd|jtdddn>|tkr|jtdd d|j |_n|jtd|dd S NrrOr$rHr_r#r`r2eof-in-script-in-scriptT) rr:r'r;r scriptDataDoubleEscapedDashStater(scriptDataDoubleEscapedLessThanSignStaterrrpr!r!r"rs*        z*HTMLTokenizer.scriptDataDoubleEscapedStatecCs|j}|dkr2|jtddd|j|_n|dkrZ|jtddd|j|_n|dkr|jtddd|jtddd|j|_nF|t kr|jtdd d|j |_n|jtd|d|j|_d Sr) rr:r'r;r$scriptDataDoubleEscapedDashDashStaterrrrrrpr!r!r"rs.        z.HTMLTokenizer.scriptDataDoubleEscapedDashStatecCs|j}|dkr*|jtdddn|dkrR|jtddd|j|_n|dkrz|jtddd|j|_n|dkr|jtddd|jtdd d|j|_nF|t kr|jtdd d|j |_n|jtd|d|j|_d S) NrrOr$rHrtr_r#r`r2rT) rr:r'r;rrrrorrrrpr!r!r"rs2        z2HTMLTokenizer.scriptDataDoubleEscapedDashDashStatecCsP|j}|dkr8|jtdddd|_|j|_n|j||j |_dS)NrsrOr$r/T) rr:r'r;rrscriptDataDoubleEscapeEndStaterrArrpr!r!r"r0s   z6HTMLTokenizer.scriptDataDoubleEscapedLessThanSignStatecCs|j}|ttdBkrR|jtd|d|jdkrH|j |_ q|j |_ nB|t kr|jtd|d|j|7_n|j ||j |_ dSr)rr:rr>r'r;rrrrrrr rArpr!r!r"r;s    z,HTMLTokenizer.scriptDataDoubleEscapeEndStatecCs0|j}|tkr$|jtdn|tkrJ|jd|dg|j|_n|dkr\| n|dkrn|j |_n|dkr|j t ddd |jd|dg|j|_n|d kr|j t dd d |jdd dg|j|_nF|t kr|j t dd d |j|_n|jd|dg|j|_dS)NTr&r/rtrs)'"rMrHr##invalid-character-in-attribute-namer$r_r`r2z#expected-attribute-name-but-got-eof)rr:rrdr rr;attributeNameStaterr^r}r'rrrrpr!r!r"r|Ks<           z&HTMLTokenizer.beforeAttributeNameStatecCs|j}d}d}|dkr&|j|_n.|tkr\|jddd||jtd7<d}n|dkrjd}n|tkr||j|_n|dkr|j |_n|d kr|j t d d d |jdddd 7<d}n|dkr |j t d dd |jddd|7<d}nH|t kr6|j t d dd |j|_n|jddd|7<d}|r|jdddt|jddd<|jdddD]>\}}|jddd|kr|j t d dd qҐq|r|dS)NTFrMr&rJrrtrsr_r#r`r$r2rrrHrzeof-in-attribute-namezduplicate-attribute)rr:beforeAttributeValueStaterr rrdrafterAttributeNameStater}r'r;rrrr\r r^)rr&ZleavingThisStateZ emitTokenrY_r!r!r"ris^             z HTMLTokenizer.attributeNameStatecCsD|j}|tkr$|jtdn|dkr8|j|_n|dkrJ|n|tkrp|jd |dg|j |_n|dkr|j |_n|dkr|j t dd d |jd d dg|j |_n|d kr|j t dd d |jd |dg|j |_nF|tkr$|j t ddd |j|_n|jd |dg|j |_dS)NTrMrtr&r/rsr_r#r`r$r2rz&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)rr:rrdrrr^r rr;rr}r'rrrrpr!r!r"rs@            z%HTMLTokenizer.afterAttributeNameStatecCsh|j}|tkr$|jtdn@|dkr8|j|_n,|dkrX|j|_|j|n |dkrj|j|_n|dkr|j t ddd| n|d kr|j t dd d|j d d d d7<|j|_n|dkr|j t ddd|j d d d |7<|j|_nL|tkrB|j t ddd|j|_n"|j d d d |7<|j|_dS)NTrrGrrtr#z.expected-attribute-value-but-got-right-bracketr$r_r`r&rJrr2)rMrH`z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eof)rr:rrdattributeValueDoubleQuotedStaterattributeValueUnQuotedStaterAattributeValueSingleQuotedStater'r;rr^rrrrpr!r!r"rsF             z'HTMLTokenizer.beforeAttributeValueStatecCs|j}|dkr|j|_n|dkr0|dn|dkrj|jtddd|jddd d 7<nN|t kr|jtdd d|j |_n&|jddd ||j d 7<d S)NrrGr_r#r`r$r&rJrr2z#eof-in-attribute-value-double-quote)rrGr_T rr:afterAttributeValueStaterrXr'r;rrrrrdrpr!r!r"rs&       z-HTMLTokenizer.attributeValueDoubleQuotedStatecCs|j}|dkr|j|_n|dkr0|dn|dkrj|jtddd|jddd d 7<nN|t kr|jtdd d|j |_n&|jddd ||j d 7<d S)NrrGr_r#r`r$r&rJrr2z#eof-in-attribute-value-single-quote)rrGr_Trrpr!r!r"rs&       z-HTMLTokenizer.attributeValueSingleQuotedStatecCs|j}|tkr|j|_n|dkr0|dn|dkrB|n|dkr||jt ddd|j ddd |7<n|d kr|jt dd d|j ddd d 7<nV|t kr|jt dd d|j |_n.|j ddd ||j tdtB7<dS)NrGrt)rrrMrHrr#z0unexpected-character-in-unquoted-attribute-valuer$r&rJrr_r`r2z eof-in-attribute-value-no-quotes)rGrtrrrMrHrr_T)rr:rr|rrXr^r'r;rrrrrdr>rpr!r!r"rs4         z)HTMLTokenizer.attributeValueUnQuotedStatecCs|j}|tkr|j|_n|dkr.|np|dkr@|j|_n^|tkrt|j t ddd|j ||j |_n*|j t ddd|j ||j|_dS)Nrtrsr#z$unexpected-EOF-after-attribute-valuer$z*unexpected-character-after-attribute-valueT) rr:rr|rr^r}rr'r;rrArrpr!r!r"r s&         z&HTMLTokenizer.afterAttributeValueStatecCs|j}|dkr&d|jd<|n^|tkrZ|jtddd|j||j |_ n*|jtddd|j||j |_ dS)NrtTr[r#z#unexpected-EOF-after-solidus-in-tagr$z)unexpected-character-after-solidus-in-tag) rr:rr^rr'r;rrArrr|rpr!r!r"r}4s         z&HTMLTokenizer.selfClosingStartTagStatecCsD|jd}|dd}|jtd|d|j|j|_dS)Nrtr_r2Commentr$T) rrdreplacer'r;rr:rrrpr!r!r"ryFs    zHTMLTokenizer.bogusCommentStatecCs|jg}|ddkrR||j|ddkrPtddd|_|j|_dSn|ddkrd}dD](}||j|d|krfd }qqf|rtd ddddd |_|j|_dSn|dd krD|jdk rD|jj j rD|jj j dj |jj j krDd}d D].}||j|d|krd }q2q|rD|j |_dS|jtddd|rt|j|qZ|j|_dS)NrJrrr/r$T)dD))oOrDCtTyYpPeEFZDoctype)r%rYpublicIdsystemIdcorrect[)rrArrrr#zexpected-dashes-or-doctype)rr:r;rrcommentStartStater doctypeStaterZtreeZ openElements namespaceZdefaultNamespacecdataSectionStater'rAr)ry)rrCmatchedexpectedr!r!r"rvUs\       z(HTMLTokenizer.markupDeclarationOpenStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd d|j|j|j|_nP|t kr|jtdd d|j|j|j|_n|jd|7<|j |_d S) Nrr_r#r`r$r&r2rtincorrect-commenteof-in-commentT) rr:commentStartDashStaterr'r;rrrr commentStaterpr!r!r"rs.       zHTMLTokenizer.commentStartStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd d|j|j|j|_nT|t kr|jtdd d|j|j|j|_n|jdd|7<|j |_d S) Nrr_r#r`r$r&-�rtrrT) rr:commentEndStaterr'r;rrrrrrpr!r!r"rs.       z#HTMLTokenizer.commentStartDashStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<nT|tkr|jtddd|j|j|j |_n|jd||j d 7<d S) Nrr_r#r`r$r&r2r)rr_T) rr:commentEndDashStaterr'r;rrrrrdrpr!r!r"rs$       zHTMLTokenizer.commentStatecCs|j}|dkr|j|_n|dkrV|jtddd|jdd7<|j|_nT|t kr|jtddd|j|j|j |_n|jdd|7<|j|_d S) Nrr_r#r`r$r&rzeof-in-comment-end-dashT) rr:rrr'r;rrrrrrpr!r!r"rs$      z!HTMLTokenizer.commentEndDashStatecCs,|j}|dkr*|j|j|j|_n|dkrd|jtddd|jdd7<|j|_n|dkr|jtdd d|j |_n|d kr|jtdd d|jd|7<nj|t kr|jtdd d|j|j|j|_n4|jtdd d|jdd|7<|j|_dS)Nrtr_r#r`r$r&u--�rrz,unexpected-bang-after-double-dash-in-commentrz,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T) rr:r'r;rrrrrcommentEndBangStaterrpr!r!r"rs@          zHTMLTokenizer.commentEndStatecCs|j}|dkr*|j|j|j|_n|dkrN|jdd7<|j|_n|dkr|jtddd|jdd 7<|j |_nT|t kr|jtdd d|j|j|j|_n|jdd|7<|j |_d S) Nrtrr&z--!r_r#r`r$u--!�zeof-in-comment-end-bang-stateT) rr:r'r;rrrrrrrrpr!r!r"rs,       z!HTMLTokenizer.commentEndBangStatecCs|j}|tkr|j|_nj|tkr\|jtdddd|j d<|j|j |j |_n*|jtddd|j ||j|_dS)Nr#!expected-doctype-name-but-got-eofr$Frzneed-space-after-doctypeT) rr:rbeforeDoctypeNameStaterrr'r;rrrrArpr!r!r"r s        zHTMLTokenizer.doctypeStatecCs|j}|tkrn|dkrT|jtdddd|jd<|j|j|j|_n|dkr|jtdddd |jd <|j |_nR|t kr|jtdd dd|jd<|j|j|j|_n||jd <|j |_d S) Nrtr#z+expected-doctype-name-but-got-right-bracketr$Frr_r`r2rYrT) rr:rr'r;rrrrdoctypeNameStaterrpr!r!r"rs4           z$HTMLTokenizer.beforeDoctypeNameStatecCs|j}|tkr2|jdt|jd<|j|_n|dkrh|jdt|jd<|j |j|j |_n|dkr|j t ddd|jdd7<|j |_nh|t kr|j t dddd |jd <|jdt|jd<|j |j|j |_n|jd|7<d S) NrYrtr_r#r`r$r2zeof-in-doctype-nameFrT)rr:rrr\r afterDoctypeNameStaterr'r;rrrrrpr!r!r"r6s0        zHTMLTokenizer.doctypeNameStatecCsH|j}|tkrn.|dkr8|j|j|j|_n |tkrd|jd<|j ||jt ddd|j|j|j|_n|dkrd}d D]}|j}||krd}qq|r|j |_dSnD|d kr d}d D]}|j}||krd}qq|r |j |_dS|j ||jt dd d |idd|jd<|j |_dS)NrtFrr#eof-in-doctyper$rT))uU)bB)lL)iIrsS)rrrr)mMz*expected-space-or-right-bracket-in-doctyper&r1)rr:rr'r;rrrrrArafterDoctypePublicKeywordStateafterDoctypeSystemKeywordStatebogusDoctypeState)rr&rrr!r!r"rOsT            z#HTMLTokenizer.afterDoctypeNameStatecCs|j}|tkr|j|_n|dkrP|jtddd|j||j|_nT|t kr|jtdddd|j d<|j|j |j |_n|j||j|_dS N)rrr#unexpected-char-in-doctyper$rFrT) rr:r"beforeDoctypePublicIdentifierStaterr'r;rrArrrrpr!r!r"rs&         z,HTMLTokenizer.afterDoctypePublicKeywordStatecCs|j}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jt dddd |jd <|j|j|j |_nh|t kr|jt dd dd |jd <|j|j|j |_n(|jt dd dd |jd <|j |_d S)Nrr/rrrtr#unexpected-end-of-doctyper$FrrrT) rr:rr(doctypePublicIdentifierDoubleQuotedStater(doctypePublicIdentifierSingleQuotedStater'r;rrrrrpr!r!r"rs:             z0HTMLTokenizer.beforeDoctypePublicIdentifierStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrr_r#r`r$rr2rtrFrrT rr:!afterDoctypePublicIdentifierStaterr'r;rrrrrpr!r!r"rs0         z6HTMLTokenizer.doctypePublicIdentifierDoubleQuotedStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrr_r#r`r$rr2rtrFrrTrrpr!r!r"rs0         z6HTMLTokenizer.doctypePublicIdentifierSingleQuotedStatecCs |j}|tkr|j|_n|dkr<|j|j|j|_n|dkrn|jt dddd|jd<|j |_n|dkr|jt dddd|jd<|j |_nh|t kr|jt dd dd |jd <|j|j|j|_n(|jt dddd |jd <|j |_d S) Nrtrr#rr$r/rrrFrT)rr:r-betweenDoctypePublicAndSystemIdentifiersStaterr'r;rrr(doctypeSystemIdentifierDoubleQuotedState(doctypeSystemIdentifierSingleQuotedStaterrrpr!r!r"rs>              z/HTMLTokenizer.afterDoctypePublicIdentifierStatecCs|j}|tkrn|dkr4|j|j|j|_n|dkrPd|jd<|j|_n|dkrld|jd<|j |_nh|t kr|jt dddd |jd <|j|j|j|_n(|jt dd dd |jd <|j |_d S) Nrtrr/rrr#rr$FrrT) rr:rr'r;rrrrrrrrrpr!r!r"rs2           z;HTMLTokenizer.betweenDoctypePublicAndSystemIdentifiersStatecCs|j}|tkr|j|_n|dkrP|jtddd|j||j|_nT|t kr|jtdddd|j d<|j|j |j |_n|j||j|_dSr) rr:r"beforeDoctypeSystemIdentifierStaterr'r;rrArrrrpr!r!r"rs&         z,HTMLTokenizer.afterDoctypeSystemKeywordStatecCs|j}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jt dddd |jd <|j|j|j |_nh|t kr|jt dd dd |jd <|j|j|j |_n(|jt dddd |jd <|j |_d S) Nrr/rrrtr#rr$FrrT) rr:rrrrrr'r;rrrrrpr!r!r"r/s:             z0HTMLTokenizer.beforeDoctypeSystemIdentifierStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrr_r#r`r$rr2rtrFrrT rr:!afterDoctypeSystemIdentifierStaterr'r;rrrrrpr!r!r"rLs0         z6HTMLTokenizer.doctypeSystemIdentifierDoubleQuotedStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrr_r#r`r$rr2rtrFrrTrrpr!r!r"rds0         z6HTMLTokenizer.doctypeSystemIdentifierSingleQuotedStatecCs|j}|tkrn~|dkr4|j|j|j|_n^|tkrt|jt dddd|jd<|j|j|j|_n|jt ddd|j |_dS) Nrtr#rr$FrrT) rr:rr'r;rrrrrrrpr!r!r"r|s$      z/HTMLTokenizer.afterDoctypeSystemIdentifierStatecCsZ|j}|dkr*|j|j|j|_n,|tkrV|j||j|j|j|_ndS)NrtT) rr:r'r;rrrrrArpr!r!r"rs    zHTMLTokenizer.bogusDoctypeStatecCsg}||jd||jd|j}|tkr>qq|dksJt|ddddkrv|ddd|d<qq||qd|}|d}|dkrt|D]}|j t d d d q| dd }|r|j t d |d |j |_ dS)N]rtrJz]]r/r_rr#r`r$r2rOT)r;rrdr:rAssertionErrorr=countranger'rrrr)rr&r:Z nullCountrr!r!r"rs2          zHTMLTokenizer.cdataSectionState)N)NF)N__name__ __module__ __qualname____doc__rr,rFrWrXr^rrbrjrhrmrorqrcrwrxrirrrlrrrnrrrrrrrrrrrrrrrrr|rrrrrrrr}ryrvrrrrrrrrrrrrrrrrrrrrrrr __classcell__r!r!rr"rs H P#         6 "-3rN)Z __future__rrrZpip._vendor.sixrr? collectionsrZ constantsrr r r r r rrrrZ _inputstreamrZ_trierrPobjectrr!r!r!r"s