Better handling of hash marks, tags with multiple emails, and unquoted names in emails. See comments in the script. Signed-off-by: Andrey Albershteyn <aalbersh@xxxxxxxxxx> Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx> --- tools/git-contributors.py | 109 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/tools/git-contributors.py b/tools/git-contributors.py index 70ac8abb26c8ce65de336c5ae48abcfee39508b2..1a0f2b80e3dad9124b86b29f8507389ef91fe813 100755 --- a/tools/git-contributors.py +++ b/tools/git-contributors.py @@ -37,35 +37,106 @@ class find_developers(object): self.r1 = re.compile(regex1, re.I) + # regex to guess if this is a list of multiple addresses. + # Not sure why the initial "^.*" is needed here. + self.r2 = re.compile(r'^.*,[^,]*@[^@]*,[^,]*@', re.I) + + # regex to match on anything inside a pair of angle brackets + self.r3 = re.compile(r'^.*<(.+)>', re.I) + + def _handle_addr(self, addr): + # The next split removes everything after an octothorpe (hash + # mark), because someone could have provided an improperly + # formatted email address: + # + # Cc: stable@xxxxxxxxxxxxxxx # v6.19+ + # + # This, according to my reading of RFC5322, is allowed because + # octothorpes can be part of atom text. However, it is + # interpreted as if there weren't any whitespace + # ("stable@xxxxxxxxxxxxxxx#v6.19+"). The grammar allows for + # this form, even though this is not a correct Internet domain + # name. + # + # Worse, if you follow the format specified in the kernel's + # SubmittingPatches file: + # + # Cc: <stable@xxxxxxxxxxxxxxx> # v6.9 + # + # emailutils will not know how to parse this, and returns empty + # strings. I think this is because the angle-addr + # specification allows only whitespace between the closing + # angle bracket and the CRLF. + # + # Hack around both problems by ignoring everything after an + # octothorpe, no matter where it occurs in the string.
If + # someone has one in their name or the email address, too bad. + a = addr.split('#')[0] + + # emailutils can extract email addresses from headers that + # roughly follow the destination address field format: + # + # Reviewed-by: Bogus J. Simpson <bogus@xxxxxxxxxxx> + # Reviewed-by: "Bogus J. Simpson" <bogus@xxxxxxxxxxx> + # Reviewed-by: bogus@xxxxxxxxxxx + # + # Use it to extract the email address, because we don't care + # about the display name. + (name, addr) = email.utils.parseaddr(a) + if DEBUG: + print(f'A:{a}:NAME:{name}:ADDR:{addr}:') + if len(addr) > 0: + return addr + + # If emailutils fails to find anything, let's see if there's + # a sequence of characters within angle brackets and hope that + # is an email address. This works around things like: + # + # Reported-by: Xu, Wen <wen.xu@xxxxxxxxxx> + # + # Which should have had the name in quotations because there's + # a comma. + m = self.r3.match(a) + if m: + addr = m.expand(r'\g<1>') + if DEBUG: + print(f"M3:{addr}:M:{m}:") + return addr + + # No idea, just spit the whole thing out and hope for the best. + return a + def run(self, lines): addr_list = [] for line in lines: l = line.strip() - # emailutils can handle abominations like: - # - # Reviewed-by: Bogus J. Simpson <bogus@xxxxxxxxxxx> - # Reviewed-by: "Bogus J. Simpson" <bogus@xxxxxxxxxxx> - # Reviewed-by: bogus@xxxxxxxxxxx - # Cc: <stable@xxxxxxxxxxxxxxx> # v6.9 - # Tested-by: Moo Cow <foo@xxxxxxx> # powerpc + # First, does this line match any of the headers we + # know about? m = self.r1.match(l) if not m: continue - (name, addr) = email.utils.parseaddr(m.expand(r'\g<2>')) + rightside = m.expand(r'\g<2>') - # This last split removes anything after a hash mark, - # because someone could have provided an improperly - # formatted email address: - # - # Cc: stable@xxxxxxxxxxxxxxx # v6.19+ - # - # emailutils doesn't seem to catch this, and I can't - # fully tell from RFC2822 that this isn't allowed. 
I - # think it is because dtext doesn't forbid spaces or - # hash marks. - addr_list.append(addr.split('#')[0]) + n = self.r2.match(rightside) + if n: + # Break the line into an array of addresses, + # delimited by commas, then handle each + # address. + addrs = rightside.split(',') + if DEBUG: + print(f"0LINE:{rightside}:ADDRS:{addrs}:M:{n}") + for addr in addrs: + a = self._handle_addr(addr) + addr_list.append(a) + else: + # Otherwise treat the line as a single email + # address. + if DEBUG: + print(f"1LINE:{rightside}:M:{n}") + a = self._handle_addr(rightside) + addr_list.append(a) return sorted(set(addr_list)) -- 2.47.2