Python script to check LaTeX citations

Slug: bibtex-check

17556 characters 1771 words
import platform import os import sys from re import findall from time import sleep EXIT_SUCCESS = 0 EXIT_FAILURE = 1 EOF = -1 def getTxt(filepath, index=0) -> str: # get .txt content coding = ("utf-8", "gbk", "utf-16") # codings if 0 <= index < len(coding): # in the range try: with open(filepath, "r", encoding=coding[index]) as f: content = f.read() return content[1:] if content.startswith("\ufeff") else content # if utf-8 with BOM, remove BOM except (UnicodeError, UnicodeDecodeError): return getTxt(filepath, index + 1) # recursion except: return None else: return None # out of range def removeCommentLine(text) -> str: # remove comment lines lines = text.split("\n") for i, line in enumerate(lines): for j in range(len(line)): if line[j] == "%" and (0 == j or line[j - 1] != "\\"): lines[i] = lines[i][:j] return "\n".join(lines) def clearScreen(fakeClear=120): if sys.stdin.isatty(): # is at a console if platform.system().lower() == "linux": os.system("clear") else: try: print("\n" * int(fakeClear)) except: print("\n" * 120) else: try: print("\n" * int(fakeClear)) except: print("\n" * 120) def press_any_key_to_continue(): input("Press any key to continue...") def preExit(countdownTime=5) -> None: try: cntTime = int(countdownTime) length = len(str(cntTime)) except: return print() while cntTime > 0: print("\rProgram ended, exiting in {{0:>{0}}}} second(s). ".format(length).format(cntTime), end="") try: sleep(1) except: print("\rProgram ended, exiting in {{0:>{0}}}} second(s). ".format(length).format(0)) return cntTime -= 1 print("\rProgram ended, exiting in {{0:>{0}}}} second(s). ".format(length).format(cntTime)) def loadFolder(latex_folder) -> dict: dicts = {} for root, dirs, files in os.walk(latex_folder): for f in files: filepath = os.path.join(root, f) if os.path.isfile(filepath): if filepath.lower().endswith(".tex"): dicts.setdefault("tex", []) dicts["tex"].append(filepath) elif filepath.lower().endswith(".bib"): dicts.setdefault("bib", []) dicts["bib"].append(filepath) return dicts def checkLabels(texFilepaths, isDebug=False) -> bool: clearScreen() if type(texFilepaths) not in (tuple, list) or not texFilepaths: print("As no tex files are found, the checking cannot work. ") print("Please press any key to go back. ") press_any_key_to_continue() return None content = "" for texFilepath in texFilepaths: text = getTxt(texFilepath) if text is None: print("Read tex file \"{0}\" failed. ".format(texFilepath)) else: content += removeCommentLine(text) + "\n" labels = [item[item.index("{") + 1:-1] for item in findall("\\\\label\\{.+?\\}", content)] refs = [item[item.index("{") + 1:-1] for item in findall("\\\\ref\\{.+?\\}", content)] + [item[item.index("{") + 1:-1] for item in findall("\\\\eqref\\{.+?\\}", content)] for i in range(len(refs) - 1, -1, -1): if "," in refs[i]: refs += [item.strip() for item in refs[i].split(",")] del refs[i] if isDebug: print("labels =", labels) print("refs =", refs) s = set() repeated_label = set() undefined_label = set() unreferred_label = set() for label in labels: if label in s: repeated_label.add(label) else: s.add(label) if label not in refs: unreferred_label.add(label) for ref in refs: if ref not in labels: undefined_label.add(ref) if len(s) == 1: print("This is the label checking. There is 1 label in total. ") elif len(s) > 1: print("This is the label checking. There are {0} labels in total. ".format(len(s))) else: print("This is the label checking. There are no labels found. ") print() if len(repeated_label) == 1: print("There is a repeated label found: \"{0}\". ".format(*repeated_label)) elif len(repeated_label) > 1: print("There are {0} repeated labels found. The details are as follows. \n{1}".format(len(repeated_label), repeated_label)) else: print("No repeated labels are found. ") if len(undefined_label) == 1: print("There is an undefined label found: \"{0}\". ".format(*undefined_label)) elif len(undefined_label) > 1: print("There are {0} undefined labels found. The details are as follows. \n{1}".format(len(undefined_label), undefined_label)) else: print("No undefined labels are found. ") if len(unreferred_label) == 1: print("There is an unreferred label found: \"{0}\". ".format(*unreferred_label)) elif len(unreferred_label) > 1: print("There are {0} unreferred labels found. The details are as follows. \n{1}".format(len(unreferred_label), unreferred_label)) else: print("No unreferred labels are found. ") print() if input("Would you like to check again (input \"Y\" and enter to check again): ").upper() == "Y": return checkLabels(texFilepaths, isDebug=isDebug) else: return not any([repeated_label, undefined_label, unreferred_label]) def checkCitations(texFilepaths, isDebug=False) -> bool: clearScreen() if type(texFilepaths) not in (tuple, list) or not texFilepaths: print("As no tex files are found, the checking cannot work. ") print("Please press any key to go back. ") press_any_key_to_continue() return None content = "" for texFilepath in texFilepaths: text = getTxt(texFilepath) if text is None: print("Read tex file \"{0}\" failed. ".format(texFilepath)) else: content += removeCommentLine(text) + "\n" cites = [item[item.index("{") + 1:-1] for item in findall("\\\\cite\\{.+?\\}", content)] for i in range(len(cites) - 1, -1, -1): if "," in cites[i]: cites += [item.strip() for item in cites[i].split(",")] del cites[i] dicts = {} repeated_entry = [] for line in content.split("\n"): targets = findall("\\\\bibitem\\{.+?\\}", line) if len(targets): target = targets[0] key = target[target.index("{") + 1:-1] if key in dicts: repeated_entry.append(key) else: dicts[key] = line[len(target):] space_start = [] multiple_space = [] end_dot = [] repeated_content = [] undefined_entry = set() uncited_entry = set() for key in list(dicts.keys()): if not dicts[key].startswith(" "): space_start.append(key) if dicts[key][:2] in (" ", " \t"): # do not use elif multiple_space.append(key) if not dicts[key].endswith(". "): # do not use elif end_dot.append(key) for key in list(dicts.keys()): dicts[key] = dicts[key].strip() reverse_dict = {} for key in list(dicts.keys()): reverse_dict.setdefault(dicts[key], []) reverse_dict[dicts[key]].append(key) for key in list(reverse_dict.keys()): if len(reverse_dict[key]) > 2: repeated_content.append(reverse_dict[key]) for key in list(dicts.keys()): if key not in cites: uncited_entry.add(key) for cite in cites: if cite not in dicts: undefined_entry.add(cite) if isDebug: print("cites =", cites) print("dicts =", dicts) print("This is the citation checking. The result is as follows. ") print() if len(repeated_entry) == 1: print("There is a repeated bibitem key: \"{0}\". ".format(*repeated_entry)) elif len(repeated_entry) > 1: print("There are {0} repeated bibitem keys. The details are as follows. \n{1}".format(len(repeated_entry), repeated_entry)) else: print("No repeated bibitem key is found. ") if len(space_start) == 1: print("There is a bibitem entry not starting with a space: \"{0}\". ".format(*space_start)) elif len(space_start) > 1: print("There are {0} bibitem entries not starting with a space. The details are as follows. \n{1}".format(len(space_start), space_start)) else: print("All bibitem entries start with a space. ") if len(multiple_space) == 1: print("There is a bibitem entry starting with multiple spaces: \"{0}\". ".format(*multiple_space)) elif len(multiple_space) > 1: print("There are {0} bibitem entries starting with multiple spaces. The details are as follows. \n{1}".format(len(multiple_space), multiple_space)) else: print("No bibitem entry starts with multiple spaces. ") if len(end_dot) == 1: print("There is a bibitem entry not ending with a dot: \"{0}\". ".format(*end_dot)) elif len(end_dot) > 1: print("There are {0} bibitem entries not ending with a dot. The details are as follows. \n{1}".format(len(end_dot), end_dot)) else: print("All bibitem entries end with a dot. ") if len(repeated_content) == 1: print("There are repeated bibitem entries. The details are as follows. \n{0}".format(repeated_content[0])) elif len(repeated_content) > 1: print("There are {0} groups of repeated bibitem entries. The details are as follows. \n{1}".format(len(repeated_content), repeated_content)) else: print("No repeated bibitem entry is found. ") if len(undefined_entry) == 1: print("There is an undefined citation key: \"{0}\". ".format(*undefined_entry)) elif len(undefined_entry) > 1: print("There are {0} undefined citation keys. The details are as follows. \n{1}".format(len(undefined_entry), undefined_entry)) else: print("No undefined citation key is found. ") if len(uncited_entry) == 1: print("There is an uncited bibitem entry: \"{0}\". ".format(*uncited_entry)) elif len(uncited_entry) > 1: print("There are {0} uncited bibitem entries. The details are as follows. \n{1}".format(len(uncited_entry), uncited_entry)) else: print("No uncited bibitem entry is found. ") print() if input("Would you like to check again (input \"Y\" and enter to check again): ").upper() == "Y": return checkCitations(texFilepaths, isDebug=isDebug) else: return not any([repeated_entry, space_start, multiple_space, end_dot, repeated_content, undefined_entry, uncited_entry]) def checkBibtex(bibFilepaths, isDebug=False) -> bool: clearScreen() if type(bibFilepaths) not in (tuple, list) or not bibFilepaths: print("As no bib files are found, the checking cannot work. ") print("Please press any key to go back. ") press_any_key_to_continue() return None content = "" for bibFilepath in bibFilepaths: text = getTxt(bibFilepath) if text is None: print("Read bib file \"{0}\" failed. ".format(bibFilepath)) else: content += removeCommentLine(text) + "\n" bibitems = [item[item.index("{") + 1:item.index(",")] for item in findall("@.+?\\{.+?,", content)] if isDebug: print("bibitems =", bibitems) s = set() repeated_bibitem = set() for bibitem in bibitems: if bibitem in s: repeated_bibitem.add(bibitem) else: s.add(bibitem) if len(s) == 1: print("This is the bibtex checking. There is 1 bibitem in total. ") elif len(s) > 1: print("This is the bibtex checking. There are {0} bibitems in total. ".format(len(s))) else: print("This is the bibtex checking. There are no bibitems found. ") print() if len(repeated_bibitem) == 1: print("There is a repeated bibitem key: \"{0}\". ".format(*repeated_bibitem)) elif len(repeated_bibitem) > 1: print("There are {0} repeated bibitem keys. The details are as follows. \n{1}".format(len(repeated_bibitem), repeated_bibitem)) else: print("No repeated bibitem key is found. ") print() if input("Would you like to check again (input \"Y\" and enter to check again): ").upper() == "Y": return checkBibtex(bibFilepaths, isDebug=isDebug) else: return not any([repeated_bibitem]) def citationSurvey(texFilepaths, isDebug=False) -> dict: clearScreen() if type(texFilepaths) not in (tuple, list) or not texFilepaths: print("As no tex files are found, the checking cannot work. ") print("Please press any key to go back. ") press_any_key_to_continue() return None dicts = {} for texFilepath in texFilepaths: text = getTxt(texFilepath) if text is None: print("Read tex file \"{0}\" failed. ".format(texFilepath)) else: lines = removeCommentLine(text).split("\n") section = "" for line in lines: if len(findall("\\\\section\\{.+?\\}", line)): section = findall("\\\\section\\{.+?\\}", line)[0][9:-1] elif len(findall("\\\\section\\*\\{.+?\\}", line)): section = findall("\\\\section\\*\\{.+?\\}", line)[0][10:-1] elif len(findall("\\\\chapter\\{.+?\\}", line)): section = findall("\\\\chapter\\{.+?\\}", line)[0][9:-1] elif len(findall("\\\\chapter\\*\\{.+?\\}", line)): section = findall("\\\\chapter\\*\\{.+?\\}", line)[0][10:-1] if not section: continue for cite in findall("\\\\cite\\{.+?\\}", line): cite = cite[cite.index("{") + 1:-1] for subCite in cite.split(","): dicts.setdefault(section, {}) dicts[section].setdefault(subCite.strip(), 0) dicts[section][subCite.strip()] += 1 for section in list(dicts.keys()): totalCount = 0 for subCite in list(dicts[section].keys()): totalCount += dicts[section][subCite] dicts[section] = { "citation_count": len(dicts[section]), "total_count": totalCount, "details": dicts[section], } if isDebug: print("dicts =", dicts) print("This is the citation survey. The result is as follows. ") print() if len(dicts): sections = sorted(list(dicts.keys())) maxSectionLen = max([len(section) for section in sections]) for section in sections: print(section.ljust(maxSectionLen + 2) + "contains {0} citations with {1} citation count(s). ".format(dicts[section]["citation_count"], dicts[section]["total_count"])) else: print("No citation is found. ") print() if input("Would you like to check again (input \"Y\" and enter to check again): ").upper() == "Y": return citationSurvey(texFilepaths, isDebug=isDebug) else: return dicts def mainBoard(texFilepaths, bibFilepaths, isDebug=False): clearScreen() if isDebug: print("texFilepaths =", texFilepaths) print("bibFilepaths =", bibFilepaths) print("Main Board\n" + "-" * 10) print("1. Reload files") print("2. Check labels") print("3. Check citations") print("4. Check bibtex") print("5. Citation survey") print("0. Exit") print() print("Please input the corresponding number to select. ", end="") print("The default option is \"0\". ", end="") userInput = input().strip() print() if userInput == "1": return elif userInput == "2": result = checkLabels(texFilepaths, isDebug=isDebug) print("Check label result:", result) elif userInput == "3": result = checkCitations(texFilepaths, isDebug=isDebug) print("Check citation result:", result) elif userInput == "4": result = checkBibtex(bibFilepaths, isDebug=isDebug) print("Check bibtex result:", result) elif userInput == "5": result = citationSurvey(texFilepaths, isDebug=isDebug) print("Citation survey result:", result) else: return def main(): latex_folder = os.getcwd() isDebug = True # Change this to False in production while True: clearScreen() if isDebug: print("latex_folder =", latex_folder) print("Welcome to use the Label/Citation Checker!\n" + "-" * 10) print("Please enter the folder path where the LaTeX files are located.") userInput = input("Folder path (default is the current folder): ").strip() if userInput: latex_folder = userInput else: latex_folder = os.getcwd() file_dict = loadFolder(latex_folder) mainBoard(file_dict.get("tex", []), file_dict.get("bib", []), isDebug=isDebug) if __name__ == "__main__": main()
URL: https://ib.bsb.br/bibtex-check
Reference: https://github.com/BatchClayderman/checkCite