diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index fefbba0f..8615d736 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -27,12 +27,24 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_lastest_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"),
+]


 def parse_arguments():
     """
     Parse command-line options, returns parsed argument namespace.
     """
+    global QUARTER
     LOGGER.info("Parsing command-line options")
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
@@ -48,15 +60,23 @@ def parse_arguments():
     parser.add_argument(
         "--enable-git",
         action="store_true",
-        help="Enable git actions such as fetch, merge, add, commit, and push"
-        " (default: False)",
+        help="Enable git actions such as fetch, merge, add, commit, and push",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if processed files already exist",
     )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
     if args.quarter != QUARTER:
-        global PATHS
+        global FILE_PATHS, PATHS
+        FILE_PATHS = shared.paths_list_update(
+            LOGGER, FILE_PATHS, QUARTER, args.quarter
+        )
         PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
+        QUARTER = args.quarter
     args.logger = LOGGER
     args.paths = PATHS
     return args
@@ -308,6 +328,7 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
+    shared.check_for_data_files(args, FILE_PATHS, QUARTER)

     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
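
The gcs_process.py hunks above swap the script's per-file existence checks for a module-level FILE_PATHS list that the new shared.check_for_data_files() validates once, before any processing starts (the helper itself appears in the shared.py diff at the end). Checking everything up front closes the window where the old per-file checks could let a run write its first few outputs and then abort on a later one, leaving a partially processed quarter behind. A minimal sketch of the guard's behavior, with shared.QuantifyingException stubbed inline and a hypothetical path list:

import os
from argparse import Namespace


class QuantifyingException(Exception):
    def __init__(self, message, exit_code=None):
        self.exit_code = exit_code
        self.message = message
        super().__init__(self.message)


def check_for_data_files(args, file_paths, quarter):
    if args.force:
        return  # --force skips the guard so a quarter can be regenerated
    for path in file_paths:
        if os.path.exists(path):
            raise QuantifyingException(
                f"Processed data already exists for {quarter}", 0
            )


# Raises only when a listed file already exists and --force was not given
check_for_data_files(
    Namespace(force=True),
    ["data/2024Q4/2-process/gcs_product_totals.csv"],  # hypothetical path
    "2024Q4",
)
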
""" + global QUARTER LOGGER.info("Parsing command-line options") parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -48,24 +53,27 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") if args.quarter != QUARTER: - global PATHS + global FILE_PATHS, PATHS + FILE_PATHS = shared.paths_list_update( + LOGGER, FILE_PATHS, QUARTER, args.quarter + ) PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + QUARTER = args.quarter args.logger = LOGGER args.paths = PATHS return args -def check_for_data_file(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) - - def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -98,7 +106,6 @@ def process_totals_by_license(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_license.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -133,7 +140,6 @@ def process_totals_by_restriction(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_restriction.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -141,7 +147,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - + shared.check_for_data_files(args, FILE_PATHS, QUARTER) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") count_data = shared.open_data_file( LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] @@ -167,7 +173,7 @@ def main(): LOGGER.info(e.message) else: LOGGER.error(e.message) - sys.exit(e.code) + sys.exit(e.exit_code) except SystemExit as e: LOGGER.error(f"System exit with code: {e.code}") sys.exit(e.code) diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index 7712b26a..861447bc 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -28,12 +28,24 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join( + PATHS["data_phase"], "wikipedia_highest_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_least_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_language_representation.csv" + ), +] def parse_arguments(): """ Parse command-line options, returns parsed argument namespace. 
""" + global QUARTER LOGGER.info("Parsing command-line options") parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -52,24 +64,27 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") if args.quarter != QUARTER: - global PATHS + global FILE_PATHS, PATHS + FILE_PATHS = shared.paths_list_update( + LOGGER, FILE_PATHS, QUARTER, args.quarter + ) PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + QUARTER = args.quarter args.logger = LOGGER args.paths = PATHS return args -def check_for_data_file(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) - - def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -98,7 +113,6 @@ def process_highest_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_highest_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, top_10, file_path) @@ -122,7 +136,6 @@ def process_least_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_least_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, bottom_10, file_path) @@ -149,7 +162,6 @@ def process_language_representation(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_language_representation.csv" ) - check_for_data_file(file_path) data_to_csv(args, language_counts, file_path) @@ -157,6 +169,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) + shared.check_for_data_files(args, FILE_PATHS, QUARTER) file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" ) diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py index 359796a9..eb9130fe 100755 --- a/scripts/3-report/gcs_report.py +++ b/scripts/3-report/gcs_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -27,7 +28,8 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Google Custom Search (GCS)" +SECTION_FILE = Path(__file__).name +SECTION_TITLE = "Google Custom Search (GCS)" def parse_arguments(): @@ -83,7 +85,8 @@ def gcs_intro(args): total_count = f"{data['Count'].sum():,d}" shared.update_readme( args, - SECTION, + SECTION_FILE, + SECTION_TITLE, "Overview", None, None, @@ -137,7 +140,8 @@ def plot_products(args): shared.update_readme( args, - SECTION, + SECTION_FILE, + SECTION_TITLE, title, image_path, "Plots showing Creative Commons (CC) legal tool product totals and" @@ -180,7 +184,8 @@ def plot_tool_status(args): shared.update_readme( args, - SECTION, + SECTION_FILE, + SECTION_TITLE, title, image_path, "Plots showing Creative Commons (CC) legal tool status totals and" @@ -223,7 +228,8 @@ def plot_latest_tools(args): shared.update_readme( args, - SECTION, + SECTION_FILE, + SECTION_TITLE, title, image_path, "Plots showing latest Creative Commons (CC) legal tool totals and" @@ -265,7 +271,8 @@ def plot_prior_tools(args): shared.update_readme( args, - SECTION, + SECTION_FILE, + SECTION_TITLE, title, image_path, "Plots 
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
index 359796a9..eb9130fe 100755
--- a/scripts/3-report/gcs_report.py
+++ b/scripts/3-report/gcs_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path

 # Third-party
 from pygments import highlight
@@ -27,7 +28,8 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Google Custom Search (GCS)"
+SECTION_FILE = Path(__file__).name
+SECTION_TITLE = "Google Custom Search (GCS)"


 def parse_arguments():
@@ -83,7 +85,8 @@ def gcs_intro(args):
     total_count = f"{data['Count'].sum():,d}"
     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         "Overview",
         None,
         None,
@@ -137,7 +140,8 @@ def plot_products(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing Creative Commons (CC) legal tool product totals and"
@@ -180,7 +184,8 @@ def plot_tool_status(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing Creative Commons (CC) legal tool status totals and"
@@ -223,7 +228,8 @@ def plot_latest_tools(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing latest Creative Commons (CC) legal tool totals and"
@@ -265,7 +271,8 @@ def plot_prior_tools(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing prior Creative Commons (CC) legal tool totals and"
@@ -311,7 +318,8 @@ def plot_retired_tools(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing retired Creative Commons (CC) legal tools total and"
@@ -360,7 +368,8 @@ def plot_countries_highest_usage(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing countries with the highest useage of the latest"
@@ -413,7 +422,8 @@ def plot_languages_highest_usage(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing languages with the highest useage of the latest"
@@ -460,7 +470,8 @@ def plot_free_culture(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing Approved for Free Cultural Works legal tool usage.",
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
index 37979175..958dcc40 100755
--- a/scripts/3-report/github_report.py
+++ b/scripts/3-report/github_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path

 # Third-party
 from pygments import highlight
@@ -25,7 +26,8 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "GitHub data"
+SECTION_FILE = Path(__file__).name
+SECTION_TITLE = "GitHub"


 def parse_arguments():
@@ -55,6 +57,11 @@ def parse_arguments():
         help="Enable git actions such as fetch, merge, add, commit, and push"
         " (default: False)",
     )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if image files already exist",
+    )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
@@ -99,7 +106,8 @@ def github_intro(args):
     cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         "Overview",
         None,
         None,
@@ -110,7 +118,7 @@ def github_intro(args):
         f"** of the {total_repositories} total public repositories"
         " on GitHub that use a CC legal tool. Additionally,"
         " many more use a non-CC use a Public domain"
-        " equivalent legal tools.**\n"
+        " equivalent legal tools.\n"
         "\n"
         " The Github data showcases the different level of"
         " rights reserved on repositories We have Public"
@@ -120,7 +128,7 @@ def github_intro(args):
         " without restriction."
         " See more at"
         " [Public-domain-equivalent license]"
-        "(https://en.wikipedia.org/wiki/Public-domain-equivalent_license)"
+        "(https://en.wikipedia.org/wiki/Public-domain-equivalent_license).\n"
         " While a Permissive category of license contains works"
         " under MIT-0 and CC BY 4.0 allows users to"
         " reuse the code with some conditions and attribution"
@@ -129,7 +137,7 @@ def github_intro(args):
         " and Copyleft contains works under CC BY-SA 4.0."
         " which requires any derivative works to be licensed"
         " under the same terms."
-        " [Copyleft](https://en.wikipedia.org/wiki/Copyleft)"
+        " [Copyleft](https://en.wikipedia.org/wiki/Copyleft).\n"
         "\n"
         "Thank you GitHub for providing public API"
         " access to repository metadata!",
@@ -171,7 +179,8 @@ def plot_totals_by_license_type(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing totals by license type."
@@ -219,7 +228,8 @@ def plot_totals_by_restriction(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing totals by different levels of restrictions."
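
In the report scripts, the single SECTION constant splits into SECTION_FILE and SECTION_TITLE: README section markers are now keyed by the script's file name, which is stable and sortable, while the human-readable title is used only for the visible heading. What the two constants evaluate to, using illustrative values (Path(__file__).name depends on the running script, and the marker format is the one shown in the shared.py diff below):

from pathlib import Path

SECTION_FILE = Path("scripts/3-report/gcs_report.py").name  # "gcs_report.py"
SECTION_TITLE = "Google Custom Search (GCS)"

# Markers keyed by file name survive any rewording of the section title:
section_start_line = f"<!-- {SECTION_FILE} Start -->\n"
print(section_start_line)  # <!-- gcs_report.py Start -->
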
diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py
index 83a92fa3..9224bcb6 100755
--- a/scripts/3-report/wikipedia_report.py
+++ b/scripts/3-report/wikipedia_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path

 # Third-party
 from pygments import highlight
@@ -25,7 +26,8 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Wikipedia data"
+SECTION_FILE = Path(__file__).name
+SECTION_TITLE = "Wikipedia"


 def parse_arguments():
@@ -97,7 +99,8 @@ def wikipedia_intro(args):
     language_count = len(data)
     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         "Overview",
         None,
         None,
@@ -155,7 +158,8 @@ def plot_language_representation(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing the language representation across different language"
@@ -200,7 +204,8 @@ def plot_highest_language_usage(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing the most represented languages across the different"
@@ -243,7 +248,8 @@ def plot_least_language_usage(args):

     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         title,
         image_path,
         "Plots showing the least represented languages across the different"
diff --git a/scripts/3-report/notes.py b/scripts/3-report/zzz-notes.py
similarity index 96%
rename from scripts/3-report/notes.py
rename to scripts/3-report/zzz-notes.py
index ccefd058..1ddcc64d 100755
--- a/scripts/3-report/notes.py
+++ b/scripts/3-report/zzz-notes.py
@@ -8,6 +8,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path

 # Third-party
 from pygments import highlight
@@ -25,7 +26,8 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Notes"
+SECTION_FILE = Path(__file__).name
+SECTION_TITLE = "Notes"


 def parse_arguments():
@@ -72,7 +74,8 @@ def data_locations(args):
     """
     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         "Data locations",
         None,
         None,
@@ -100,7 +103,8 @@ def usage(args):
     """
     shared.update_readme(
         args,
-        SECTION,
+        SECTION_FILE,
+        SECTION_TITLE,
         "Usage",
         None,
         None,
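
The rename of notes.py to zzz-notes.py is what keeps the Notes section at the bottom of the README: as the shared.py diff below shows, section order is nothing more than a lexicographic sort of the file names in scripts/3-report/. A quick illustration with the report scripts touched by this patch:

report_files = [
    "github_report.py",
    "zzz-notes.py",
    "gcs_report.py",
    "wikipedia_report.py",
]
print(sorted(report_files))
# ['gcs_report.py', 'github_report.py', 'wikipedia_report.py', 'zzz-notes.py']
# As plain notes.py it would sort between github_report.py and
# wikipedia_report.py, dropping Notes into the middle of the README.
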
diff --git a/scripts/shared.py b/scripts/shared.py
index 509801d9..bf9d9010 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -36,6 +36,16 @@ def __init__(self, message, exit_code=None):
         super().__init__(self.message)


+def check_for_data_files(args, file_paths, QUARTER):
+    if args.force:
+        return
+    for path in file_paths:
+        if os.path.exists(path):
+            raise QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
+
+
 def get_session(accept_header=None, session=None):
     """
     Create or configure a reusable HTTPS session with retry logic and
@@ -197,6 +207,13 @@ def paths_update(logger, paths, old_quarter, new_quarter):
     return paths


+def paths_list_update(logger, paths_list, old_quarter, new_quarter):
+    logger.info(f"Updating paths: replacing {old_quarter} with {new_quarter}")
+    for index, path in enumerate(paths_list):
+        paths_list[index] = path.replace(old_quarter, new_quarter)
+    return paths_list
+
+
 class ColoredFormatter(logging.Formatter):
     """Adds colors to log messages."""

@@ -269,8 +286,16 @@ def setup(current_file):
     return logger, paths


+def section_order():
+    report_dir = os.path.join(os.path.dirname(__file__), "3-report")
+    report_files = os.listdir(report_dir)
+    report_files.sort()
+    return report_files
+
+
 def update_readme(
     args,
+    section_file,
     section_title,
     entry_title,
     image_path,
@@ -280,6 +305,12 @@ def update_readme(
     """
     Update the README.md file with the generated images and descriptions.
     """
+    logger = args.logger
+    paths = args.paths
+    ordered_sections = section_order()
+    logger.info(f"ordered_sections: {ordered_sections}")
+    logger.info(f"section_title: {section_title}")
+
     if not args.enable_save:
         return
     if image_path and not image_caption:
@@ -293,18 +324,15 @@ def update_readme(
             " caption is provided"
         )

-    logger = args.logger
-    paths = args.paths
-
     readme_path = path_join(paths["data"], args.quarter, "README.md")

     # Define section markers for each data source
-    section_start_line = f"<!-- {section_title} Start -->\n"
-    section_end_line = f"<!-- {section_title} End -->\n"
+    section_start_line = f"<!-- {section_file} Start -->\n"
+    section_end_line = f"<!-- {section_file} End -->\n"

     # Define entry markers for each plot (optional) and description
-    entry_start_line = f"<!-- {section_title} {entry_title} Start -->\n"
-    entry_end_line = f"<!-- {section_title} {entry_title} End -->\n"
+    entry_start_line = f"<!-- {section_file} {entry_title} Start -->\n"
+    entry_end_line = f"<!-- {section_file} {entry_title} End -->\n"

     if os.path.exists(readme_path):
         with open(readme_path, "r", encoding="utf-8") as f:
@@ -318,26 +346,39 @@ def update_readme(
         lines.insert(0, title_line)
         lines.insert(1, "\n")

-    # We only need to know the position of the end to append new entries
+    # Locate the data source section if it is already present
     if section_start_line in lines:
-        # Locate the data source section if it is already present
         section_end_index = lines.index(section_end_line)
     else:
-        # Add the data source section if it is absent
-        lines.extend(
-            [
-                f"{section_start_line}",
-                "\n",
-                "\n",
-                f"## {section_title}\n",
-                "\n",
-                "\n",
-                f"{section_end_line}",
-                "\n",
-            ]
-        )
-        section_end_index = lines.index(section_end_line)
+        insert_index = None
+        # If not present, we find the position to insert the section
+        current_position = ordered_sections.index(section_file)
+        # Sections that should come before this section
+        sections_before = ordered_sections[:current_position]
+        # We find the last existing section that comes before this section
+        for prev_section_title in reversed(sections_before):
+            prev_end_line = f"<!-- {prev_section_title} End -->\n"
+            if prev_end_line in lines:
+                insert_index = lines.index(prev_end_line) + 1
+                break
+        # If none exist, insert at the top (after README title)
+        if insert_index is None:
+            insert_index = 2 if len(lines) >= 2 else len(lines)
+        # Build the new data source section
+        new_section_line = [
+            f"{section_start_line}",
+            "\n",
+            "\n",
+            f"## {section_title}\n",
+            "\n",
+            "\n",
+            f"{section_end_line}",
+            "\n",
+        ]
+        # Insert the section at the correct position
+        lines = lines[:insert_index] + new_section_line + lines[insert_index:]
+        section_end_index = lines.index(section_end_line)
     # Locate the entry if it is already present
     if entry_start_line in lines:
        entry_start_index = lines.index(entry_start_line)
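
Finally, the shared.py changes above replace update_readme()'s append-at-the-end behavior: a missing section is now spliced into its sorted position by scanning backwards through the sections that should precede it and inserting after the nearest one already present in the README. A standalone sketch of that search, assuming the marker format shown above and a hypothetical find_insert_index() wrapper:

def find_insert_index(lines, ordered_sections, section_file):
    position = ordered_sections.index(section_file)
    # Walk backwards through the sections that should precede this one
    for prev in reversed(ordered_sections[:position]):
        prev_end_line = f"<!-- {prev} End -->\n"
        if prev_end_line in lines:
            return lines.index(prev_end_line) + 1
    # No predecessor exists yet: fall back to just after the README title
    return 2 if len(lines) >= 2 else len(lines)


# Illustrative README contents: a title line, then one existing section
readme = [
    "# 2024Q4\n",
    "\n",
    "<!-- gcs_report.py Start -->\n",
    "...\n",
    "<!-- gcs_report.py End -->\n",
    "\n",
]
order = ["gcs_report.py", "github_report.py", "wikipedia_report.py", "zzz-notes.py"]
print(find_insert_index(readme, order, "wikipedia_report.py"))  # 5

One caveat a reviewer might raise: section_order() returns every entry of os.listdir(), so a stray file or a __pycache__ directory in scripts/3-report/ would silently join the ordering.
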