# Source code for labzen.labzen

import os
from pathlib import Path
import pandas as pd
import numpy as np
from nbformat import read, NO_CONVERT
import re
import glob
from github import Github
import git
import warnings

import webbrowser


def _find_assignment(directory=None):
    """Find an Assignment Dynamically

    A helper function to locate a lab file based on its extension being
    Rmd or ipynb. The given directory and all of its sub-directories are
    searched recursively. If exactly one candidate is found it is returned
    directly; if multiple candidate files are found, the user is prompted
    to select one of them by number.

    Args:
        directory (str, optional): A directory path to be searched
            recursively. If None given, will use the working directory.

    Returns:
        str: A path to the file selected.

    Raises:
        FileNotFoundError: If no .ipynb or .Rmd file is found.
        ValueError: If the answer typed at the prompt is not an integer.
        IndexError: If the number typed at the prompt is not in the list.
    """
    if directory is None:
        directory = os.getcwd()

    root = str(Path(directory).expanduser())
    files = []
    for pattern in ("*.ipynb", "*.Rmd"):
        # glob is used on purpose: by default it does not descend into
        # hidden directories such as ``.ipynb_checkpoints``.
        files += glob.glob(f"{root}/**/{pattern}", recursive=True)

    if not files:
        raise FileNotFoundError(f"No .ipynb or .Rmd files found in {root}")

    # Nothing to choose from, so do not bother the user with a prompt.
    if len(files) == 1:
        return files[0]

    print("The existing files are:")
    for number, file in enumerate(files, start=1):
        print(f"{number}.{os.path.basename(file)}")

    choice = int(input("Enter your file number from the above list:"))
    # Reject 0 and negatives explicitly; plain indexing would silently
    # wrap around and pick a file from the end of the list.
    if not 1 <= choice <= len(files):
        raise IndexError(
            f"File number must be between 1 and {len(files)}, got {choice}"
        )
    return files[choice - 1]


def create_github_token(host="https://github.ubc.ca"):
    """Open A Browser to Generate a New Github Enterprise Token

    Args:
        host (str):
            The URL to the upstream host. Defaults to UBC Github Enterprise.
            A trailing slash is tolerated.
    Returns:
        None

    Examples:
        >>> from labzen import labzen as lz
        >>> # Open a web browser
        >>> lz.create_github_token()
    """
    opts = "scopes=repo,user,gist,workflow&description=LABZEN"
    # Drop any trailing slash so the URL does not contain "//settings".
    base = host.rstrip("/")
    url = f"{base}/settings/tokens/new?{opts}"
    # new=2 asks the browser for a new tab where possible.
    webbrowser.open(url, new=2)


def parse_lab(notebook=None):
    """Parse MDS lab files to return the markdown content

    Args:
        notebook (str):
            A path to an MDS lab file (either .ipynb or .Rmd). A leading
            ``~`` is expanded. If left blank, the function will recursively
            search for labs in the working directory based on the file
            extension.

    Returns:
        list: Each element of list is a content of one markdown cell
            (.ipynb) or of one text chunk between code chunks (.Rmd).

    Raises:
        TypeError: If ``notebook`` is not a string.
        ValueError: If the file extension is neither .Rmd nor .ipynb.

    Example:
        >>> # Download the demo files into the working directory
        >>> import urllib.request
        >>> from labzen import labzen as lz
        >>>
        >>> baseurl = (
        >>>     "https://raw.githubusercontent.com"
        >>>     + "/UBC-MDS/labzen/main/data-raw"
        >>> )
        >>> labs = {
        >>>     "dummylab.Rmd": f"{baseurl}/dummylab.Rmd",
        >>>     "dummylab.ipynb": f"{baseurl}/dummylab.ipynb",
        >>> }
        >>>
        >>> for name, url in labs.items():
        >>>     urllib.request.urlretrieve(url, name)
        >>>
        >>> # parse the labs
        >>> lz.parse_lab("dummylab.ipynb")
        >>> lz.parse_lab("dummylab.Rmd")
        >>>
        >>> # Alternatively, navigate to a student assignment repo and
        >>> # run the following code.
        >>> lz.parse_lab()
    """
    # If the user did not define the specific file, recursively
    # search for rmd and ipynb files in the working directory
    if notebook is None:
        notebook = _find_assignment()

    # defensive tests -- validate the type before the value is used
    if not isinstance(notebook, str):
        raise TypeError("The file path should be string")

    _, extension = os.path.splitext(notebook)
    if extension not in (".Rmd", ".ipynb"):
        raise ValueError(
            "Sorry, you have not provided Rmarkdown or jupyter notebook file"
        )

    # Expand "~" once so both file types are located the same way
    path = Path(notebook).expanduser()

    if extension == ".Rmd":
        # Splitting on the code fence alternates text and code chunks;
        # chunks opening with an engine tag are code and are discarded.
        chunks = path.read_text(encoding="utf8").split("```")
        return [
            chunk
            for chunk in chunks
            if not chunk.startswith(("{r", "{python"))
        ]

    with open(path, encoding="utf8") as file:
        cells = read(file, NO_CONVERT)["cells"]
    return [cell["source"] for cell in cells if cell["cell_type"] == "markdown"]


def count_points(file_name: str = None, margins: bool = True):
    """Tally Available Points in Lab

    Args:
        file_name (str): A path to an MDS lab file (either .ipynb or
            .Rmd). If left blank, the function will recursively search for
            labs in the working directory based on the file extension.
        margins (bool): A boolean indicating whether to add a row for the
            total number of points (optional + required). Defaults to True.

    Returns:
        (pandas.core.frame.DataFrame, pandas.core.frame.DataFrame):
            A tuple of DataFrames. The first is a section-by-section overview
            of points available. The second is a cross table summarising the
            number of optional, required, and total points per lab. In both,
            ``prop`` scales points so that all non-optional points together
            are worth 0.95.

    Raises:
        ValueError: If no rubric tag is found, or if the lab has no
            non-optional rubric to scale ``prop`` against.
        Exception: If a rubric tag is not directly below a markdown header.

    Example:
        >>> from labzen import labzen as lz
        >>> import urllib.request
        >>>
        >>> # Download the demo files into the working directory
        >>> baseurl = (
        >>>     "https://raw.githubusercontent.com"
        >>>     + "/UBC-MDS/labzen/main/data-raw"
        >>> )
        >>> labs = {
        >>>     "dummylab.Rmd": f"{baseurl}/dummylab.Rmd",
        >>>     "dummylab.ipynb": f"{baseurl}/dummylab.ipynb",
        >>> }
        >>> for name, url in labs.items():
        >>>     urllib.request.urlretrieve(url, name)
        >>>
        >>> # for Jupyter notebooks:
        >>> df, tab = lz.count_points("dummylab.ipynb")
        >>> print(df[["rubric", "points", "type"]])
                        rubric  points          type
        0            [mechanics]     [5]  Non-Optional
        1            [reasoning]     [4]  Non-Optional
        2  [accuracy, reasoning]  [3, 2]  Non-Optional
        3  [accuracy, reasoning]  [6, 4]      Optional
        4  [accuracy, reasoning]  [7, 3]      Optional
        5                  [viz]     [5]  Non-Optional
        >>> print(tab)
                   type  total  prop
        0  Non-Optional     19  0.95
        1      Optional     20  1.00
        2           All     39  1.95
        >>>
        >>> # for Rmarkdown:
        >>> df, tab = lz.count_points("dummylab.Rmd")
        >>> print(tab)
                type  total      prop
        0  Non-Optional     42  0.950000
        1      Optional     12  0.271429
        2           All     54  1.221429
        >>>
        >>> # Alternatively, navigate to a student assignment repo and run the
        >>> # following code.
        >>> df, tab = lz.count_points()
    """
    # Parse a lab file into its markdown blocks
    res = parse_lab(file_name)
    df = pd.DataFrame({"block": np.arange(1, len(res) + 1), "txt": res})

    # Tidy breaks, new lines, extra spaces, and make each line a row
    df["txt"] = df["txt"].str.replace("<br>", "\n", regex=False)
    df["txt"] = df["txt"].str.split("\n")
    df = df.explode("txt")
    # Strip BEFORE dropping blanks: a whitespace-only line must not survive
    # and end up as the "header" of the rubric line that follows it.
    df["txt"] = df["txt"].str.strip()
    df = df[df["txt"].notna() & ~df["txt"].isin(["", "<hr>"])].copy()

    # Add variable transformations; "header" is simply the previous line
    df["header"] = df["txt"].shift(1)
    df["rubric"] = df["txt"].str.contains(r"^rubric\=\{")
    df["below_header"] = df["header"].str.contains(r"^[#]{1,6}\s")
    df["optional"] = df["header"].str.contains("optional|bonus", case=False)

    # Subset to lines containing rubrics only. The index is reset here
    # because exploded rows still share the index label of their block.
    df = df.dropna()
    df = df[df["rubric"].astype(bool)].reset_index(drop=True)

    if df.empty:
        raise ValueError("No rubric tags were found in this lab.")

    # defensive check
    if not all(df["below_header"]):
        raise Exception(
            "There is a problem parsing this lab. Expecting a rubric tag to "
            + "below a markdown header."
        )

    # Extract and sum points row by row, so that several rubric lines in
    # the same markdown block are never merged into one another.
    df["points"] = (
        df["txt"]
        .str.findall(r"\d+")
        .apply(lambda found: [int(point) for point in found])
    )
    df["total"] = df["points"].apply(sum)

    # Tidy and make the result more human-readable
    df["type"] = np.where(
        df["optional"].astype(bool), "Optional", "Non-Optional"
    )
    df = df.drop(columns=["rubric", "below_header", "optional"])
    df["header"] = df["header"].str.replace(r"(^[#]+\s+)", "", regex=True)

    # Generate crosstab
    tab = df.pivot_table(
        values="total", index="type", aggfunc="sum", margins=margins
    )
    tab = tab.reset_index()

    # Select by position: the row label of "Non-Optional" is not guaranteed
    required = tab.loc[tab["type"] == "Non-Optional", "total"]
    if required.empty:
        raise ValueError(
            "This lab has no non-optional rubric to scale points against."
        )
    one_pt_worth = 0.95 / required.iloc[0]
    tab["prop"] = tab["total"] * one_pt_worth

    # add percent to full table
    df["prop"] = df["total"] * one_pt_worth

    # simplify rubric names
    df["rubric"] = df["txt"].str.findall(r"([a-z]+)(?=\:\d)")

    # re-order columns
    df = df[["block", "header", "rubric", "points", "total", "prop", "type"]]

    return df, tab


def _check_repo_link(file_name: str = None):
    """Check whether the lab file contains a link to its github repo

    The markdown content of the lab is searched for a UBC Github Enterprise
    URL of the form ``github.ubc.ca/MDS-YYYY-YY/DSCI_NNN_labN_username``.

    Args:
        file_name (str):
            A path to an MDS lab file (either .ipynb or .Rmd). If left
            blank, the function will recursively search for labs in the
            working directory based on the file extension.

    Returns:
        bool: True if at least one markdown block contains a repo link.
            The result is also printed as "Check 3".

    Example:
        >>> # Navigate to the root of labzen repo and run the following code
        >>> # using the dummy files:
        >>>
        >>> # for jupyter notebook:
        >>> _check_repo_link("data-raw/dummylab.ipynb")
        >>>
        >>> # for Rmarkdown:
        >>> _check_repo_link("data-raw/dummylab.Rmd")
        >>>
        >>> # Alternatively, navigate to a student assignment repo and run the
        >>> # following code.
        >>> _check_repo_link()
    """

    # Parse a lab file into its markdown blocks
    res = parse_lab(file_name)

    # Non-capturing groups only: nothing is extracted, and no warning about
    # match groups needs to be silenced globally.
    rex = re.compile(
        r"(?:https://)?(?:www\.)?github\.ubc\.ca"  # base url for GH Enterprise
        r"/MDS-\d{4}-\d{2}"  # organization
        r"/DSCI_\d{3}_lab\d_[a-z]+"  # lab repo with CWL username
    )

    # finding out if there is any link
    repo_link = any(
        isinstance(block, str) and rex.search(block) is not None
        for block in res
    )

    # displaying the result in boolean
    if repo_link:
        print("Check 3: Repository link is included in the file")
        print(f"Check 3: {repo_link}")
    else:
        print("Check 3: Repository link is not included in the file")
        print("Check 3:", False)

    return repo_link


def _check_lat_version(path: str, token: str):
    """Check whether the latest local commit has been pushed to the remote

    The SHA of the first commit returned for the Github Enterprise
    repository is compared with the SHA of the local HEAD commit.

    Args:
        path (str): A local file path to either a lab directory or to a
            lab file inside a local git directory.
        token (str): A token for https://github.ubc.ca

    Returns:
        bool: True if both SHAs are identical. The result is also printed
            as "Check 2".

    Example:
        >>> from labzen import labzen as lz
        >>>
        >>> path = "/Users/jene/MDS/Block5/lab/DSCI_599_lab1_jene3456"
        >>> token = "544c96ce0d3dc9b66ac8d70b32c07bd0c46129db"
        >>> lz._check_lat_version(path, token)
    """
    # locate the repo root
    local_repo = git.Repo(path, search_parent_directories=True)

    # locate the Github Enterprise repo name
    ghe = Github(token, base_url="https://github.ubc.ca/api/v3")
    ghe_repo = ghe.get_repo(__find_ghe_repo(local_repo))

    # find latest commit on GHE
    ghe_commit = ghe_repo.get_commits()[0].sha

    # find the latest local commit
    local_commit = str(local_repo.head.commit)

    # comparing the both SHAs
    is_latest = ghe_commit == local_commit
    if is_latest:
        print("Check 2: Remote has the latest version of the repository")
    else:
        # One string, not two print arguments: print() would insert an
        # extra space between them.
        print(
            "Check 2: Remote does not have the latest version of the "
            "repository"
        )
    print("Check 2:", is_latest)
    return is_latest


def __find_ghe_repo(local_repo, org="MDS-2020-21"):
    """Find a Github Repo Path

    The repository name is taken from the last path component of the first
    remote URL of ``local_repo``; a trailing slash and a ``.git`` suffix are
    ignored, so both ``.../repo.git`` and ``.../repo`` are accepted.

    Args:
        local_repo ([git.repo.base.Repo]): A local Github repository
        org (str): The name of the organization on Github Enterprise to search.
            Defaults to 'MDS-2020-21'.

    Returns:
        str: The repository path in the form ``"<org>/<repo name>"``.

    Raises:
        ValueError: If the repository has no remote URL, or no repository
            name can be derived from it.
    """
    # first URL of every remote, in the order the remotes are listed
    remote_urls = [
        url for remote in local_repo.remotes for url in list(remote.urls)[:1]
    ]
    if not remote_urls:
        raise ValueError("The local repository has no remote URL configured")

    # Covers https ("host/org/repo") and scp-like ("git@host:org/repo") URLs
    url = remote_urls[0]
    ghe_name = re.split(r"[/:]", url.rstrip("/"))[-1]
    if ghe_name.endswith(".git"):
        ghe_name = ghe_name[: -len(".git")]
    if not ghe_name:
        raise ValueError(f"Cannot find a repository name in remote URL {url}")

    return f"{org}/{ghe_name}"


def _check_commits(path: str, token: str, verbose=False):
    """Check whether the user has at least three commits

    Commits of the Github Enterprise repository are counted when the name
    of the token's user equals the name of the commit author or committer.

    Args:
        path (str): A local repo path or local file path to a lab.git
        token (str): A token for Github Enterprise.
        verbose (bool): Whether to print commit details to screen

    Returns:
        bool: True if at least three commits carry the student's name.
            The result is also printed as "Check 1".

    Raises:
        Exception: If the repository cannot be retrieved from
            github.ubc.ca.

    Example:
        >>> from labzen import labzen as lz
        >>>
        >>> path = "/Users/jene/MDS/Block5/lab/DSCI_599_lab1_jene3456"
        >>> token = "544c96ce0d3dc9b66ac8d70b32c07bd0c46129db"
        >>> lz._check_commits(path, token)
    """
    min_commits = 3

    # locate the repo root
    local_repo = git.Repo(path, search_parent_directories=True)

    # locate the Github Enterprise repo name
    ghe = Github(token, base_url="https://github.ubc.ca/api/v3")

    # org/repo_name -- resolved outside the try block so that the error
    # message below can always refer to it
    ghe_name = __find_ghe_repo(local_repo)
    try:
        ghe_repo = ghe.get_repo(ghe_name)
    except Exception as err:
        raise Exception(f"{ghe_name} not found on github.ubc.ca") from err

    # count the total number of commits on the remote
    ghe_commits = ghe_repo.get_commits()
    ghe_commit_n = ghe_commits.totalCount

    # Both values are needed by the report below on every path, so they
    # are defined before the commit history is inspected.
    student_name = ghe.get_user().name
    student_commits_n = 0

    # Walking the history is pointless when it is too short to pass.
    if ghe_commit_n >= min_commits:
        for commit in ghe_commits:
            valid_names = {
                getattr(commit.author, "name", None),
                getattr(commit.committer, "name", None),
            }
            # A missing name must never count as a match.
            valid_names.discard(None)

            if student_name not in valid_names:
                continue
            student_commits_n += 1

            if verbose:
                print(
                    commit,
                    commit.commit.committer,
                    commit.commit.author.email,
                    commit.commit.author.date,
                    getattr(commit.author, "name", None),
                )

    check_result = student_commits_n >= min_commits

    # print check result to screen
    if check_result:
        print(
            "Check 1: Repository has at least 3 commits with the "
            f"student username {student_name}"
        )
    elif ghe_commit_n < min_commits:
        print(
            f"Check 1: Repo {ghe_repo.name} has fewer than 3 commits "
            f"with the student username {student_name}"
        )
    else:
        print(
            "Check 1: Repository does not have 3 commits with the "
            f"student username {student_name}"
        )
    print("Check 1:", check_result)

    return check_result


def check_mechanics(path: str = None, token=None):
    """Performs Mechanics Checks on a MDS Lab

       This function checks that you...
        1. ... have at least three commit messages authored by you in
            your history;
        2. ... have pushed your latest commit; and
        3. ... have a Github repo link in your lab file.

       All three checks are always run, even if an earlier one fails.

    Args:
        path (str): A local path to a Github directory or an MDS lab file
            (.ipynb or .Rmd) within such a directory. Defaults to the
            current working directory.

        token (str) : A personal access token for https://github.ubc.ca. See
            ``create_github_token()`` for details.

    Returns:
        bool : A boolean whether all checks passed. The function also prints
            informative messages as a side effect.

    Example:
        >>> from labzen import labzen as lz
        >>>
        >>> # Step 1: get a token
        >>> lz.create_github_token()
        >>>
        >>> # Step 2: check mechanics
        >>> file = "~/MDS/Block5/lab1/DSCI_599_lab1_jene3456"
        >>> token = "544c96ce0d3dc9b66ac8d70b32c07bd0c46129db"
        >>> lz.check_mechanics(file, token)
        Check 1: Repository has at least 3 commits with the student
        username JENE SMITH
        Check 1: True
        Check 2: Remote has the latest version of the repository
        Check 2: True
        Check 3: Repository link is included in the file
        Check 3: True
        >>>
        >>> # Alternatively, just run the following from an MDS lab directory:
        >>> lz.check_mechanics(token = token)

    """
    # use the current working directory if no path given
    if path is None:
        path = os.getcwd()
    # expand "~" once so every step below sees the same concrete path
    path = os.path.expanduser(path)

    # locate the repo root
    repo_path = git.Repo(path, search_parent_directories=True).git_dir

    # locate the lab file
    _, extension = os.path.splitext(path)
    if extension in (".Rmd", ".ipynb"):
        lab_path = path
    else:
        lab_path = _find_assignment(path)

    # A list (not a chain of ``and``) so that every check runs and prints
    # its own report regardless of the outcome of the previous ones.
    result = [
        _check_commits(repo_path, token=token),
        _check_lat_version(repo_path, token=token),
        _check_repo_link(lab_path),
    ]

    return all(result)