#!/usr/bin/env python
# coding: utf-8

# # debugging O
#
# ref: https://github.com/conda-forge/openmpi-feedstock/pull/142
#
# Given two conda-forge build logs, compare them after normalization of
# random elements like timestamps, paths.
#
# Also extract conda installs, as the most likely source

# First, download log files (this won't work after log retention expires)

# In[1]:


from pathlib import Path

import requests

# Azure DevOps build-log URLs for the "bad" (broken) and "good" (working) builds.
urls = {
    "bad": "https://dev.azure.com/conda-forge/84710dde-1620-425b-80d0-4cf5baca359d/_apis/build/builds/866388/logs/79",
    "good": "https://dev.azure.com/conda-forge/84710dde-1620-425b-80d0-4cf5baca359d/_apis/build/builds/866253/logs/39",
}
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)
log_paths = []
for name, url in urls.items():
    p = logs_dir / f"{name}.txt"
    log_paths.append(p)
    # Download once; re-running the notebook reuses the cached file.
    if not p.exists():
        print(f"Downloading {url} to {p}")
        r = requests.get(url)
        r.raise_for_status()
        with p.open("w") as f:
            f.write(r.text)


# Normalize path prefixes, strip timestamps, discard docker pull progress

# In[2]:


# normalize path prefixes, strip timestamps
import re

# conda-build working-directory prefix, randomized per build.
prefix_pattern = re.compile(r"/home/conda/feedstock_root/build_artifacts/([^/]+)/")
# conda-build's path-length padding ("_placehold_placehold_...").
placehold_pattern = re.compile(r"_placehold[placehold_]+")
# docker pull layer-progress lines ("<12-hex-digit layer id>: ...").
docker_pattern = re.compile(r"^[a-f0-9]{12}: ")


def process_line(line):
    """Normalize one raw log line for diffing.

    Strips the leading Azure timestamp (everything up to the first space),
    drops docker pull progress lines entirely, and replaces the randomized
    build-directory prefix and placeholder padding with stable text.

    Returns "" for lines that should be discarded.
    """
    if not line:
        return ""
    # Drop the leading timestamp; partition (unlike split) tolerates lines
    # that contain no space at all instead of raising ValueError.
    _ts, _, line = line.partition(" ")
    if docker_pattern.match(line):
        return ""
    line, _ = prefix_pattern.subn("$BUILD_DIR/", line)
    line, _ = placehold_pattern.subn("", line)
    return line


for log_path in log_paths:
    with log_path.open() as f_in, log_path.with_suffix(".strip.txt").open("w") as f_out:
        for line in f_in:
            line = process_line(line)
            f_out.write(line)


# Extract each conda-install plan into a separate file

# In[3]:


import shutil
from enum import Enum


class State(Enum):
    """Parser state for env_chunks: inside or outside an install chunk."""

    skipping = 1
    consuming = 2


# Maps a substring that starts a chunk to the compiled regex that ends it.
chunk_bounds = {
    "Package Plan": re.compile("Preparing transaction"),
    "conda list": re.compile(r"^\+"),
}


def env_chunks(path):
    """Yield strings, each representing a single conda install"""
    chunk_lines = []
    state = State.skipping
    current_chunk_end = None
    with open(path) as f:
        for line in f:
            if not line:
                continue
            line = process_line(line)
            # A start marker (re)opens a chunk and records its end pattern.
            for chunk_start, chunk_end in chunk_bounds.items():
                if chunk_start in line:
                    chunk_lines = []
                    state = State.consuming
                    current_chunk_end = chunk_end
                    break
            # End marker closes the current chunk (the marker line itself
            # is excluded from the yielded text).
            if chunk_lines and current_chunk_end.search(line):
                yield "".join(chunk_lines)
                chunk_lines = []
                state = State.skipping
            if state == State.consuming:
                chunk_lines.append(line)


for path in log_paths:
    env_dir = path.with_suffix(".envs")
    # ignore_errors: on the first run the directory does not exist yet,
    # and plain rmtree would raise FileNotFoundError.
    shutil.rmtree(env_dir, ignore_errors=True)
    env_dir.mkdir(exist_ok=True)
    for i, chunk in enumerate(env_chunks(path)):
        env_path = env_dir / f"env.{i:02}.txt"
        with env_path.open("w") as f:
            f.write(chunk)

get_ipython().system('cat logs/good.envs/env.01.txt')


# First, show the diffs of environment installations:

# In[4]:


get_ipython().system('diff --color=always -U 1 -r logs/bad.envs logs/good.envs')


# Next, compare the full output to see if there's any useful info in there.
#
# There are lots of small diffs, e.g. line ordering due to parallel builds.

# In[5]:


get_ipython().system('difft --context=0 --color=always --display=inline --background=light logs/bad.strip.txt logs/good.strip.txt | head -n 950')