commit 94bbdca745915c32f5d87aa217da74d49b2695fc
Author: Alexandre Gut <alexandre.gut@icloud.com>
Date:   Tue Mar 31 13:28:42 2026 +0200

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cd59182
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,50 @@
+# ── Python ──────────────────────────────────────────────────────────────────
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.pyc
+
+# ── Environnements virtuels ──────────────────────────────────────────────────
+venv/
+.venv/
+env/
+ENV/
+Pipfile.lock
+
+# ── Secrets ───────────────────────────────────────────────────────────────────
+.env
+.env.*
+secret.env
+secrets.py
+*.secret
+credentials.json
+token.json
+
+# ── Logs ─────────────────────────────────────────────────────────────────────
+*.log
+logs/
+
+# ── IDE ───────────────────────────────────────────────────────────────────────
+.vscode/
+.idea/
+*.swp
+.DS_Store
+Thumbs.db
+
+# ── Tests / Coverage ─────────────────────────────────────────────────────────
+.coverage
+htmlcov/
+.pytest_cache/
+
+# ── Build ─────────────────────────────────────────────────────────────────────
+dist/
+build/
+*.egg-info/
+
+# ── Modèles IA (trop lourds pour git) ───────────────────────────────────────
+*.gguf
+*.bin
+*.safetensors
+models/
+weights/
diff --git a/Test.py b/Test.py
new file mode 100644
index 0000000..486476b
--- /dev/null
+++ b/Test.py
@@ -0,0 +1,36 @@
+import cv2
+import numpy as np
+
+# Create a VideoCapture object to access the webcam
+cap = cv2.VideoCapture(0)  # Index 0 selects the first webcam; change it if several are attached.
+
+# Load a pre-trained arm-detection model (Haar Cascade)
+arm_cascade = cv2.CascadeClassifier('haarcascade_arm.xml')  # This XML file must be in the working directory. NOTE(review): the load result is never checked (e.g. arm_cascade.empty()) - confirm the file exists.
+
+while True:
+    # Grab one frame from the webcam
+    ret, frame = cap.read()
+    
+    if not ret:
+        break
+
+    # Convert the frame to grayscale (detection is often faster on a single channel)
+    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+
+    # Detect arms in the frame
+    arms = arm_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
+
+    # Draw a green rectangle around every detection
+    for (x, y, w, h) in arms:
+        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+    # Show the frame with the rectangles drawn on it
+    cv2.imshow('Arm Detection', frame)
+
+    # Leave the loop when the user presses the 'q' key
+    if cv2.waitKey(1) & 0xFF == ord('q'):
+        break
+
+# Release the webcam and close every OpenCV window
+cap.release()
+cv2.destroyAllWindows()
diff --git a/darknet-master/.circleci/config.yml b/darknet-master/.circleci/config.yml
new file mode 100644
index 0000000..d9e2e5d
--- /dev/null
+++ b/darknet-master/.circleci/config.yml
@@ -0,0 +1,30 @@
+version: 2.0
+jobs:
+  build:
+    docker:
+       - image: datamachines/cudnn_tensorflow_opencv:11.2.0_2.4.1_4.5.1-20210211
+#      - image: alexeyab84/dockerfiles:latest
+#      - image: alantrrs/cuda-opencv:latest
+#      - image: nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+    working_directory: ~/work
+    steps:
+      - checkout
+      - run: nvcc --version
+      - run: gcc --version
+      - run: echo 'export PATH=$PATH:/usr/local/include/opencv4/' >> $BASH_ENV  # each run step is a new shell; BASH_ENV persists the export for later steps
+      - run: echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/lib64/' >> $BASH_ENV  # same: a bare export would be lost when this step ends
+      - run: make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 DEBUG=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 AVX=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=0 CUDNN=0 OPENCV=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=1 CUDNN=0 OPENCV=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 -j 8
+      - run: make clean
+      - run: make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 USE_CPP=1 -j 8
\ No newline at end of file
diff --git a/darknet-master/.github/FUNDING.yml b/darknet-master/.github/FUNDING.yml
new file mode 100644
index 0000000..0c5ae2e
--- /dev/null
+++ b/darknet-master/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: ['https://paypal.me/alexeyab84', 'https://blockchain.coinmarketcap.com/address/bitcoin/36La9T7DoLVMrUQzm6rBDGsxutyvDzbHnp', 'https://etherscan.io/address/0x193d56BE3C65e3Fb8f48c291B17C0702e211A588#', 'https://explorer.zcha.in/accounts/t1PzwJ28Prb7Nk8fgfT3RXCr6Xtw54tgjoy'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/darknet-master/.github/ISSUE_TEMPLATE/any-other-question-or-issue.md b/darknet-master/.github/ISSUE_TEMPLATE/any-other-question-or-issue.md
new file mode 100644
index 0000000..0904576
--- /dev/null
+++ b/darknet-master/.github/ISSUE_TEMPLATE/any-other-question-or-issue.md
@@ -0,0 +1,25 @@
+---
+name: Any other question or issue
+about: Any other question or issue
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+If something doesn’t work for you, please attach 2 screenshots:
+1. a screenshot of your issue
+2. a screenshot with the following information
+```
+./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights data/dog.jpg
+ CUDA-version: 10000 (10000), cuDNN: 7.4.2, CUDNN_HALF=1, GPU count: 1
+ CUDNN_HALF=1
+ OpenCV version: 4.2.0
+ 0 : compute_capability = 750, cudnn_half = 1, GPU: GeForce RTX 2070
+net.optimized_memory = 0
+mini_batch = 1, batch = 8, time_steps = 1, train = 0
+   layer   filters  size/strd(dil)      input                output
+   0 conv     32       3 x 3/ 1    608 x 608 x   3 ->  608 x 608 x  32 0.639 BF
+```
+
+If you do not get an answer for a long time, try to find the answer among Issues with a Solved label: https://github.com/AlexeyAB/darknet/issues?q=is%3Aopen+is%3Aissue+label%3ASolved
diff --git a/darknet-master/.github/ISSUE_TEMPLATE/bug-report.md b/darknet-master/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 0000000..3204f84
--- /dev/null
+++ b/darknet-master/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,27 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+If you want to report a bug - provide:
+    * description of a bug
+    * what command do you use?
+    * do you use Win/Linux/Mac?
+    * attach a screenshot of the bug, including the preceding messages in the terminal
+    * in which cases does the bug occur, and in which does it not?
+    * if possible, specify the date/commit of Darknet that works without this bug
+    * show a screenshot with the following info
+```
+./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights data/dog.jpg
+ CUDA-version: 10000 (10000), cuDNN: 7.4.2, CUDNN_HALF=1, GPU count: 1
+ CUDNN_HALF=1
+ OpenCV version: 4.2.0
+ 0 : compute_capability = 750, cudnn_half = 1, GPU: GeForce RTX 2070
+net.optimized_memory = 0
+mini_batch = 1, batch = 8, time_steps = 1, train = 0
+   layer   filters  size/strd(dil)      input                output
+```
diff --git a/darknet-master/.github/ISSUE_TEMPLATE/feature_request.md b/darknet-master/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..fd23977
--- /dev/null
+++ b/darknet-master/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,13 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: Feature-request
+assignees: ''
+
+---
+
+For Feature-request:
+    * describe your feature in as much detail as possible
+    * provide a link to the paper and/or source code if it exists
+    * attach a chart/table with a comparison that shows the improvement
diff --git a/darknet-master/.github/ISSUE_TEMPLATE/training-issue---no-detections---nan-avg-loss---low-accuracy.md b/darknet-master/.github/ISSUE_TEMPLATE/training-issue---no-detections---nan-avg-loss---low-accuracy.md
new file mode 100644
index 0000000..b826cdd
--- /dev/null
+++ b/darknet-master/.github/ISSUE_TEMPLATE/training-issue---no-detections---nan-avg-loss---low-accuracy.md
@@ -0,0 +1,30 @@
+---
+name: Training issue - no-detections / Nan avg-loss / low accuracy
+about: Training issue - no-detections / Nan avg-loss / low accuracy
+title: ''
+labels: Training issue
+assignees: ''
+
+---
+
+If you have an issue with training - no-detections / Nan avg-loss / low accuracy:
+    * read FAQ: https://github.com/AlexeyAB/darknet/wiki/FAQ---frequently-asked-questions
+    * what command do you use?
+    * what dataset do you use?   
+    * what Loss and mAP did you get?
+    * show chart.png with Loss and mAP    
+    * check your dataset - run training with flag `-show_imgs` i.e. `./darknet detector train ... -show_imgs` and look at the `aug_...jpg` images, do you see correct ground-truth bounding boxes?
+    * rename your cfg-file to txt-file and drag-n-drop (attach) to your message here
+    * show content of generated files `bad.list` and `bad_label.list` if they exist
+    * Read `How to train (to detect your custom objects)` and `How to improve object detection` in the Readme: https://github.com/AlexeyAB/darknet/blob/master/README.md
+    * show a screenshot with the following info
+```
+./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights data/dog.jpg
+ CUDA-version: 10000 (10000), cuDNN: 7.4.2, CUDNN_HALF=1, GPU count: 1
+ CUDNN_HALF=1
+ OpenCV version: 4.2.0
+ 0 : compute_capability = 750, cudnn_half = 1, GPU: GeForce RTX 2070
+net.optimized_memory = 0
+mini_batch = 1, batch = 8, time_steps = 1, train = 0
+   layer   filters  size/strd(dil)      input                output
+```
diff --git a/darknet-master/.github/workflows/ccpp.yml b/darknet-master/.github/workflows/ccpp.yml
new file mode 100644
index 0000000..ad386f4
--- /dev/null
+++ b/darknet-master/.github/workflows/ccpp.yml
@@ -0,0 +1,727 @@
+name: Darknet Continuous Integration
+
+on:
+  push:
+  workflow_dispatch:
+    inputs:
+      debug_enabled:
+        type: boolean
+        description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
+        required: false
+        default: false
+  schedule:
+    - cron: '0 0 * * *'
+
+env:
+  VCPKG_BINARY_SOURCES: 'clear;nuget,vcpkgbinarycache,readwrite'
+  VCPKG_FORCE_DOWNLOADED_BINARIES: "TRUE"
+
+jobs:
+  ubuntu-makefile:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 DEBUG=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 DEBUG=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 AVX=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 AVX=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1'
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1'
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 USE_CPP=1'
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 USE_CPP=1 -j 8
+        make clean
+
+
+  ubuntu-vcpkg-opencv4-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Setup tmate session
+      uses: mxschmitt/action-tmate@v3
+      if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}  # 'inputs' keeps the boolean type; github.event.inputs.* is a string, so 'false' would be truthy
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -DisableInteractive -DoNotUpdateTOOL -BuildInstaller
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-cuda-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-cuda-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-cuda-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-cuda-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  ubuntu-vcpkg-opencv3-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -ForceOpenCVVersion 3 -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu-vcpkg-opencv2-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -ForceOpenCVVersion 2 -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  ubuntu-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Build'
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -EnableCUDA -EnableCUDNN -DisableInteractive -DoNotUpdateTOOL
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-cuda-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-cuda-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-cuda-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-cuda-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  ubuntu-no-ocv-cpp:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+    - name: Test on data/dog.jpg
+      shell: bash
+      run: >
+        wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -O ${{ github.workspace }}/yolov4-tiny.weights;
+        ${{ github.workspace }}/build_release/darknet detect ${{ github.workspace }}/cfg/yolov4-tiny.cfg ${{ github.workspace }}/yolov4-tiny.weights ${{ github.workspace }}/data/dog.jpg -dont_show
+
+
+  ubuntu-setup-sh:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Setup'
+      shell: bash
+      run: ${{ github.workspace }}/scripts/setup.sh -InstallTOOLS -InstallCUDA -BypassDRIVER
+
+
+  osx-vcpkg:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Setup tmate session
+      uses: mxschmitt/action-tmate@v3
+      if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}  # 'inputs' keeps the boolean type; github.event.inputs.* is a string, so 'false' would be truthy
+
+    - name: Install dependencies
+      run: brew install libomp yasm nasm pkg-config
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL -BuildInstaller
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  osx:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: brew install opencv libomp
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  osx-no-ocv-no-omp-cpp:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-vcpkg:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Setup tmate session
+      uses: mxschmitt/action-tmate@v3
+      if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}  # 'inputs' keeps the boolean type; github.event.inputs.* is a string, so 'false' would be truthy
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -ForceLocalVCPKG -DoNotUpdateVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL -BuildInstaller
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: ${{ github.workspace }}/build_release/*.dll
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-vcpkg-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  win-vcpkg-port:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}  # consumed as "$BAGET_API_KEY" in the script so the secret is never expanded into the shell source
+      if: env.BAGET_API_KEY != null
+      run: >
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -ForceLocalVCPKG -InstallDARKNETthroughVCPKG -ForceVCPKGDarknetHEAD -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-intlibs:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -DisableInteractive -DoNotUpdateTOOL
+
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: cfg
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: data
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/*dark*
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/3rdparty/pthreads/bin/*.dll
+    - uses: actions/upload-artifact@v3
+      with:
+        name: darknet-${{ runner.os }}
+        path: ${{ github.workspace }}/uselib*
+
+
+  win-setup-ps1:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found  # skipped when the BAGET_API_KEY secret is empty; the key reaches the script only through the environment
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}
+      if: env.BAGET_API_KEY != null
+      run: >
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Setup'
+      shell: pwsh
+      run: ${{ github.workspace }}/scripts/setup.ps1 -InstallCUDA
+
+
+  win-vcpkg-base-cpp:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: Setup NuGet API key if found  # skipped when the BAGET_API_KEY secret is empty; the key reaches the script only through the environment
+      shell: bash
+      env:
+        BAGET_API_KEY: ${{ secrets.BAGET_API_KEY }}
+      if: env.BAGET_API_KEY != null
+      run: >
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1)
+        setapikey "$BAGET_API_KEY"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -ForceLocalVCPKG -DoNotUpdateVCPKG -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+    - name: Download yolov4-tiny.weights
+      run: curl.exe -L https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -o ${{ github.workspace }}\yolov4-tiny.weights
+    - name: Test on data/dog.jpg
+      run: ${{ github.workspace }}\build_release\darknet.exe detect ${{ github.workspace }}\cfg\yolov4-tiny.cfg ${{ github.workspace }}\yolov4-tiny.weights ${{ github.workspace }}\data\dog.jpg
+
+
+  win-csharp:  # Windows build with the C# wrapper switched on (-EnableCSharpWrapper)
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableCSharpWrapper -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-intlibs-cuda:  # runs scripts/deploy-cuda.ps1, then builds with -EnableCUDA
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.ps1
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      env:  # NOTE(review): paths are pinned to CUDA v12.2; presumably this must match what deploy-cuda.ps1 installs - verify
+        CUDA_PATH: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2"
+        CUDA_TOOLKIT_ROOT_DIR: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2"
+        CUDACXX: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2\\bin\\nvcc.exe"
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableCUDA -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-powershell51:  # same build as win-intlibs, but run through `shell: powershell` instead of `pwsh`
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: powershell
+      run: ${{ github.workspace }}/build.ps1 -DisableInteractive -DoNotUpdateTOOL
+
+
+  mingw:  # direct CMake build with the "MinGW Makefiles" generator; CUDA, cuDNN and OpenCV are all OFF
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build with CMake'  # no `shell:` key, so the runner's default shell executes the script
+      run: |
+        mkdir build_release
+        cd build_release
+        cmake .. -G"MinGW Makefiles" -DCMAKE_BUILD_TYPE=Release -DENABLE_CUDA=OFF -DENABLE_CUDNN=OFF -DENABLE_OPENCV=OFF
+        cmake --build . --config Release --target install
diff --git a/darknet-master/.github/workflows/on_pr.yml b/darknet-master/.github/workflows/on_pr.yml
new file mode 100644
index 0000000..6e7d36b
--- /dev/null
+++ b/darknet-master/.github/workflows/on_pr.yml
@@ -0,0 +1,469 @@
+name: Darknet Pull Requests
+
+on: [pull_request]  # the jobs in this workflow run on pull request events only
+
+env:
+  VCPKG_BINARY_SOURCES: 'clear;nuget,vcpkgbinarycache,read'  # read access to the NuGet source the jobs register under the name "vcpkgbinarycache"
+  VCPKG_FORCE_DOWNLOADED_BINARIES: "TRUE"
+
+jobs:
+  ubuntu-makefile:  # builds with the plain Makefile under several flag combinations; each step ends with `make clean`
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'  # links the stub libcuda.so under the names libcuda.so and libcuda.so.1
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0'  # step names from here on repeat the make flags they exercise
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 DEBUG=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 DEBUG=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 AVX=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=0 AVX=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=0 CUDNN=0 OPENCV=1'
+      run: |
+        make LIBSO=1 GPU=0 CUDNN=0 OPENCV=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1'  # GPU=1 steps first put the CUDA bin and lib64/stubs directories on PATH and LD_LIBRARY_PATH
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1'
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 -j 8
+        make clean
+    - name: 'LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 USE_CPP=1'
+      run: |
+        export PATH=/usr/local/cuda/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        make LIBSO=1 GPU=1 CUDNN=1 OPENCV=1 CUDNN_HALF=1 USE_CPP=1 -j 8
+        make clean
+
+
+  ubuntu-vcpkg-opencv4-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'  # vcpkg-based build with OpenCV, CUDA and cuDNN enabled
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs"  # env values are literal text, so a "$LD_LIBRARY_PATH" suffix would never be expanded
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu-vcpkg-opencv3-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'  # vcpkg-based build with CUDA and cuDNN, forcing OpenCV major version 3
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs"  # env values are literal text, so a "$LD_LIBRARY_PATH" suffix would never be expanded
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -ForceOpenCVVersion 3 -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu-vcpkg-opencv2-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt-get install -y --no-install-recommends yasm nasm gperf libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    - name: Clean downloads
+      run: sudo apt-get clean
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'  # vcpkg-based build with CUDA and cuDNN, forcing OpenCV major version 2
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs"  # env values are literal text, so a "$LD_LIBRARY_PATH" suffix would never be expanded
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -ForceOpenCVVersion 2 -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'  # OpenCV-only build; NOTE(review): the CUDA variables are set although this job installs no CUDA and passes no -EnableCUDA
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs"  # env values are literal text, so a "$LD_LIBRARY_PATH" suffix would never be expanded
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu-cuda:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Update apt
+      run: sudo apt update
+    - name: Install dependencies
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.sh
+
+    - name: 'Create softlinks for CUDA'
+      run: |
+        source ${{ github.workspace }}/scripts/requested_cuda_version.sh
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so.1
+        sudo ln -s /usr/local/cuda-${CUDA_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_VERSION}/lib64/libcuda.so
+
+    - name: 'Build'  # build against the apt-installed OpenCV with CUDA and cuDNN enabled
+      shell: pwsh
+      env:
+        CUDACXX: "/usr/local/cuda/bin/nvcc"
+        CUDA_PATH: "/usr/local/cuda"
+        CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+        LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs"  # env values are literal text, so a "$LD_LIBRARY_PATH" suffix would never be expanded
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -EnableCUDA -EnableCUDNN -DisableInteractive -DoNotUpdateTOOL
+
+
+  ubuntu-no-ocv-cpp:  # -ForceCPP build without -EnableOPENCV, followed by a yolov4-tiny detection run on data/dog.jpg
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+    - name: Test on data/dog.jpg  # downloads the weights from the AlexeyAB/darknet release, then runs `darknet detect` with -dont_show
+      shell: bash
+      run: >
+        wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -O ${{ github.workspace }}/yolov4-tiny.weights;
+        ${{ github.workspace }}/build_release/darknet detect ${{ github.workspace }}/cfg/yolov4-tiny.cfg ${{ github.workspace }}/yolov4-tiny.weights ${{ github.workspace }}/data/dog.jpg -dont_show
+
+
+  ubuntu-setup-sh:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Setup'
+      shell: bash
+      run: ${{ github.workspace }}/scripts/setup.sh -InstallTOOLS -InstallCUDA -BypassDRIVER
+
+
+  osx-vcpkg:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: brew install libomp yasm nasm pkg-config
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        mono $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -DoNotUpdateVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+
+  osx:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: brew install opencv libomp
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+
+  osx-no-ocv-no-omp-cpp:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-vcpkg:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -ForceLocalVCPKG -DoNotUpdateVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-intlibs:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-setup-ps1:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Setup'
+      shell: pwsh
+      run: ${{ github.workspace }}/scripts/setup.ps1 -InstallCUDA
+
+
+  win-vcpkg-base-cpp:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Setup vcpkg and NuGet artifacts backend'
+      shell: bash
+      run: >
+        git clone --depth 1 https://github.com/microsoft/vcpkg ;
+        ./vcpkg/bootstrap-vcpkg.sh ;
+        $(./vcpkg/vcpkg fetch nuget | tail -n 1) sources add
+        -Name "vcpkgbinarycache"
+        -Source http://93.49.111.10:5555/v3/index.json
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -UseVCPKG -ForceLocalVCPKG -DoNotUpdateVCPKG -ForceCPP -DisableInteractive -DoNotUpdateTOOL
+
+    - name: Download yolov4-tiny.weights
+      run: curl.exe -L https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -o ${{ github.workspace }}\yolov4-tiny.weights
+    - name: Test on data/dog.jpg
+      run: ${{ github.workspace }}\build_release\darknet.exe detect ${{ github.workspace }}\cfg\yolov4-tiny.cfg ${{ github.workspace }}\yolov4-tiny.weights ${{ github.workspace }}\data\dog.jpg
+
+
+  win-csharp:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableCSharpWrapper -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-intlibs-cuda:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: 'Install CUDA'
+      run: ${{ github.workspace }}/scripts/deploy-cuda.ps1
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      env:
+        CUDA_PATH: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2"
+        CUDA_TOOLKIT_ROOT_DIR: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2"
+        CUDACXX: "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2\\bin\\nvcc.exe"
+      shell: pwsh
+      run: ${{ github.workspace }}/build.ps1 -EnableCUDA -DisableInteractive -DoNotUpdateTOOL
+
+
+  win-powershell51:  # same build as win-intlibs, but run through `shell: powershell` instead of `pwsh`
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build'
+      shell: powershell
+      run: ${{ github.workspace }}/build.ps1 -DisableInteractive -DoNotUpdateTOOL
+
+
+  mingw:  # direct CMake build with the "MinGW Makefiles" generator; CUDA, cuDNN and OpenCV are all OFF
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: lukka/get-cmake@latest
+
+    - name: 'Build with CMake'  # no `shell:` key, so the runner's default shell executes the script
+      run: |
+        mkdir build_release
+        cd build_release
+        cmake .. -G"MinGW Makefiles" -DCMAKE_BUILD_TYPE=Release -DENABLE_CUDA=OFF -DENABLE_CUDNN=OFF -DENABLE_OPENCV=OFF
+        cmake --build . --config Release --target install
diff --git a/darknet-master/.github/workflows/rebase.yml b/darknet-master/.github/workflows/rebase.yml
new file mode 100644
index 0000000..f9e853e
--- /dev/null
+++ b/darknet-master/.github/workflows/rebase.yml
@@ -0,0 +1,26 @@
+name: Automatic Rebase
+on:
+  issue_comment:
+    types: [created]  # evaluated for every newly created comment; the job-level `if` does the filtering
+jobs:
+  rebase:  # runs only when the comment belongs to a pull request and its body contains /rebase or /autosquash
+    name: Rebase
+    runs-on: ubuntu-latest
+    if: >-
+      github.event.issue.pull_request != '' &&
+      (
+        contains(github.event.comment.body, '/rebase') ||
+        contains(github.event.comment.body, '/autosquash')
+      )
+    steps:
+      - name: Checkout the latest code
+        uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          fetch-depth: 0 # full history; otherwise, you will fail to push refs to dest repo
+      - name: Automatic Rebase
+        uses: cirrus-actions/rebase@1.8
+        with:  # autosquash is requested when the comment contains /autosquash or /rebase-autosquash
+          autosquash: ${{ contains(github.event.comment.body, '/autosquash') || contains(github.event.comment.body, '/rebase-autosquash') }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/darknet-master/.gitignore b/darknet-master/.gitignore
new file mode 100644
index 0000000..bb62a60
--- /dev/null
+++ b/darknet-master/.gitignore
@@ -0,0 +1,70 @@
+*.o
+*.a
+*.dSYM
+*.csv
+*.out
+*.png
+*.jpg
+*.so
+*.exe
+*.dll
+*.lib
+*.dylib
+*.pyc
+*.bak
+mnist/
+data/
+caffe/
+grasp/
+images/
+opencv/
+convnet/
+decaf/
+submission/
+cfg/
+temp/
+build/darknet/*
+build_*/
+debug/
+ninja/
+ninja.zip
+vcpkg_installed/
+!build/darknet/YoloWrapper.cs
+.fuse*
+*.weights
+build/*.cmake
+build/*.ninja
+build/*.txt
+build/*.json
+build/CMakeFiles/
+build/detect_cuda_compute_capabilities.cu
+build/.ninja_deps
+build/.ninja_log
+build/Makefile
+*/vcpkg-manifest-install.log
+build.log
+__pycache__/
+
+# OS Generated #
+.DS_Store*
+ehthumbs.db
+Icon?
+Thumbs.db
+*.swp
+
+# IDE generated #
+.vs/
+.vscode/
+
+# Managed by CMake
+src/version.h
+
+# Build artifacts
+lib/
+share/
+include/darknet/
+uselib
+uselib_track
+darknet
+kmeansiou
+vcpkg/
diff --git a/darknet-master/3rdparty/getopt/getopt.c b/darknet-master/3rdparty/getopt/getopt.c
new file mode 100644
index 0000000..61aa096
--- /dev/null
+++ b/darknet-master/3rdparty/getopt/getopt.c
@@ -0,0 +1,498 @@
+#ifdef _MSC_VER
+#include "getopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef REPLACE_GETOPT
+int opterr = 1; /* if error message should be printed */
+int optind = 1; /* index into parent argv vector */
+int optopt = '?'; /* character checked for validity */
+#undef optreset /* see getopt.h */
+#define optreset __mingw_optreset
+int optreset; /* reset getopt */
+char* optarg; /* argument associated with option */
+#endif
+
+static void
+_vwarnx(const char* fmt, va_list ap) /* write "<__progname>: <formatted message>\n" to stderr */
+{
+  (void)fprintf(stderr, "%s: ", __progname);
+  if (fmt != NULL) /* with a NULL format only the prefix and the newline are printed */
+    (void)vfprintf(stderr, fmt, ap);
+  (void)fprintf(stderr, "\n");
+}
+
+static void
+warnx(const char* fmt, ...) /* printf-style front end: packs its variadic arguments and hands them to _vwarnx() */
+{
+  va_list ap;
+  va_start(ap, fmt);
+  _vwarnx(fmt, ap);
+  va_end(ap);
+}
+
+/*
+ * Compute the greatest common divisor of a and b (Euclid's algorithm).
+ * When b is 0 the result is a, instead of evaluating a % 0.
+ */
+static int
+gcd(int a, int b)
+{
+  int c;
+
+  while (b != 0) {
+    c = a % b; /* remainder becomes the next divisor */
+    a = b;
+    b = c;
+  }
+
+  return (a);
+}
+
+/*
+ * Exchange the block of nargv entries [panonopt_start, panonopt_end) with
+ * the block [panonopt_end, opt_end), keeping the same order of arguments
+ * in each block.  nargv is modified in place through const-removing casts.
+ */
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+    char* const* nargv)
+{
+  int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+  char* swap;
+
+  /*
+   * compute lengths of blocks and number and size of cycles
+   */
+  nnonopts = panonopt_end - panonopt_start;
+  nopts = opt_end - panonopt_end;
+  ncycle = gcd(nnonopts, nopts);
+  cyclelen = (opt_end - panonopt_start) / ncycle; /* NOTE(review): needs ncycle != 0; presumably callers never pass two empty blocks - verify */
+
+  for (i = 0; i < ncycle; i++) {
+    cstart = panonopt_end + i; /* each cycle starts at a different slot of the second block */
+    pos = cstart;
+    for (j = 0; j < cyclelen; j++) {
+      if (pos >= panonopt_end) /* pos is in the second block: step back by the first block's length */
+        pos -= nnonopts;
+      else /* pos is in the first block: step forward by the second block's length */
+        pos += nopts;
+      swap = nargv[pos];
+      /* LINTED const cast */
+      ((char**)nargv)[pos] = nargv[cstart];
+      /* LINTED const cast */
+      ((char**)nargv)[cstart] = swap;
+    }
+  }
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *    Parse argc/argv argument vector.
+ *    Forwards to getopt_internal() with no long options, no idx and flags 0.
+ * [eventually this will replace the BSD getopt]
+ */
+int getopt(int nargc, char* const* nargv, const char* options)
+{
+
+  /*
+   * We don't pass FLAG_PERMUTE to getopt_internal() since
+   * the BSD getopt(3) (unlike GNU) has never done this.
+   *
+   * Furthermore, since many privileged programs call getopt()
+   * before dropping privileges it makes sense to keep things
+   * as simple (and bug-free) as possible.
+   */
+  return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
+}
+#endif /* REPLACE_GETOPT */
+
+//extern int getopt(int nargc, char * const *nargv, const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+/*
+ * POSIX requires the `getopt' API to be specified in `unistd.h';
+ * thus, `unistd.h' includes this header.  However, we do not want
+ * to expose the `getopt_long' or `getopt_long_only' APIs, when
+ * included in this manner.  Thus, close the standard __GETOPT_H__
+ * declarations block, and open an additional __GETOPT_LONG_H__
+ * specific block, only when *not* __UNISTD_H_SOURCED__, in which
+ * to declare the extended API.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct option /* specification for a long form option...    */
+{
+  const char* name; /* option name, without leading hyphens */
+  int has_arg; /* does it take an argument?        */
+  int* flag; /* where to save its status, or NULL    */
+  int val; /* its associated status value        */
+};
+
+enum /* permitted values for its `has_arg' field...    */
+{
+  no_argument = 0, /* option never takes an argument    */
+  required_argument, /* option always requires an argument    */
+  optional_argument /* option may take an argument        */
+};
+
+/*
+ * parse_long_options --
+ *    Match the text at `place` against long_options (exact name or unambiguous prefix).
+ * Returns the option's val (or 0 after storing val in *flag), BADCH/BADARG on error,
+ * or -1 if short_too is set and the option does not match long_options.
+ */
+static int
+parse_long_options(char* const* nargv, const char* options,
+    const struct option* long_options, int* idx, int short_too)
+{
+  char *current_argv, *has_equal;
+  size_t current_argv_len;
+  int i, ambiguous, match;
+
+#define IDENTICAL_INTERPRETATION(_x, _y) \
+  (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag && long_options[(_x)].val == long_options[(_y)].val)
+
+  current_argv = place;
+  match = -1; /* index of the best candidate so far, -1 while none */
+  ambiguous = 0;
+
+  optind++; /* consume this argv element; rolled back on some paths below */
+
+  if ((has_equal = strchr(current_argv, '=')) != NULL) {
+    /* argument found (--option=arg) */
+    current_argv_len = has_equal - current_argv;
+    has_equal++; /* now points at the argument text after '=' */
+  } else
+    current_argv_len = strlen(current_argv);
+
+  for (i = 0; long_options[i].name; i++) {
+    /* find matching long option */
+    if (strncmp(current_argv, long_options[i].name,
+            current_argv_len))
+      continue;
+
+    if (strlen(long_options[i].name) == current_argv_len) {
+      /* exact match */
+      match = i;
+      ambiguous = 0;
+      break;
+    }
+    /*
+     * If this is a known short option, don't allow
+     * a partial match of a single character.
+     */
+    if (short_too && current_argv_len == 1)
+      continue;
+
+    if (match == -1) /* partial match */
+      match = i;
+    else if (!IDENTICAL_INTERPRETATION(i, match)) /* a second prefix match only counts if it behaves differently */
+      ambiguous = 1;
+  }
+  if (ambiguous) {
+    /* ambiguous abbreviation */
+    if (PRINT_ERROR)
+      warnx(ambig, (int)current_argv_len,
+          current_argv);
+    optopt = 0;
+    return (BADCH);
+  }
+  if (match != -1) { /* option found */
+    if (long_options[match].has_arg == no_argument
+        && has_equal) {
+      if (PRINT_ERROR)
+        warnx(noarg, (int)current_argv_len,
+            current_argv);
+      /*
+       * XXX: GNU sets optopt to val regardless of flag
+       */
+      if (long_options[match].flag == NULL)
+        optopt = long_options[match].val;
+      else
+        optopt = 0;
+      return (BADARG);
+    }
+    if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) {
+      if (has_equal)
+        optarg = has_equal;
+      else if (long_options[match].has_arg == required_argument) {
+        /*
+         * optional argument doesn't use next nargv
+         */
+        optarg = nargv[optind++];
+      }
+    }
+    if ((long_options[match].has_arg == required_argument)
+        && (optarg == NULL)) {
+      /*
+       * Missing argument; leading ':' indicates no error
+       * should be generated.
+       */
+      if (PRINT_ERROR)
+        warnx(recargstring,
+            current_argv);
+      /*
+       * XXX: GNU sets optopt to val regardless of flag
+       */
+      if (long_options[match].flag == NULL)
+        optopt = long_options[match].val;
+      else
+        optopt = 0;
+      --optind; /* give back the argv slot taken for the missing argument */
+      return (BADARG);
+    }
+  } else { /* unknown option */
+    if (short_too) {
+      --optind; /* undo the optind++ above before reporting "no match" */
+      return (-1);
+    }
+    if (PRINT_ERROR)
+      warnx(illoptstring, current_argv);
+    optopt = 0;
+    return (BADCH);
+  }
+  if (idx) /* idx is optional; report the matched index only when requested */
+    *idx = match;
+  if (long_options[match].flag) {
+    *long_options[match].flag = long_options[match].val;
+    return (0);
+  } else
+    return (long_options[match].val);
+#undef IDENTICAL_INTERPRETATION
+}
+
+/*
+ * getopt_internal --
+ *    Return the next option found in nargv.  Common engine of getopt_long() and getopt_long_only().
+ */
+static int /* option char, long option result, BADCH/BADARG, INORDER, or -1 once parsing stops */
+getopt_internal(int nargc, char* const* nargv, const char* options,
+    const struct option* long_options, int* idx, int flags) /* flags: mask of FLAG_* bits */
+{
+  char* oli; /* option letter list index: position of optchar inside options */
+  int optchar, short_too; /* short_too: a long-only arg could also be short options */
+  static int posixly_correct = -1; /* -1: POSIXLY_CORRECT not looked up yet */
+
+  if (options == NULL)
+    return (-1);
+
+  /*
+   * XXX Some GNU programs (like cvs) set optind to 0 instead of
+   * XXX using optreset.  Work around this braindamage.
+   */
+  if (optind == 0)
+    optind = optreset = 1;
+
+  /*
+   * Disable GNU extensions if POSIXLY_CORRECT is set or options
+   * string begins with a '+'.
+   *
+   * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or
+   *                 optreset != 0 for GNU compatibility.
+   */
+  if (posixly_correct == -1 || optreset != 0)
+    posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
+  if (*options == '-')
+    flags |= FLAG_ALLARGS; /* leading '-': non-options are handed back as INORDER */
+  else if (posixly_correct || *options == '+')
+    flags &= ~FLAG_PERMUTE; /* stop at the first non-option instead of permuting */
+  if (*options == '+' || *options == '-')
+    options++; /* step over the mode prefix; PRINT_ERROR/BADARG test the char after it */
+
+  optarg = NULL;
+  if (optreset)
+    nonopt_start = nonopt_end = -1; /* forget any pending permutation */
+start:
+  if (optreset || !*place) { /* update scanning pointer */
+    optreset = 0;
+    if (optind >= nargc) { /* end of argument vector */
+      place = EMSG;
+      if (nonopt_end != -1) {
+        /* do permutation, if we have to */
+        permute_args(nonopt_start, nonopt_end,
+            optind, nargv);
+        optind -= nonopt_end - nonopt_start; /* back up to the first moved non-option */
+      } else if (nonopt_start != -1) {
+        /*
+         * If we skipped non-options, set optind
+         * to the first of them.
+         */
+        optind = nonopt_start;
+      }
+      nonopt_start = nonopt_end = -1;
+      return (-1);
+    }
+    if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) {
+      place = EMSG; /* found non-option */
+      if (flags & FLAG_ALLARGS) {
+        /*
+         * GNU extension:
+         * return non-option as argument to option 1
+         */
+        optarg = nargv[optind++];
+        return (INORDER);
+      }
+      if (!(flags & FLAG_PERMUTE)) {
+        /*
+         * If no permutation wanted, stop parsing
+         * at first non-option.
+         */
+        return (-1);
+      }
+      /* do permutation: remember the run of skipped non-options */
+      if (nonopt_start == -1)
+        nonopt_start = optind;
+      else if (nonopt_end != -1) {
+        permute_args(nonopt_start, nonopt_end,
+            optind, nargv);
+        nonopt_start = optind - (nonopt_end - nonopt_start);
+        nonopt_end = -1;
+      }
+      optind++;
+      /* process next argument */
+      goto start;
+    }
+    if (nonopt_start != -1 && nonopt_end == -1)
+      nonopt_end = optind; /* first option after the skipped non-options */
+
+    /*
+     * If we have "-" do nothing, if "--" we are done.
+     * (For anything longer than "-" the test below also moves place past the first '-'.)
+     */
+    if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
+      optind++;
+      place = EMSG;
+      /*
+       * We found an option (--), so if we skipped
+       * non-options, we have to permute.
+       */
+      if (nonopt_end != -1) {
+        permute_args(nonopt_start, nonopt_end,
+            optind, nargv);
+        optind -= nonopt_end - nonopt_start;
+      }
+      nonopt_start = nonopt_end = -1;
+      return (-1);
+    }
+  }
+
+  /*
+   * Check long options if:
+   *  1) we were passed some
+   *  2) the arg is not just "-" (place was advanced past the leading '-')
+   *  3) either the arg starts with -- or we are getopt_long_only()
+   */
+  if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) {
+    short_too = 0;
+    if (*place == '-')
+      place++; /* --foo long option */
+    else if (*place != ':' && strchr(options, *place) != NULL)
+      short_too = 1; /* could be short option too */
+
+    optchar = parse_long_options(nargv, options, long_options,
+        idx, short_too);
+    if (optchar != -1) { /* -1 here means: no long match, fall through to short options */
+      place = EMSG;
+      return (optchar);
+    }
+  }
+
+  if ((optchar = (int)*place++) == (int)':' || (optchar == (int)'-' && *place != '\0') || (oli = (char*)strchr(options, optchar)) == NULL) {
+    /*
+     * If the user specified "-" and  '-' isn't listed in
+     * options, return -1 (non-option) as per POSIX.
+     * Otherwise, it is an unknown option character (or ':').
+     */
+    if (optchar == (int)'-' && *place == '\0')
+      return (-1);
+    if (!*place)
+      ++optind; /* cluster exhausted: move on to the next argv element */
+    if (PRINT_ERROR)
+      warnx(illoptchar, optchar);
+    optopt = optchar;
+    return (BADCH);
+  }
+  if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
+    /* -W long-option: enabled by "W;" in options, the text after -W is parsed as a long option */
+    if (*place) /* no space */
+      /* NOTHING */;
+    else if (++optind >= nargc) { /* no arg */
+      place = EMSG;
+      if (PRINT_ERROR)
+        warnx(recargchar, optchar);
+      optopt = optchar;
+      return (BADARG);
+    } else /* white space */
+      place = nargv[optind];
+    optchar = parse_long_options(nargv, options, long_options,
+        idx, 0);
+    place = EMSG;
+    return (optchar);
+  }
+  if (*++oli != ':') { /* doesn't take argument */
+    if (!*place)
+      ++optind;
+  } else { /* takes (optional) argument */
+    optarg = NULL;
+    if (*place) /* no white space */
+      optarg = place;
+    else if (oli[1] != ':') { /* arg not optional */
+      if (++optind >= nargc) { /* no arg */
+        place = EMSG;
+        if (PRINT_ERROR)
+          warnx(recargchar, optchar);
+        optopt = optchar;
+        return (BADARG);
+      } else
+        optarg = nargv[optind];
+    }
+    place = EMSG; /* rest of this argv element (if any) was the argument */
+    ++optind;
+  }
+  /* dump back option letter */
+  return (optchar);
+}
+
+/*
+ * getopt_long --
+ *    Long-option front end: runs getopt_internal() with only FLAG_PERMUTE set.
+ */
+int getopt_long(int nargc, char* const* nargv, const char* options,
+    const struct option* long_options, int* idx)
+{
+  const int parse_flags = FLAG_PERMUTE; /* permute non-options; no long-only matching */
+
+  return getopt_internal(nargc, nargv, options, long_options, idx, parse_flags);
+}
+
+/*
+ * getopt_long_only --
+ *    Like getopt_long(), but also sets FLAG_LONGONLY so single-dash args are tried as long options.
+ */
+int getopt_long_only(int nargc, char* const* nargv, const char* options,
+    const struct option* long_options, int* idx)
+{
+  const int parse_flags = FLAG_PERMUTE | FLAG_LONGONLY; /* permute and match long options first */
+
+  return getopt_internal(nargc, nargv, options, long_options, idx, parse_flags);
+}
+
+//extern int getopt_long(int nargc, char * const *nargv, const char *options,
+//    const struct option *long_options, int *idx);
+//extern int getopt_long_only(int nargc, char * const *nargv, const char *options,
+//    const struct option *long_options, int *idx);
+/*
+ * Previous MinGW implementation had HAVE_DECL_GETOPT (see the same note in getopt.h, which defines it).
+ */
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/3rdparty/getopt/getopt.h b/darknet-master/3rdparty/getopt/getopt.h
new file mode 100644
index 0000000..a9f913d
--- /dev/null
+++ b/darknet-master/3rdparty/getopt/getopt.h
@@ -0,0 +1,228 @@
+#ifdef _MSC_VER
+#ifndef __GETOPT_H__
+/**
+ * DISCLAIMER
+ * This file is part of the mingw-w64 runtime package.
+ *
+ * The mingw-w64 runtime package and its code is distributed in the hope that it
+ * will be useful but WITHOUT ANY WARRANTY.  ALL WARRANTIES, EXPRESSED OR
+ * IMPLIED ARE HEREBY DISCLAIMED.  This includes but is not limited to
+ * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+/*
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define __GETOPT_H__
+
+/* All the headers include this file. */
+#include <crtdefs.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */
+
+//extern int optind;        /* index of first non-option in argv      */
+//extern int optopt;        /* single option character, as parsed     */
+//extern int opterr;        /* flag to enable built-in diagnostics... */
+//                /* (user may set to zero, to suppress)    */
+// NOTE(review): these declarations are disabled, so this header does not declare optind/optopt/opterr/optarg - confirm where they come from.
+//extern char *optarg;        /* pointer to argument of current option  */
+
+#define PRINT_ERROR ((opterr) && (*options != ':')) /* usable only where opterr and a local named options are in scope */
+
+#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */
+#define FLAG_ALLARGS 0x02 /* return non-options in place, as the argument of option 1 (INORDER) */
+#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */
+
+/* return values (BADARG, like PRINT_ERROR, reads the local named options at the point of use) */
+#define BADCH (int)'?'
+#define BADARG ((*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
+
+#ifndef __CYGWIN__
+#define __progname __argv[0] /* outside Cygwin the program name is taken from __argv[0] */
+#else
+extern char __declspec(dllimport) * __progname;
+#endif
+
+#ifdef __CYGWIN__
+static char EMSG[] = "";
+#else
+#define EMSG "" /* empty string: marks that there is no partly scanned argument */
+#endif
+
+static int getopt_internal(int, char* const*, const char*,
+    const struct option*, int*, int); /* NOTE(review): struct option is not declared in this header - confirm it is visible before inclusion */
+static int parse_long_options(char* const*, const char*,
+    const struct option*, int*, int);
+static int gcd(int, int);
+static void permute_args(int, int, int, char* const*);
+
+static char* place = EMSG; /* option letter processing: scan position inside the current argv element */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1; /* first option after non options (for permute) */
+
+/* Error messages: printf-style formats handed to warnx(); the %.*s ones take an int length, then the string */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+static void _vwarnx(const char* fmt, va_list ap);
+
+static void warnx(const char* fmt, ...);
+
+/*
+ * Compute the greatest common divisor of a and b.
+ * (Repeats, with parameter names, the static prototype already given above.)
+ */
+static int gcd(int a, int b);
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).  Also a repeat of the prototype given above.
+ */
+static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv);
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *    Parse argc/argv argument vector.
+ *
+ * [eventually this will replace the BSD getopt]
+ */
+int getopt(int nargc, char* const* nargv, const char* options);
+#endif /* REPLACE_GETOPT */
+
+//extern int getopt(int nargc, char * const *nargv, const char *options);
+
+#ifdef _BSD_SOURCE
+/*
+ * BSD adds the non-standard `optreset' feature, for reinitialisation
+ * of `getopt' parsing.  We support this feature, for applications which
+ * proclaim their BSD heritage, before including this header; however,
+ * to maintain portability, developers are advised to avoid it.
+ */
+#define optreset __mingw_optreset
+extern int optreset;
+#endif
+#ifdef __cplusplus
+}
+#endif
+/*
+ * POSIX requires the `getopt' API to be specified in `unistd.h';
+ * thus, `unistd.h' includes this header.  However, we do not want
+ * to expose the `getopt_long' or `getopt_long_only' APIs, when
+ * included in this manner.  Thus, close the standard __GETOPT_H__
+ * declarations block, and open an additional __GETOPT_LONG_H__
+ * specific block, only when *not* __UNISTD_H_SOURCED__, in which
+ * to declare the extended API.
+ */
+#endif /* !defined(__GETOPT_H__) */
+
+#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__)
+#define __GETOPT_LONG_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * parse_long_options --
+ *    Parse long options in argc/argv argument vector.
+ * Returns -1 if short_too is set and the option does not match long_options.
+ */
+/* static int parse_long_options(char* const* nargv, const char* options, const struct option* long_options, int* idx, int short_too); */
+
+/*
+ * getopt_internal --
+ *    Parse argc/argv argument vector.  Called by user level routines.
+ */
+/* static int getopt_internal(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx, int flags); */
+
+/*
+ * getopt_long --
+ *    Parse argc/argv argument vector, accepting long options; getopt.c runs getopt_internal() with FLAG_PERMUTE.
+ */
+int getopt_long(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx);
+
+/*
+ * getopt_long_only --
+ *    Same as getopt_long(), but getopt.c additionally passes FLAG_LONGONLY to getopt_internal().
+ */
+int getopt_long_only(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx);
+
+/*
+ * Previous MinGW implementation had...
+ */
+#ifndef HAVE_DECL_GETOPT
+/*
+ * ...for the long form API only; keep this for compatibility.
+ */
+#define HAVE_DECL_GETOPT 1
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */
+#endif
diff --git a/darknet-master/3rdparty/pthreads/include/pthread.h b/darknet-master/3rdparty/pthreads/include/pthread.h
new file mode 100644
index 0000000..b4072f7
--- /dev/null
+++ b/darknet-master/3rdparty/pthreads/include/pthread.h
@@ -0,0 +1,1368 @@
+/* This is an implementation of the threads API of POSIX 1003.1-2001.
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj@callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#if !defined( PTHREAD_H )
+#define PTHREAD_H
+
+/*
+ * See the README file for an explanation of the pthreads-win32 version
+ * numbering scheme and how the DLL is named etc.
+ */
+#define PTW32_VERSION 2,9,1,0
+#define PTW32_VERSION_STRING "2, 9, 1, 0\0"
+
+/* There are three implementations of cancel cleanup.
+ * Note that pthread.h is included in both application
+ * compilation units and also internally for the library.
+ * The code here and within the library aims to work
+ * for all reasonable combinations of environments.
+ *
+ * The three implementations are:
+ *
+ *   WIN32 SEH
+ *   C
+ *   C++
+ *
+ * Please note that exiting a push/pop block via
+ * "return", "exit", "break", or "continue" will
+ * lead to different behaviour amongst applications
+ * depending upon whether the library was built
+ * using SEH, C++, or C. For example, a library built
+ * with SEH will call the cleanup routine, while both
+ * C++ and C built versions will not.
+ */
+
+/*
+ * Define defaults for cleanup code.
+ * Note: Unless the build explicitly defines one of the following, then
+ * we default to standard C style cleanup. This style uses setjmp/longjmp
+ * in the cancelation and thread exit implementations and therefore won't
+ * do stack unwinding if linked to applications that have it (e.g.
+ * C++ apps). This is currently consistent with most/all commercial Unix
+ * POSIX threads implementations.
+ */
+#if !defined( __CLEANUP_SEH ) && !defined( __CLEANUP_CXX ) && !defined( __CLEANUP_C )
+# define __CLEANUP_C
+#endif
+
+#if defined( __CLEANUP_SEH ) && ( !defined( _MSC_VER ) && !defined(PTW32_RC_MSC))
+#error ERROR [__FILE__, line __LINE__]: SEH is not supported for this compiler.
+#endif
+
+/*
+ * Stop here if we are being included by the resource compiler.
+ */
+#if !defined(RC_INVOKED)
+
+#undef PTW32_LEVEL /* feature level tested by the PTW32_LEVEL >= ... guards further down */
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_LEVEL)
+#undef PTW32_LEVEL /* a lower level may have been set above; redefining it without #undef is diagnosed */
+#define PTW32_LEVEL PTW32_LEVEL_MAX /* Include everything */
+#endif
+
+#if defined(_UWIN)
+#   define HAVE_STRUCT_TIMESPEC 1
+#   define HAVE_SIGNAL_H        1
+#   undef HAVE_PTW32_CONFIG_H
+#   pragma comment(lib, "pthread")
+#endif
+
+/*
+ * -------------------------------------------------------------
+ *
+ *
+ * Module: pthread.h
+ *
+ * Purpose:
+ *      Provides an implementation of PThreads based upon the
+ *      standard:
+ *
+ *              POSIX 1003.1-2001
+ *  and
+ *    The Single Unix Specification version 3
+ *
+ *    (these two are equivalent)
+ *
+ *      in order to enhance code portability between Windows,
+ *  various commercial Unix implementations, and Linux.
+ *
+ *      See the ANNOUNCE file for a full list of conforming
+ *      routines and defined constants, and a list of missing
+ *      routines and constants not defined in this implementation.
+ *
+ * Authors:
+ *      There have been many contributors to this library.
+ *      The initial implementation was contributed by
+ *      John Bossom, and several others have provided major
+ *      sections or revisions of parts of the implementation.
+ *      Often significant effort has been contributed to
+ *      find and fix important bugs and other problems to
+ *      improve the reliability of the library, which sometimes
+ *      is not reflected in the amount of code which changed as
+ *      result.
+ *      As much as possible, the contributors are acknowledged
+ *      in the ChangeLog file in the source code distribution
+ *      where their changes are noted in detail.
+ *
+ *      Contributors are listed in the CONTRIBUTORS file.
+ *
+ *      As usual, all bouquets go to the contributors, and all
+ *      brickbats go to the project maintainer.
+ *
+ * Maintainer:
+ *      The code base for this project is coordinated and
+ *      eventually pre-tested, packaged, and made available by
+ *
+ *              Ross Johnson <rpj@callisto.canberra.edu.au>
+ *
+ * QA Testers:
+ *      Ultimately, the library is tested in the real world by
+ *      a host of competent and demanding scientists and
+ *      engineers who report bugs and/or provide solutions
+ *      which are then fixed or incorporated into subsequent
+ *      versions of the library. Each time a bug is fixed, a
+ *      test case is written to prove the fix and ensure
+ *      that later changes to the code don't reintroduce the
+ *      same error. The number of test cases is slowly growing
+ *      and therefore so is the code reliability.
+ *
+ * Compliance:
+ *      See the file ANNOUNCE for the list of implemented
+ *      and not-implemented routines and defined options.
+ *      Of course, these are all defined in this file as well.
+ *
+ * Web site:
+ *      The source code and other information about this library
+ *      are available from
+ *
+ *              http://sources.redhat.com/pthreads-win32/
+ *
+ * -------------------------------------------------------------
+ */
+
+/* Try to avoid including windows.h */
+#if (defined(__MINGW64__) || defined(__MINGW32__)) && defined(__cplusplus)
+#define PTW32_INCLUDE_WINDOWS_H
+#endif
+
+#if defined(PTW32_INCLUDE_WINDOWS_H)
+#include <windows.h>
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1300 || defined(__DMC__)
+/*
+ * VC++6.0 or early compiler's header has no DWORD_PTR type.
+ */
+typedef unsigned long DWORD_PTR;
+typedef unsigned long ULONG_PTR;
+#endif
+/*
+ * -----------------
+ * autoconf switches
+ * -----------------
+ */
+
+#if defined(HAVE_PTW32_CONFIG_H)
+#include "config.h"
+#endif /* HAVE_PTW32_CONFIG_H */
+
+#if !defined(NEED_FTIME)
+#include <time.h>
+#else /* NEED_FTIME */
+/* use native WIN32 time API */
+#endif /* NEED_FTIME */
+
+#if defined(HAVE_SIGNAL_H)
+#include <signal.h>
+#endif /* HAVE_SIGNAL_H */
+
+#include <limits.h>
+
+/*
+ * Boolean values to make us independent of system includes.
+ */
+enum {
+  PTW32_FALSE = 0,
+  PTW32_TRUE = (! PTW32_FALSE)
+};
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Several systems don't define some error numbers.
+ * Each fallback below applies only if the name is not already a macro at this point.
+ */
+#if !defined(ENOTSUP)
+#  define ENOTSUP 48   /* This is the value in Solaris. */
+#endif
+
+#if !defined(ETIMEDOUT)
+#  define ETIMEDOUT 10060 /* Same as WSAETIMEDOUT */
+#endif
+
+#if !defined(ENOSYS)
+#  define ENOSYS 140     /* Semi-arbitrary value */
+#endif
+
+#if !defined(EDEADLK)
+#  if defined(EDEADLOCK)
+#    define EDEADLK EDEADLOCK /* reuse the alternative spelling when only that one exists */
+#  else
+#    define EDEADLK 36     /* This is the value in MSVC. */
+#  endif
+#endif
+
+/* POSIX 2008 - related to robust mutexes */
+#if !defined(EOWNERDEAD)
+#  define EOWNERDEAD 43 /* NOTE(review): origin of 43/44 is not stated here - confirm against the target errno.h */
+#endif
+#if !defined(ENOTRECOVERABLE)
+#  define ENOTRECOVERABLE 44
+#endif
+
+#include <sched.h>
+
+/*
+ * To avoid including windows.h we define only those things that we
+ * actually need from it.  The checks below only see macros, not typedefs.
+ */
+#if !defined(PTW32_INCLUDE_WINDOWS_H)
+#if !defined(HANDLE)
+# define PTW32__HANDLE_DEF /* marker: HANDLE was introduced by this header (presumably undone later - not visible here) */
+# define HANDLE void *
+#endif
+#if !defined(DWORD)
+# define PTW32__DWORD_DEF /* marker: DWORD was introduced by this header */
+# define DWORD unsigned long
+#endif
+#endif
+
+#if !defined(HAVE_STRUCT_TIMESPEC) /* NOTE(review): if the toolchain time.h already declares struct timespec, define this first */
+#define HAVE_STRUCT_TIMESPEC
+#if !defined(_TIMESPEC_DEFINED)
+#define _TIMESPEC_DEFINED
+struct timespec {
+        time_t tv_sec; /* presumably whole seconds - the name follows POSIX struct timespec */
+        long tv_nsec; /* presumably the nanosecond part */
+};
+#endif /* _TIMESPEC_DEFINED */
+#endif /* HAVE_STRUCT_TIMESPEC */
+
+#if !defined(SIG_BLOCK) /* fallbacks for the three SIG_* mask operations when no header supplied them */
+#define SIG_BLOCK 0
+#endif /* SIG_BLOCK */
+
+#if !defined(SIG_UNBLOCK)
+#define SIG_UNBLOCK 1
+#endif /* SIG_UNBLOCK */
+
+#if !defined(SIG_SETMASK)
+#define SIG_SETMASK 2
+#endif /* SIG_SETMASK */
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+/*
+ * -------------------------------------------------------------
+ *
+ * POSIX 1003.1-2001 Options
+ * =========================
+ *
+ * Options are normally set in <unistd.h>, which is not provided
+ * with pthreads-win32.
+ *
+ * For conformance with the Single Unix Specification (version 3), all of the
+ * options below are defined, and have a value of either -1 (not supported)
+ * or 200809L (supported; this is the value used by the definitions below).
+ *
+ * These options can neither be left undefined nor have a value of 0, because
+ * either indicates that sysconf(), which is not implemented, may be used at
+ * runtime to check the status of the option.
+ *
+ * _POSIX_THREADS (== 200809L)
+ *                      If == 200809L, you can use threads
+ *
+ * _POSIX_THREAD_ATTR_STACKSIZE (== 200809L)
+ *                      If == 200809L, you can control the size of a thread's
+ *                      stack
+ *                              pthread_attr_getstacksize
+ *                              pthread_attr_setstacksize
+ *
+ * _POSIX_THREAD_ATTR_STACKADDR (== -1)
+ *                      If == 200809L, you can allocate and control a thread's
+ *                      stack. If not supported, the following functions
+ *                      will return ENOSYS, indicating they are not
+ *                      supported:
+ *                              pthread_attr_getstackaddr
+ *                              pthread_attr_setstackaddr
+ *
+ * _POSIX_THREAD_PRIORITY_SCHEDULING (== -1)
+ *                      If == 200809L, you can use realtime scheduling.
+ *                      This option indicates that the behaviour of some
+ *                      implemented functions conforms to the additional TPS
+ *                      requirements in the standard. E.g. rwlocks favour
+ *                      writers over readers when threads have equal priority.
+ *
+ * _POSIX_THREAD_PRIO_INHERIT (== -1)
+ *                      If == 200809L, you can create priority inheritance
+ *                      mutexes.
+ *                              pthread_mutexattr_getprotocol +
+ *                              pthread_mutexattr_setprotocol +
+ *
+ * _POSIX_THREAD_PRIO_PROTECT (== -1)
+ *                      If == 200809L, you can create priority ceiling mutexes
+ *                      Indicates the availability of:
+ *                              pthread_mutex_getprioceiling
+ *                              pthread_mutex_setprioceiling
+ *                              pthread_mutexattr_getprioceiling
+ *                              pthread_mutexattr_getprotocol     +
+ *                              pthread_mutexattr_setprioceiling
+ *                              pthread_mutexattr_setprotocol     +
+ *
+ * _POSIX_THREAD_PROCESS_SHARED (== -1)
+ *                      If == 200809L, you can create mutexes and condition
+ *                      variables that can be shared with another
+ *                      process. If supported, indicates the availability
+ *                      of:
+ *                              pthread_mutexattr_getpshared
+ *                              pthread_mutexattr_setpshared
+ *                              pthread_condattr_getpshared
+ *                              pthread_condattr_setpshared
+ *
+ * _POSIX_THREAD_SAFE_FUNCTIONS (== 200809L)
+ *                      If == 200809L you can use the special *_r library
+ *                      functions that provide thread-safe behaviour
+ *
+ * _POSIX_READER_WRITER_LOCKS (== 200809L)
+ *                      If == 200809L, you can use read/write locks
+ *
+ * _POSIX_SPIN_LOCKS (== 200809L)
+ *                      If == 200809L, you can use spin locks
+ *
+ * _POSIX_BARRIERS (== 200809L)
+ *                      If == 200809L, you can use barriers
+ *
+ *      + These functions provide both 'inherit' and/or
+ *        'protect' protocol, based upon these macro
+ *        settings.
+ *
+ * -------------------------------------------------------------
+ */
+
+/*
+ * POSIX Options: each one is #undef'd first, so any earlier definition is replaced.
+ */
+#undef _POSIX_THREADS
+#define _POSIX_THREADS 200809L
+
+#undef _POSIX_READER_WRITER_LOCKS
+#define _POSIX_READER_WRITER_LOCKS 200809L
+
+#undef _POSIX_SPIN_LOCKS
+#define _POSIX_SPIN_LOCKS 200809L
+
+#undef _POSIX_BARRIERS
+#define _POSIX_BARRIERS 200809L
+
+#undef _POSIX_THREAD_SAFE_FUNCTIONS
+#define _POSIX_THREAD_SAFE_FUNCTIONS 200809L
+
+#undef _POSIX_THREAD_ATTR_STACKSIZE
+#define _POSIX_THREAD_ATTR_STACKSIZE 200809L
+
+/*
+ * The following options are not supported (value -1)
+ */
+#undef _POSIX_THREAD_ATTR_STACKADDR
+#define _POSIX_THREAD_ATTR_STACKADDR -1
+
+#undef _POSIX_THREAD_PRIO_INHERIT
+#define _POSIX_THREAD_PRIO_INHERIT -1
+
+#undef _POSIX_THREAD_PRIO_PROTECT
+#define _POSIX_THREAD_PRIO_PROTECT -1
+
+/* TPS is not fully supported.  */
+#undef _POSIX_THREAD_PRIORITY_SCHEDULING
+#define _POSIX_THREAD_PRIORITY_SCHEDULING -1
+
+#undef _POSIX_THREAD_PROCESS_SHARED
+#define _POSIX_THREAD_PROCESS_SHARED -1
+
+
+/*
+ * POSIX 1003.1-2001 Limits
+ * ===========================
+ *
+ * These limits are normally set in <limits.h>, which is not provided with
+ * pthreads-win32.
+ *
+ * PTHREAD_DESTRUCTOR_ITERATIONS
+ *                      Maximum number of attempts to destroy
+ *                      a thread's thread-specific data on
+ *                      termination (must be at least 4)
+ *
+ * PTHREAD_KEYS_MAX
+ *                      Maximum number of thread-specific data keys
+ *                      available per process (must be at least 128)
+ *
+ * PTHREAD_STACK_MIN
+ *                      Minimum supported stack size for a thread
+ *
+ * PTHREAD_THREADS_MAX
+ *                      Maximum number of threads supported per
+ *                      process (must be at least 64).
+ *
+ * SEM_NSEMS_MAX
+ *                      The maximum number of semaphores a process can have.
+ *                      (must be at least 256)
+ *
+ * SEM_VALUE_MAX
+ *                      The maximum value a semaphore can have.
+ *                      (must be at least 32767)
+ *
+ */
+#undef _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS     4 /* the _POSIX_* names hold the required minimum */
+
+#undef PTHREAD_DESTRUCTOR_ITERATIONS
+#define PTHREAD_DESTRUCTOR_ITERATIONS           _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+
+#undef _POSIX_THREAD_KEYS_MAX
+#define _POSIX_THREAD_KEYS_MAX                  128
+
+#undef PTHREAD_KEYS_MAX
+#define PTHREAD_KEYS_MAX                        _POSIX_THREAD_KEYS_MAX /* the limit is exactly the minimum */
+
+#undef PTHREAD_STACK_MIN
+#define PTHREAD_STACK_MIN                       0
+
+#undef _POSIX_THREAD_THREADS_MAX
+#define _POSIX_THREAD_THREADS_MAX               64
+
+  /* Arbitrary value */
+#undef PTHREAD_THREADS_MAX
+#define PTHREAD_THREADS_MAX                     2019
+
+#undef _POSIX_SEM_NSEMS_MAX
+#define _POSIX_SEM_NSEMS_MAX                    256
+
+  /* Arbitrary value */
+#undef SEM_NSEMS_MAX
+#define SEM_NSEMS_MAX                           1024
+
+#undef _POSIX_SEM_VALUE_MAX
+#define _POSIX_SEM_VALUE_MAX                    32767
+
+#undef SEM_VALUE_MAX
+#define SEM_VALUE_MAX                           INT_MAX /* INT_MAX comes from limits.h, included above */
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * The Open Watcom C/C++ compiler uses a non-standard calling convention
+ * that passes function args in registers unless __cdecl is explicitly specified
+ * in exposed function prototypes.
+ *
+ * We force all calls to cdecl even though this could slow Watcom code down
+ * slightly. If you know that the Watcom compiler will be used to build both
+ * the DLL and application, then you can probably define this as a null string.
+ * Remember that pthread.h (this file) is used for both the DLL and application builds.
+ */
+#define PTW32_CDECL __cdecl
+
+#if defined(_UWIN) && PTW32_LEVEL >= PTW32_LEVEL_MAX
+#   include     <sys/types.h>
+#else
+/*
+ * Generic handle type - intended to extend uniqueness beyond
+ * that available with a simple pointer. It should scale for either
+ * IA-32 or IA-64.
+ */
+typedef struct {
+    void * p;                   /* Pointer to actual object */
+    unsigned int x;             /* Extra information - reuse count etc */
+} ptw32_handle_t;
+
+typedef ptw32_handle_t pthread_t;
+typedef struct pthread_attr_t_ * pthread_attr_t;
+typedef struct pthread_once_t_ pthread_once_t;
+typedef struct pthread_key_t_ * pthread_key_t;
+typedef struct pthread_mutex_t_ * pthread_mutex_t;
+typedef struct pthread_mutexattr_t_ * pthread_mutexattr_t;
+typedef struct pthread_cond_t_ * pthread_cond_t;
+typedef struct pthread_condattr_t_ * pthread_condattr_t;
+#endif
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+typedef struct pthread_spinlock_t_ * pthread_spinlock_t;
+typedef struct pthread_barrier_t_ * pthread_barrier_t;
+typedef struct pthread_barrierattr_t_ * pthread_barrierattr_t;
+
+/*
+ * ====================
+ * ====================
+ * POSIX Threads
+ * ====================
+ * ====================
+ */
+
+enum {
+/*
+ * pthread_attr_{get,set}detachstate
+ */
+  PTHREAD_CREATE_JOINABLE       = 0,  /* Default */
+  PTHREAD_CREATE_DETACHED       = 1,
+
+/*
+ * pthread_attr_{get,set}inheritsched
+ */
+  PTHREAD_INHERIT_SCHED         = 0,
+  PTHREAD_EXPLICIT_SCHED        = 1,  /* Default */
+
+/*
+ * pthread_attr_{get,set}scope
+ */
+  PTHREAD_SCOPE_PROCESS         = 0,
+  PTHREAD_SCOPE_SYSTEM          = 1,  /* Default */
+
+/*
+ * pthread_setcancelstate parameters
+ */
+  PTHREAD_CANCEL_ENABLE         = 0,  /* Default */
+  PTHREAD_CANCEL_DISABLE        = 1,
+
+/*
+ * pthread_setcanceltype parameters
+ */
+  PTHREAD_CANCEL_ASYNCHRONOUS   = 0,
+  PTHREAD_CANCEL_DEFERRED       = 1,  /* Default */
+
+/*
+ * pthread_mutexattr_{get,set}pshared
+ * pthread_condattr_{get,set}pshared
+ */
+  PTHREAD_PROCESS_PRIVATE       = 0,
+  PTHREAD_PROCESS_SHARED        = 1,
+
+/*
+ * pthread_mutexattr_{get,set}robust
+ */
+  PTHREAD_MUTEX_STALLED         = 0,  /* Default */
+  PTHREAD_MUTEX_ROBUST          = 1,
+
+/*
+ * pthread_barrier_wait
+ */
+  PTHREAD_BARRIER_SERIAL_THREAD = -1
+};
+
+/*
+ * ====================
+ * ====================
+ * Cancelation
+ * ====================
+ * ====================
+ */
+#define PTHREAD_CANCELED       ((void *)(size_t) -1)
+
+
+/*
+ * ====================
+ * ====================
+ * Once Key
+ * ====================
+ * ====================
+ */
+#define PTHREAD_ONCE_INIT       { PTW32_FALSE, 0, 0, 0}
+
+struct pthread_once_t_
+{
+  int          done;        /* indicates if user function has been executed */
+  void *       lock;        /* 0 in PTHREAD_ONCE_INIT; use not visible in this header */
+  int          reserved1;   /* 0 in PTHREAD_ONCE_INIT; not referenced in this header */
+  int          reserved2;   /* 0 in PTHREAD_ONCE_INIT; not referenced in this header */
+};
+
+
+/*
+ * ====================
+ * ====================
+ * Object initialisers
+ * ====================
+ * ====================
+ */
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -1)
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -2)
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -3)
+
+/*
+ * Compatibility with LinuxThreads
+ */
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP PTHREAD_RECURSIVE_MUTEX_INITIALIZER
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP PTHREAD_ERRORCHECK_MUTEX_INITIALIZER
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t)(size_t) -1)
+
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+#define PTHREAD_SPINLOCK_INITIALIZER ((pthread_spinlock_t)(size_t) -1)
+
+
+/*
+ * Mutex types.
+ */
+enum
+{
+  /* Compatibility with LinuxThreads */
+  PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_TIMED_NP = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_ADAPTIVE_NP = PTHREAD_MUTEX_FAST_NP,
+  /* For compatibility with POSIX */
+  PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK = PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL
+};
+
+
+typedef struct ptw32_cleanup_t ptw32_cleanup_t;
+
+#if defined(_MSC_VER)
+/* Disable MSVC 'anachronism used' warning */
+#pragma warning( disable : 4229 )
+#endif
+
+typedef void (* PTW32_CDECL ptw32_cleanup_callback_t)(void *);
+
+#if defined(_MSC_VER)
+#pragma warning( default : 4229 )
+#endif
+
+struct ptw32_cleanup_t
+{
+  ptw32_cleanup_callback_t routine;  /* handler; the SEH cleanup macro calls it with 'arg' */
+  void *arg;                         /* argument handed to 'routine' */
+  struct ptw32_cleanup_t *prev;      /* presumably the previously pushed record - confirm in library source */
+};
+
+#if defined(__CLEANUP_SEH)
+        /*
+         * WIN32 SEH version of cancel cleanup.
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+        _cleanup.routine        = (ptw32_cleanup_callback_t)(_rout); \
+            _cleanup.arg        = (_arg); \
+            __try \
+              { \
+
+#define pthread_cleanup_pop( _execute ) \
+              } \
+            __finally \
+                { \
+                    if( _execute || AbnormalTermination()) \
+                      { \
+                          (*(_cleanup.routine))( _cleanup.arg ); \
+                      } \
+                } \
+        }
+
+#else /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_C)
+
+        /*
+         * C implementation of PThreads cancel cleanup
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+            ptw32_push_cleanup( &_cleanup, (ptw32_cleanup_callback_t) (_rout), (_arg) ); \
+
+#define pthread_cleanup_pop( _execute ) \
+            (void) ptw32_pop_cleanup( _execute ); \
+        }
+
+#else /* __CLEANUP_C */
+
+#if defined(__CLEANUP_CXX)
+
+        /*
+         * C++ version of cancel cleanup.
+         * - John E. Bossom.
+         */
+
+        class PThreadCleanup {
+          /*
+           * PThreadCleanup
+           *
+           * Purpose
+           *      This class is a C++ helper class that is
+           *      used to implement pthread_cleanup_push/
+           *      pthread_cleanup_pop.
+           *      The destructor of this class automatically
+           *      pops the pushed cleanup routine regardless
+           *      of how the code exits the scope
+           *      (i.e. such as by an exception)
+           */
+      ptw32_cleanup_callback_t cleanUpRout;
+          void    *       obj;
+          int             executeIt;
+
+        public:
+          PThreadCleanup() :
+            cleanUpRout( 0 ),
+            obj( 0 ),
+            executeIt( 0 )
+            /*
+             * No cleanup performed
+             */
+            {
+            }
+
+          PThreadCleanup(
+             ptw32_cleanup_callback_t routine,
+                         void    *       arg ) :
+            cleanUpRout( routine ),
+            obj( arg ),
+            executeIt( 1 )
+            /*
+             * Registers a cleanup routine for 'arg'
+             */
+            {
+            }
+
+          ~PThreadCleanup()
+            {
+              if ( executeIt && ((void *) cleanUpRout != (void *) 0) )
+                {
+                  (void) (*cleanUpRout)( obj );
+                }
+            }
+
+          void execute( int exec )
+            {
+              executeIt = exec; /* non-zero: destructor runs the routine; zero: it is skipped */
+            }
+        };
+
+        /*
+         * C++ implementation of PThreads cancel cleanup;
+         * This implementation takes advantage of a helper
+         * class whose destructor automatically calls the
+         * cleanup routine if we exit our scope weirdly
+         */
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            PThreadCleanup  cleanup((ptw32_cleanup_callback_t)(_rout), \
+                                    (void *) (_arg) );
+
+#define pthread_cleanup_pop( _execute ) \
+            cleanup.execute( _execute ); \
+        }
+
+#else
+
+#error ERROR [__FILE__, line __LINE__]: Cleanup type undefined.
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* __CLEANUP_C */
+
+#endif /* __CLEANUP_SEH */
+
+/*
+ * ===============
+ * ===============
+ * Methods
+ * ===============
+ * ===============
+ */
+
+/*
+ * PThread Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_init (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_destroy (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getdetachstate (const pthread_attr_t * attr,
+                                         int *detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstackaddr (const pthread_attr_t * attr,
+                                       void **stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstacksize (const pthread_attr_t * attr,
+                                       size_t * stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setdetachstate (pthread_attr_t * attr,
+                                         int detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstackaddr (pthread_attr_t * attr,
+                                       void *stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstacksize (pthread_attr_t * attr,
+                                       size_t stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedparam (const pthread_attr_t *attr,
+                                        struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedparam (pthread_attr_t *attr,
+                                        const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedpolicy (pthread_attr_t *,
+                                         int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedpolicy (const pthread_attr_t *,
+                                         int *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setinheritsched(pthread_attr_t * attr,
+                                         int inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getinheritsched(const pthread_attr_t * attr,
+                                         int * inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setscope (pthread_attr_t *,
+                                   int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getscope (const pthread_attr_t *,
+                                   int *);
+
+/*
+ * PThread Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_create (pthread_t * tid,
+                            const pthread_attr_t * attr,
+                            void *(PTW32_CDECL *start) (void *),
+                            void *arg);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_detach (pthread_t tid);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_equal (pthread_t t1,
+                           pthread_t t2);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_exit (void *value_ptr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_join (pthread_t thread,
+                          void **value_ptr);
+
+PTW32_DLLPORT pthread_t PTW32_CDECL pthread_self (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cancel (pthread_t thread);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcancelstate (int state,
+                                    int *oldstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcanceltype (int type,
+                                   int *oldtype);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_testcancel (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_once (pthread_once_t * once_control,
+                          void (PTW32_CDECL *init_routine) (void));
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+PTW32_DLLPORT ptw32_cleanup_t * PTW32_CDECL ptw32_pop_cleanup (int execute);
+
+PTW32_DLLPORT void PTW32_CDECL ptw32_push_cleanup (ptw32_cleanup_t * cleanup,
+                                 ptw32_cleanup_callback_t routine,
+                                 void *arg);
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread Specific Data Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_key_create (pthread_key_t * key,
+                                void (PTW32_CDECL *destructor) (void *));
+
+PTW32_DLLPORT int PTW32_CDECL pthread_key_delete (pthread_key_t key);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setspecific (pthread_key_t key,
+                                 const void *value);
+
+PTW32_DLLPORT void * PTW32_CDECL pthread_getspecific (pthread_key_t key);
+
+
+/*
+ * Mutex Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_init (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_destroy (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getpshared (const pthread_mutexattr_t
+                                          * attr,
+                                          int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setpshared (pthread_mutexattr_t * attr,
+                                          int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_settype (pthread_mutexattr_t * attr, int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_gettype (const pthread_mutexattr_t * attr, int *kind);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setrobust(
+                                           pthread_mutexattr_t *attr,
+                                           int robust);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getrobust(
+                                           const pthread_mutexattr_t * attr,
+                                           int * robust);
+
+/*
+ * Barrier Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_init (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_destroy (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_getpshared (const pthread_barrierattr_t
+                                            * attr,
+                                            int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_setpshared (pthread_barrierattr_t * attr,
+                                            int pshared);
+
+/*
+ * Mutex Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_init (pthread_mutex_t * mutex,
+                                const pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_destroy (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_lock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_timedlock(pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_trylock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_unlock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_consistent (pthread_mutex_t * mutex);
+
+/*
+ * Spinlock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_init (pthread_spinlock_t * lock, int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_destroy (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_lock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_trylock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_unlock (pthread_spinlock_t * lock);
+
+/*
+ * Barrier Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_init (pthread_barrier_t * barrier,
+                                  const pthread_barrierattr_t * attr,
+                                  unsigned int count);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_destroy (pthread_barrier_t * barrier);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_wait (pthread_barrier_t * barrier);
+
+/*
+ * Condition Variable Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_init (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_destroy (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_getpshared (const pthread_condattr_t * attr,
+                                         int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_setpshared (pthread_condattr_t * attr,
+                                         int pshared);
+
+/*
+ * Condition Variable Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_init (pthread_cond_t * cond,
+                               const pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_destroy (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_wait (pthread_cond_t * cond,
+                               pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_timedwait (pthread_cond_t * cond,
+                                    pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_signal (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_broadcast (pthread_cond_t * cond);
+
+/*
+ * Scheduling
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_setschedparam (pthread_t thread,
+                                   int policy,
+                                   const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_getschedparam (pthread_t thread,
+                                   int *policy,
+                                   struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setconcurrency (int);
+ 
+PTW32_DLLPORT int PTW32_CDECL pthread_getconcurrency (void);
+
+/*
+ * Read-Write Lock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_init(pthread_rwlock_t *lock,
+                                const pthread_rwlockattr_t *attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_destroy(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_trywrlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_rdlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedrdlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_wrlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedwrlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_unlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_init (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_destroy (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_getpshared (const pthread_rwlockattr_t * attr,
+                                           int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_setpshared (pthread_rwlockattr_t * attr,
+                                           int pshared);
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX - 1
+
+/*
+ * Signal Functions. Should be defined in <signal.h> but MSVC and MinGW32
+ * already have a signal.h that doesn't define these.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_kill(pthread_t thread, int sig);
+
+/*
+ * Non-portable functions
+ */
+
+/*
+ * Compatibility with Linux.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setkind_np(pthread_mutexattr_t * attr,
+                                         int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getkind_np(pthread_mutexattr_t * attr,
+                                         int *kind);
+
+/*
+ * Possibly supported by other POSIX threads implementations
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_delay_np (struct timespec * interval);
+PTW32_DLLPORT int PTW32_CDECL pthread_num_processors_np(void);
+PTW32_DLLPORT unsigned __int64 PTW32_CDECL pthread_getunique_np(pthread_t thread);
+
+/*
+ * Useful if an application wants to statically link
+ * the lib rather than load the DLL at run-time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_detach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_detach_np(void);
+
+/*
+ * Features that are auto-detected at load/run time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_test_features_np(int);
+enum ptw32_features {
+  PTW32_SYSTEM_INTERLOCKED_COMPARE_EXCHANGE = 0x0001, /* System provides it. */
+  PTW32_ALERTABLE_ASYNC_CANCEL              = 0x0002  /* Can cancel blocked threads. */
+};
+
+/*
+ * Register a system time change with the library.
+ * Causes the library to perform various functions
+ * in response to the change. Should be called whenever
+ * the application's top level window receives a
+ * WM_TIMECHANGE message. It can be passed directly to
+ * pthread_create() as a new thread if desired.
+ */
+PTW32_DLLPORT void * PTW32_CDECL pthread_timechange_handler_np(void *);
+
+#endif /*PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/*
+ * Returns the Win32 HANDLE for the POSIX thread.
+ */
+PTW32_DLLPORT HANDLE PTW32_CDECL pthread_getw32threadhandle_np(pthread_t thread);
+/*
+ * Returns the win32 thread ID for POSIX thread.
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL pthread_getw32threadid_np (pthread_t thread);
+
+
+/*
+ * Protected Methods
+ *
+ * This function blocks until the given WIN32 handle
+ * is signaled or pthread_cancel has been called.
+ * This function allows the caller to hook into the
+ * PThreads cancel mechanism. It is implemented using
+ *
+ *              WaitForMultipleObjects
+ *
+ * on 'waitHandle' and a manually reset WIN32 Event
+ * used to implement pthread_cancel. The 'timeout'
+ * argument to TimedWait is simply passed to
+ * WaitForMultipleObjects.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableWait (HANDLE waitHandle);
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableTimedWait (HANDLE waitHandle,
+                                        DWORD timeout);
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread-Safe C Runtime Library Mappings.
+ */
+#if !defined(_UWIN)
+#  if defined(NEED_ERRNO)
+     PTW32_DLLPORT int * PTW32_CDECL _errno( void );
+#  else
+#    if !defined(errno)
+#      if (defined(_MT) || defined(_DLL))
+         __declspec(dllimport) extern int * __cdecl _errno(void);
+#        define errno   (*_errno())
+#      endif
+#    endif
+#  endif
+#endif
+
+/*
+ * Some compiler environments don't define some things.
+ */
+#if defined(__BORLANDC__)
+#  define _ftime ftime
+#  define _timeb timeb
+#endif
+
+#if defined(__cplusplus)
+
+/*
+ * Internal exceptions
+ */
+class ptw32_exception {};
+class ptw32_exception_cancel : public ptw32_exception {};
+class ptw32_exception_exit   : public ptw32_exception {};
+
+#endif
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/* FIXME: This is only required if the library was built using SEH */
+/*
+ * Get internal SEH tag
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL ptw32_get_exception_services_code(void);
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+#if !defined(PTW32_BUILD)
+
+#if defined(__CLEANUP_SEH)
+
+/*
+ * Redefine the SEH __except keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#define __except( E ) \
+        __except( ( GetExceptionCode() == ptw32_get_exception_services_code() ) \
+                 ? EXCEPTION_CONTINUE_SEARCH : ( E ) )
+
+#endif /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_CXX)
+
+/*
+ * Redefine the C++ catch keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#if defined(_MSC_VER)
+        /*
+         * WARNING: Replace any 'catch( ... )' with 'PtW32CatchAll'
+         * if you want Pthread-Win32 cancelation and pthread_exit to work.
+         */
+
+#if !defined(PtW32NoCatchWarn)
+
+#pragma message("Specify \"/DPtW32NoCatchWarn\" compiler flag to skip this message.")
+#pragma message("------------------------------------------------------------------")
+#pragma message("When compiling applications with MSVC++ and C++ exception handling:")
+#pragma message("  Replace any 'catch( ... )' in routines called from POSIX threads")
+#pragma message("  with 'PtW32CatchAll' or 'CATCHALL' if you want POSIX thread")
+#pragma message("  cancelation and pthread_exit to work. For example:")
+#pragma message("")
+#pragma message("    #if defined(PtW32CatchAll)")
+#pragma message("      PtW32CatchAll")
+#pragma message("    #else")
+#pragma message("      catch(...)")
+#pragma message("    #endif")
+#pragma message("        {")
+#pragma message("          /* Catchall block processing */")
+#pragma message("        }")
+#pragma message("------------------------------------------------------------------")
+
+#endif
+
+#define PtW32CatchAll \
+        catch( ptw32_exception & ) { throw; } \
+        catch( ... )
+
+#else /* _MSC_VER */
+
+#define catch( E ) \
+        catch( ptw32_exception & ) { throw; } \
+        catch( E )
+
+#endif /* _MSC_VER */
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* ! PTW32_BUILD */
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#if defined(PTW32__HANDLE_DEF)
+# undef HANDLE
+#endif
+#if defined(PTW32__DWORD_DEF)
+# undef DWORD
+#endif
+
+#undef PTW32_LEVEL
+#undef PTW32_LEVEL_MAX
+
+#endif /* ! RC_INVOKED */
+
+#endif /* PTHREAD_H */
diff --git a/darknet-master/3rdparty/pthreads/include/sched.h b/darknet-master/3rdparty/pthreads/include/sched.h
new file mode 100644
index 0000000..f36a97a
--- /dev/null
+++ b/darknet-master/3rdparty/pthreads/include/sched.h
@@ -0,0 +1,183 @@
+/*
+ * Module: sched.h
+ *
+ * Purpose:
+ *      Provides an implementation of POSIX realtime extensions
+ *      as defined in 
+ *
+ *              POSIX 1003.1b-1993      (POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj@callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined(_SCHED_H)
+#define _SCHED_H
+
+#undef PTW32_SCHED_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_SCHED_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SCHED_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_SCHED_LEVEL)
+#define PTW32_SCHED_LEVEL PTW32_SCHED_LEVEL_MAX
+/* Include everything */
+#endif
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ * errno and ENOTSUP are used by the sched_rr_get_interval macro below.
+ */
+
+#if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX */
+
+#if (defined(__MINGW64__) || defined(__MINGW32__)) || defined(_UWIN)
+# if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+/* For pid_t */
+#  include <sys/types.h>
+/* Required by Unix 98 */
+#  include <time.h>
+# else
+   typedef int pid_t;
+# endif
+#else
+ typedef int pid_t;
+#endif
+
+/* Thread scheduling policies */
+
+enum {
+  SCHED_OTHER = 0,
+  SCHED_FIFO,
+  SCHED_RR,
+  SCHED_MIN   = SCHED_OTHER,  /* alias for the lowest policy value in this enum */
+  SCHED_MAX   = SCHED_RR      /* alias for the highest policy value in this enum */
+};
+
+struct sched_param {
+  int sched_priority;
+};
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+PTW32_DLLPORT int __cdecl sched_yield (void);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_min (int policy);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_max (int policy);
+
+PTW32_DLLPORT int __cdecl sched_setscheduler (pid_t pid, int policy);
+
+PTW32_DLLPORT int __cdecl sched_getscheduler (pid_t pid);
+
+/*
+ * Note that this macro returns ENOTSUP rather than
+ * ENOSYS as might be expected. However, returning ENOSYS
+ * should mean that sched_get_priority_{min,max} are
+ * not implemented as well as sched_rr_get_interval.
+ * This is not the case, since we just don't support
+ * round-robin scheduling. Therefore I have chosen to
+ * return the same value as sched_setscheduler when
+ * SCHED_RR is passed to it.
+ */
+#define sched_rr_get_interval(_pid, _interval) \
+  ( errno = ENOTSUP, (int) -1 )
+
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#undef PTW32_SCHED_LEVEL
+#undef PTW32_SCHED_LEVEL_MAX
+
+#endif                          /* !_SCHED_H */
+
diff --git a/darknet-master/3rdparty/pthreads/include/semaphore.h b/darknet-master/3rdparty/pthreads/include/semaphore.h
new file mode 100644
index 0000000..c6e9407
--- /dev/null
+++ b/darknet-master/3rdparty/pthreads/include/semaphore.h
@@ -0,0 +1,169 @@
+/*
+ * Module: semaphore.h
+ *
+ * Purpose:
+ *	Semaphores aren't actually part of the PThreads standard.
+ *	They are defined by the POSIX Standard:
+ *
+ *		POSIX 1003.1b-1993	(POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj@callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined( SEMAPHORE_H )
+#define SEMAPHORE_H
+
+#undef PTW32_SEMAPHORE_LEVEL
+/* Select a feature level; each later test below overrides the earlier ones. */
+#if defined(_POSIX_SOURCE)
+#define PTW32_SEMAPHORE_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SEMAPHORE_LEVEL_MAX 3
+/* No feature-test macro matched above: default to the maximum level. */
+#if !defined(PTW32_SEMAPHORE_LEVEL)
+#define PTW32_SEMAPHORE_LEVEL PTW32_SEMAPHORE_LEVEL_MAX
+/* Include everything */
+#endif
+
+#if defined(__GNUC__) && ! defined (__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)   /* building the library: export */
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)   /* using the library: import */
+#  endif /* PTW32_BUILD */
+#else
+#  define PTW32_DLLPORT   /* PTW32_STATIC_LIB: expands to nothing */
+#endif /* !PTW32_STATIC_LIB */
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO   /* selects "need_errno.h" instead of <errno.h> further down */
+#    define NEED_SEM
+#  endif /* WINCE */
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T   /* suppresses the local mode_t typedef further down */
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif /* !PTW32_CONFIG_H */
+
+/*
+ * errno is only pulled in when everything is included (maximum level).
+ */
+
+#if PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX */
+
+#define _POSIX_SEMAPHORES
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif				/* __cplusplus */
+
+#if !defined(HAVE_MODE_T)
+typedef unsigned int mode_t;   /* fallback definition used when HAVE_MODE_T is not set */
+#endif /* !HAVE_MODE_T */
+
+
+typedef struct sem_t_ * sem_t;   /* sem_t is a pointer to struct sem_t_, which this header does not define */
+
+PTW32_DLLPORT int __cdecl sem_init (sem_t * sem,
+			    int pshared,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_destroy (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_trywait (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_wait (sem_t * sem);
+/* struct timespec is not declared by this header itself -- assumes the includer provides it (TODO confirm). */
+PTW32_DLLPORT int __cdecl sem_timedwait (sem_t * sem,
+				 const struct timespec * abstime);
+
+PTW32_DLLPORT int __cdecl sem_post (sem_t * sem);
+/* sem_post_multiple is not part of POSIX 1003.1b -- presumably a library extension (confirm). */
+PTW32_DLLPORT int __cdecl sem_post_multiple (sem_t * sem,
+				     int count);
+/* NOTE(review): POSIX declares sem_open as returning sem_t *; here it returns int -- confirm against the implementation. */
+PTW32_DLLPORT int __cdecl sem_open (const char * name,
+			    int oflag,
+			    mode_t mode,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_close (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_unlink (const char * name);
+
+PTW32_DLLPORT int __cdecl sem_getvalue (sem_t * sem,
+				int * sval);
+
+#if defined(__cplusplus)
+}				/* End of extern "C" */
+#endif				/* __cplusplus */
+
+#undef PTW32_SEMAPHORE_LEVEL
+#undef PTW32_SEMAPHORE_LEVEL_MAX
+
+#endif				/* !SEMAPHORE_H */
diff --git a/darknet-master/3rdparty/stb/include/stb_image.h b/darknet-master/3rdparty/stb/include/stb_image.h
new file mode 100644
index 0000000..5e807a0
--- /dev/null
+++ b/darknet-master/3rdparty/stb/include/stb_image.h
@@ -0,0 +1,7987 @@
+/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
+
+   QUICK NOTES:
+      Primarily of interest to game developers and other people who can
+          avoid problematic images and only need the trivial interface
+
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8/16-bit-per-channel
+
+      TGA (not sure what subset, if a subset)
+      BMP non-1bpp, non-RLE
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
+
+      GIF (*comp always reports as 4-channel)
+      HDR (radiance rgbE format)
+      PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
+
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
+      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
+      - decode from arbitrary I/O callbacks
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
+      2.26  (2020-07-13) many minor fixes
+      2.25  (2020-02-02) fix warnings
+      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
+      2.23  (2019-08-11) fix clang static analysis warning
+      2.22  (2019-03-04) gif fixes, fix warnings
+      2.21  (2019-02-25) fix typo in comment
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+
+   See end of file for full revision history.
+
+
+ ============================    Contributors    =========================
+
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
+    John-Mark Allen
+    Carmelo J Fdez-Aguera
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
+    Phil Jordan                                Dave Moore           Roy Eltham
+    Hayaki Saito            Nathan Reed        Won Chun
+    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
+    Thomas Ruf              Ronny Chevalier                         github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
+    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
+    Cass Everitt            Ryamond Barbiero                        github:grim210
+    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
+    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
+    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
+    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
+    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
+                            Brad Weinberger    Matvey Cherevko      github:mosra
+    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
+    Ryan C. Gordon          [reserved]                              [reserved]
+                     DO NOT ADD YOUR NAME HERE
+
+                     Jacko Dirks
+
+  To add your name to the credits, pick a random blank space in the middle and fill it.
+  80% of merge conflicts on stb PRs are due to people adding their name at the end
+  of the credits.
+*/
+
+#ifndef STBI_INCLUDE_STB_IMAGE_H
+#define STBI_INCLUDE_STB_IMAGE_H
+
+// DOCUMENTATION
+//
+// Limitations:
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
+//    - GIF always returns *comp=4
+//
+// Basic usage (see HDR discussion below for HDR usage):
+//    int x,y,n;
+//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
+//    // ... process data if not NULL ...
+//    // ... x = width, y = height, n = # 8-bit components per pixel ...
+//    // ... replace '0' with '1'..'4' to force that many components per pixel
+//    // ... but 'n' will always be the number that it would have been if you said 0
+//    stbi_image_free(data);
+//
+// Standard parameters:
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
+//
+// The return value from an image loader is an 'unsigned char *' which points
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
+// with each pixel consisting of N interleaved 8-bit components; the first
+// pixel pointed to is top-left-most in the image. There is no padding between
+// image scanlines or between pixels, regardless of format. The number of
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
+//
+// An output image with N components has the following components interleaved
+// in this order in each pixel:
+//
+//     N=#comp     components
+//       1           grey
+//       2           grey, alpha
+//       3           red, green, blue
+//       4           red, green, blue, alpha
+//
+// If image loading fails for any reason, the return value will be NULL,
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// more user-friendly ones.
+//
+// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
+//
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
+// ===========================================================================
+//
+// UNICODE:
+//
+//   If compiling for Windows and you wish to use Unicode filenames, compile
+//   with
+//       #define STBI_WINDOWS_UTF8
+//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
+//   Windows wchar_t filenames to utf8.
+//
+// ===========================================================================
+//
+// Philosophy
+//
+// stb libraries are designed with the following priorities:
+//
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy-to-use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// provide more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small source code footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 will automatically be used when available based on a run-time
+// test; if not, the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is to have separate builds for NEON and non-NEON devices
+// (at least this is true for iOS and Android). Therefore, the NEON support is
+// toggled by a build flag: define STBI_NEON to get NEON loops.
+//
+// If for some reason you do not want to use any of the SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
+//
+// ===========================================================================
+//
+// HDR image support   (disable by defining STBI_NO_HDR)
+//
+// stb_image supports loading HDR images in general, and currently the Radiance
+// .HDR file format specifically. You can still load any file through the existing
+// interface; if you attempt to load an HDR file, it will be automatically remapped
+// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// both of these constants can be reconfigured through this interface:
+//
+//     stbi_hdr_to_ldr_gamma(2.2f);
+//     stbi_hdr_to_ldr_scale(1.0f);
+//
+// (note, do not use _inverse_ constants; stb_image will invert them
+// appropriately).
+//
+// Additionally, there is a new, parallel interface for loading files as
+// (linear) floats to preserve the full dynamic range:
+//
+//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
+//
+// If you load LDR images through this interface, those images will
+// be promoted to floating point values, run through the inverse of
+// constants corresponding to the above:
+//
+//     stbi_ldr_to_hdr_scale(1.0f);
+//     stbi_ldr_to_hdr_gamma(2.2f);
+//
+// Finally, given a filename (or an open file or memory block--see header
+// file for details) containing image data, you can query for the "most
+// appropriate" interface to use (that is, whether the image is HDR or
+// not), using:
+//
+//     stbi_is_hdr(char const *filename);
+//
+// ===========================================================================
+//
+// iPhone PNG support:
+//
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
+//
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
+//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
+//    than that size (in either width or height) without further processing.
+//    This is to let programs in the wild set an upper bound to prevent
+//    denial-of-service attacks on untrusted data, as one could generate a
+//    valid image of gigantic dimensions and force stb_image to allocate a
+//    huge block of memory and spend disproportionate time decoding it. By
+//    default this is set to (1 << 24), which is 16777216, but that's still
+//    very big.
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels: 0 keeps the file's own component count
+
+   STBI_grey       = 1, // 1 component:  grey
+   STBI_grey_alpha = 2, // 2 components: grey, alpha
+   STBI_rgb        = 3, // 3 components: red, green, blue
+   STBI_rgb_alpha  = 4  // 4 components: red, green, blue, alpha
+};
+
+#include <stdlib.h>
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef STBIDEF
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static   // STB_IMAGE_STATIC: API declarations get internal linkage
+#else
+#define STBIDEF extern   // default: ordinary external declarations
+#endif // STB_IMAGE_STATIC
+#endif // STBIDEF
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+typedef struct   // callback table taken by the *_from_callbacks functions; each callback gets a 'user' pointer (presumably the one passed to those functions)
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);   // note the suffix order: _from_file_16, unlike stbi_load_16_from_memory/_from_callbacks
+#endif // STBI_NO_STDIO
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+#endif // STBI_NO_LINEAR
+
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+#endif // STBI_NO_STDIO
+
+
+// get a VERY brief reason for failure
+// on most compilers (and ALL modern mainstream compilers) this is threadsafe
+STBIDEF const char *stbi_failure_reason  (void);
+
+// free the loaded image -- this is just free()
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+
+// get image dimensions & components without fully decoding
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
+#endif
+
+
+
+// for image formats that explicitly notate that they have premultiplied alpha,
+// we just return the colors as stored in the file. set this flag to force
+// unpremultiplication. results are undefined if the unpremultiply overflows.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+
+// indicate whether we should process iphone images back to canonical format,
+// or just pass them through "as-is"
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
+
+// flip the image vertically, so the first pixel in the output array is the bottom left
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
+
+// as above, but only applies to images loaded on the thread that calls the function
+// this function is only available if your compiler supports thread-local variables;
+// calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
+
+// ZLIB client - used by PNG, available for other purposes
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifdef STB_IMAGE_IMPLEMENTATION
+
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)   // any STBI_ONLY_* switch: disable each decoder whose own STBI_ONLY_* is absent
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif // any STBI_ONLY_* defined
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB   // zlib is dropped together with PNG unless STBI_SUPPORT_ZLIB is defined
+#endif // NOTE(review): STBI_ONLY_ZLIB alone defines STBI_NO_PNG above, so it also lands here and disables zlib -- confirm intended
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp, pow
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)   // default; <assert.h> is only included when the user supplied no STBI_ASSERT
+#endif // STBI_ASSERT
+
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"   // C++ build: request C linkage
+#else
+#define STBI_EXTERN extern
+#endif // __cplusplus
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline   // C build without _MSC_VER: expands to nothing
+   #endif
+#else
+   #define stbi_inline __forceinline   // _MSC_VER defined, for both C and C++
+#endif // _MSC_VER
+
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) &&  __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL       thread_local   // C++11 keyword
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL       __thread   // chosen whenever __GNUC__ is below 5
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL       _Thread_local   // C11 keyword
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL       __thread   // fallback for any other compiler defining __GNUC__
+      #endif
+   #endif // if nothing matched, STBI_THREAD_LOCAL stays undefined
+#endif // STBI_NO_THREAD_LOCALS
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
+typedef unsigned short stbi__uint16;   // this path does not include <stdint.h>; built-in types are used instead
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif // _MSC_VER || __SYMBIAN32__
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+// STBI_NOTUSED(v): reference `v` so it does not count as unused. The
+// non-MSVC form wraps it in sizeof, so `v` is not evaluated.
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+// MSVC provides the _lrotl intrinsic.
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+// stbi_lrot(x,y): rotate x left by y bits. The fallback masks the
+// right-shift count with 31, i.e. it is written for 32-bit operands.
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
+#endif
+
+// Allocator hooks. The user must override either all of them or none:
+// STBI_MALLOC, STBI_FREE, and one of STBI_REALLOC / STBI_REALLOC_SIZED.
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+// Defaults: the C runtime allocator.
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+// Default sized realloc ignores the old size and forwards to STBI_REALLOC.
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+// Defines exactly one of STBI__X64_TARGET / STBI__X86_TARGET on those
+// architectures, and neither elsewhere.
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+// Case 1: 32-bit x86 GCC-style compiler without -msse2 => no SIMD.
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD
+#endif
+
+// Case 2: 32-bit MinGW without the explicit STBI_MINGW_ENABLE_SSE2 opt-in => no SIMD.
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+// SSE2 setup for x86/x64 builds where SIMD has not been disabled: defines
+// STBI_SSE2, the 16-byte alignment macro STBI_SIMD_ALIGN, and (when JPEG is
+// compiled in) stbi__sse2_available().
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+// stbi__cpuid3: returns the EDX register of CPUID leaf 1.
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+// MSVC: runtime check; returns non-zero when bit 26 of stbi__cpuid3() is set.
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#endif
+
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+// GCC-style compilers: no runtime check, always reports SSE2 as available.
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
+}
+#endif
+
+#endif
+#endif
+
+// ARM NEON
+// STBI_NO_SIMD wins over a user-supplied STBI_NEON.
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+// With NEON enabled, pull in the intrinsics and define the 16-byte alignment macro.
+#ifdef STBI_NEON
+#include <arm_neon.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
+// Without any SIMD path, STBI_SIMD_ALIGN is a plain declaration.
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+// User-overridable limit; defaults to 1 << 24.
+// NOTE(review): presumably the maximum accepted image width/height - the
+// checks that use it are outside this section.
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   // basic image information; presumably width/height and input/output
+   // channel counts, filled in by the format loaders (not visible here)
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   // I/O callbacks (copied in by stbi__start_callbacks) and the opaque
+   // pointer handed to each of them; io.read is NULL for memory contexts
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   // non-zero while data is still being pulled through io.read; cleared by
+   // stbi__start_mem and by stbi__refill_buffer once the source is exhausted
+   int read_from_callbacks;
+   // size in bytes of buffer_start, the staging buffer for callback reads
+   int buflen;
+   stbi_uc buffer_start[128];
+   // running total of bytes consumed from previous buffer fills
+   int callback_already_read;
+
+   // current read cursor and one-past-the-end of the valid data
+   stbi_uc *img_buffer, *img_buffer_end;
+   // initial cursor/end, restored by stbi__rewind
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// Prepare `s` to decode straight out of a caller-owned memory block of
+// `len` bytes starting at `buffer`; no I/O callbacks are involved.
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   stbi_uc *first = (stbi_uc *) buffer;
+   stbi_uc *limit = first + len;
+
+   // a NULL read callback is what the readers test to detect a memory context
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+
+   s->img_buffer_original     = first;
+   s->img_buffer              = first;
+   s->img_buffer_original_end = limit;
+   s->img_buffer_end          = limit;
+}
+
+// Prepare `s` to pull its data through the callbacks in `c`, with `user`
+// passed to every callback. The internal buffer is filled once right away.
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io_user_data = user;
+   s->io = *c;
+
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->buflen = sizeof(s->buffer_start);
+
+   s->img_buffer_original = s->buffer_start;
+   s->img_buffer = s->img_buffer_original;
+
+   // the first fill determines how much data a later rewind can restore
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+// Read callback for FILE* sources: reads up to `size` bytes into `data`
+// and returns the number of bytes fread() delivered.
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   FILE *fp = (FILE *) user;
+   size_t got = fread(data, 1, size, fp);
+   return (int) got;
+}
+
+// Skip callback for FILE* sources: moves the stream `n` bytes relative to
+// the current position.
+static void stbi__stdio_skip(void *user, int n)
+{
+   FILE *fp = (FILE *) user;
+   int probe;
+
+   fseek(fp, n, SEEK_CUR);
+
+   // read one byte so feof() reflects the new position, then hand the byte
+   // back to the stream unless we really are at the end
+   probe = fgetc(fp);
+   if (probe == EOF)
+      return;
+   ungetc(probe, fp);
+}
+
+// EOF callback for FILE* sources: returns 1 when the stream is at
+// end-of-file or has its error indicator set, 0 otherwise.
+static int stbi__stdio_eof(void *user)
+{
+   FILE *fp = (FILE *) user;
+   if (feof(fp))
+      return 1;
+   return ferror(fp) != 0;
+}
+
+// Callback table that adapts a FILE* to the stbi_io_callbacks interface;
+// entries are given in the order read, skip, eof.
+static stbi_io_callbacks stbi__stdio_callbacks =
+{
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
+};
+
+// Prepare `s` to read from the stdio stream `f` via the stdio callback table.
+static void stbi__start_file(stbi__context *s, FILE *f)
+{
+   void *stream = (void *) f;
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, stream);
+}
+
+//static void stop_file(stbi__context *s) { }
+
+#endif // !STBI_NO_STDIO
+
+// Reset the read cursor to where it was right after the context was started.
+static void stbi__rewind(stbi__context *s)
+{
+   // This is not a true rewind of the underlying stream: it only restores
+   // the initial buffer window. That is enough because it is only used after
+   // a format 'test', and those never look at more than 92 bytes.
+   s->img_buffer_end = s->img_buffer_original_end;
+   s->img_buffer     = s->img_buffer_original;
+}
+
+// Channel orderings that can be recorded in stbi__result_info.channel_order.
+enum
+{
+   STBI_ORDER_RGB,
+   STBI_ORDER_BGR
+};
+
+// Per-load metadata filled in alongside the decoded pixels.
+typedef struct
+{
+   int bits_per_channel;   // 8 or 16 (asserted by the post-processing code)
+   int num_channels;       // reset to 0 by stbi__load_main before dispatch
+   int channel_order;      // one of STBI_ORDER_RGB / STBI_ORDER_BGR
+} stbi__result_info;
+
+#ifndef STBI_NO_JPEG
+static int      stbi__jpeg_test(stbi__context *s);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
+static int      stbi__png_test(stbi__context *s);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
+#endif
+
+#ifndef STBI_NO_BMP
+static int      stbi__bmp_test(stbi__context *s);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
+static int      stbi__tga_test(stbi__context *s);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
+static int      stbi__psd_test(stbi__context *s);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
+#endif
+
+#ifndef STBI_NO_HDR
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PIC
+static int      stbi__pic_test(stbi__context *s);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
+static int      stbi__gif_test(stbi__context *s);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
+#endif
+
+// Most recent failure message, as set by stbi__err and returned by
+// stbi_failure_reason(). Thread-local when STBI_THREAD_LOCAL is available,
+// otherwise a plain file-scope static.
+static
+#ifdef STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
+#endif
+const char *stbi__g_failure_reason;
+
+// Public accessor for the most recently recorded failure message.
+STBIDEF const char *stbi_failure_reason(void)
+{
+   const char *reason = stbi__g_failure_reason;
+   return reason;
+}
+
+#ifndef STBI_NO_FAILURE_STRINGS
+// Record `str` as the current failure reason; always returns 0 so callers
+// can write `return stbi__err(...)` to report failure.
+static int stbi__err(const char *str)
+{
+   const int failed = 0;
+   stbi__g_failure_reason = str;
+   return failed;
+}
+#endif
+
+// Single allocation entry point: forwards to the configurable STBI_MALLOC.
+static void *stbi__malloc(size_t size)
+{
+   void *block = STBI_MALLOC(size);
+   return block;
+}
+
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// Returns 1 when a + b does not exceed INT_MAX, 0 otherwise.
+// A negative b is rejected outright.
+static int stbi__addsizes_valid(int a, int b)
+{
+   int headroom;
+
+   if (b < 0)
+      return 0;
+
+   // With 0 <= b <= INT_MAX the subtraction cannot overflow, and
+   // "a <= INT_MAX - b" is the overflow-free form of "a + b <= INT_MAX".
+   headroom = INT_MAX - b;
+   return a <= headroom;
+}
+
+// Returns 1 when a * b fits in an int, 0 on overflow.
+// Any negative factor makes the product invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a >= 0 && b >= 0) {
+      // multiplying by zero can never overflow (and must not divide by zero)
+      if (b == 0)
+         return 1;
+      // overflow-free form of "a * b <= INT_MAX"
+      return a <= INT_MAX / b;
+   }
+   return 0;
+}
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// Returns 1 when "a*b + add" can be computed in int without overflow and
+// passes the sign checks of the helpers it is built from.
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   return stbi__addsizes_valid(a*b, add);
+}
+#endif
+
+// Returns 1 when "a*b*c + add" can be computed in int without overflow and
+// passes the sign checks of the helpers it is built from.
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   // each partial product is validated before it is actually formed
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b, c))
+      return 0;
+   return stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   // validate every partial product before forming it, then the final sum
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b, c))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b*c, d))
+      return 0;
+   return stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// Allocate a*b + add bytes; returns NULL instead of allocating when the
+// size computation would not be valid.
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (stbi__mad2sizes_valid(a, b, add))
+      return stbi__malloc(a*b + add);
+   return NULL;
+}
+#endif
+
+// Allocate a*b*c + add bytes; returns NULL instead of allocating when the
+// size computation would not be valid.
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   if (stbi__mad3sizes_valid(a, b, c, add))
+      return stbi__malloc(a*b*c + add);
+   return NULL;
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+// Allocate a*b*c*d + add bytes; returns NULL instead of allocating when the
+// size computation would not be valid.
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   if (stbi__mad4sizes_valid(a, b, c, d, add))
+      return stbi__malloc(a*b*c*d + add);
+   return NULL;
+}
+#endif
+
+// Returns 1 when a + b is representable as a signed int, 0 when the
+// addition would overflow in either direction.
+static int stbi__addints_valid(int a, int b)
+{
+   int a_nonneg = (a >= 0);
+   int b_nonneg = (b >= 0);
+
+   // operands of opposite sign can never overflow
+   if (a_nonneg != b_nonneg)
+      return 1;
+
+   // both negative: "a + b >= INT_MIN", rearranged so nothing overflows
+   // (INT_MIN - b is safe because b < 0)
+   if (!a_nonneg)
+      return a >= INT_MIN - b;
+
+   // both non-negative: "a + b <= INT_MAX", rearranged the same way
+   return a <= INT_MAX - b;
+}
+
+// Returns 1 if the product a*b is representable as a signed short
+// (SHRT_MIN..SHRT_MAX), 0 if it would overflow.
+//
+// Every comparison is done by division so the product itself is never
+// formed. Integer division truncates toward zero, which yields exactly the
+// rounding each bound needs (floor for upper bounds on a, ceiling for lower
+// bounds).
+static int stbi__mul2shorts_valid(short a, short b)
+{
+   if (a == 0 || b == 0) return 1;            // product is 0
+
+   if (a > 0) {
+      if (b > 0) return a <= SHRT_MAX / b;    // positive product: a*b <= SHRT_MAX
+      // b < 0, negative product: need a*b >= SHRT_MIN
+      if (b == -1) return 1;                  // -a >= -SHRT_MAX >= SHRT_MIN; also avoids SHRT_MIN / -1
+      return a <= SHRT_MIN / b;               // dividing by b < 0 flips the inequality
+   }
+
+   // a < 0
+   if (b > 0) return a >= SHRT_MIN / b;       // negative product: a*b >= SHRT_MIN
+   // both negative, positive product: a*b <= SHRT_MAX  <=>  a >= SHRT_MAX / b.
+   // This also rejects SHRT_MIN * -1, whose result does not fit in a short.
+   return a >= SHRT_MAX / b;
+}
+
+// stbi__err - error
+// stbi__errpf - error returning pointer to float
+// stbi__errpuc - error returning pointer to unsigned char
+
+// stbi__err(x,y) takes a short message x and a user-friendly message y:
+//   STBI_NO_FAILURE_STRINGS -> no message is stored, the macro is just 0
+//   STBI_FAILURE_USERMSG    -> the user-friendly message y is stored
+//   otherwise               -> the short message x is stored
+#ifdef STBI_NO_FAILURE_STRINGS
+   #define stbi__err(x,y)  0
+#elif defined(STBI_FAILURE_USERMSG)
+   #define stbi__err(x,y)  stbi__err(y)
+#else
+   #define stbi__err(x,y)  stbi__err(x)
+#endif
+
+// Pointer-returning variants: record the error, then evaluate to NULL
+// (both arms of the conditional are NULL; it only exists to invoke stbi__err).
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
+
+// Release a buffer returned by one of the load functions, using the
+// configured STBI_FREE.
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{
+   void *pixels = retval_from_stbi_load;
+   STBI_FREE(pixels);
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+#endif
+
+// Process-wide "flip vertically on load" setting; defaults to off.
+static int stbi__vertically_flip_on_load_global = 0;
+
+// Set the process-wide flip flag (non-zero = flip loaded images vertically).
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
+}
+
+// stbi__vertically_flip_on_load is the effective flip setting. Without
+// thread-local support it is simply the global flag; with it, a per-thread
+// override takes precedence once it has been set on that thread.
+#ifndef STBI_THREAD_LOCAL
+#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
+#else
+// per-thread override value, and whether this thread has set it
+static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
+
+// Set the flip flag for the calling thread only.
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_set = 1;
+}
+
+#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
+                                         ? stbi__vertically_flip_on_load_local  \
+                                         : stbi__vertically_flip_on_load_global)
+#endif // STBI_THREAD_LOCAL
+
+// Detect the image format by running each compiled-in stbi__*_test in turn
+// and dispatch to the matching loader.
+//   s            input context
+//   x, y, comp   forwarded to the loader as output parameters
+//   req_comp     requested channel count, forwarded to the loader
+//   ri           reset here (8 bits per channel, RGB order), then handed to the loader
+//   bpc          only forwarded to the PSD loader
+// Returns whatever the selected loader returns (HDR data is first converted
+// with stbi__hdr_to_ldr), or NULL with the failure reason set when no
+// format test matches.
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
+   #else
+   STBI_NOTUSED(bpc);
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      // float data is converted to LDR here
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+   }
+   #endif
+
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Convert a 16-bit-per-channel image of w*h*channels samples to 8 bits per
+// channel by keeping the high byte of every sample.
+//
+// Takes ownership of `orig`: it is freed on success AND on allocation
+// failure, so the caller never has to (and must not) free it afterwards.
+// Returns the newly allocated 8-bit buffer, or NULL with the failure reason
+// set to "outofmem".
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi_uc *reduced;
+
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) {
+      // callers overwrite their only pointer to `orig` with our return
+      // value, so it has to be released here or it leaks
+      STBI_FREE(orig);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+// Convert an 8-bit-per-channel image of w*h*channels samples to 16 bits per
+// channel by replicating each byte into the high and low half.
+//
+// Takes ownership of `orig`: it is freed on success AND on allocation
+// failure, so the caller never has to (and must not) free it afterwards.
+// Returns the newly allocated 16-bit buffer, or NULL with the failure reason
+// set to "outofmem".
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi__uint16 *enlarged;
+
+   // compute the byte count in size_t: img_len*2 in int can overflow even
+   // though img_len itself fits
+   enlarged = (stbi__uint16 *) stbi__malloc((size_t) img_len * 2);
+   if (enlarged == NULL) {
+      // callers overwrite their only pointer to `orig` with our return
+      // value, so it has to be released here or it leaks
+      STBI_FREE(orig);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+// Flip an image of h rows in place by swapping row k with row h-1-k.
+// Each row is w * bytes_per_pixel bytes long.
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   stbi_uc scratch[2048];
+   stbi_uc *base = (stbi_uc *) image;
+   size_t stride = (size_t)w * bytes_per_pixel;
+   int half = h >> 1;   // with an odd row count the middle row stays put
+   int top;
+
+   for (top = 0; top < half; top++) {
+      stbi_uc *upper = base + top*stride;
+      stbi_uc *lower = base + (h - top - 1)*stride;
+      size_t remaining;
+
+      // exchange the two rows through the fixed-size scratch buffer,
+      // one chunk at a time
+      for (remaining = stride; remaining != 0; ) {
+         size_t chunk = sizeof(scratch);
+         if (remaining < chunk)
+            chunk = remaining;
+         memcpy(scratch, upper, chunk);
+         memcpy(upper, lower, chunk);
+         memcpy(lower, scratch, chunk);
+         upper += chunk;
+         lower += chunk;
+         remaining -= chunk;
+      }
+   }
+}
+
+#ifndef STBI_NO_GIF
+// Vertically flip each of `z` consecutive w*h images stored back to back
+// in `image`.
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int frame_bytes = w * h * bytes_per_pixel;
+   stbi_uc *frame = (stbi_uc *)image;
+   int remaining;
+
+   for (remaining = z; remaining > 0; --remaining) {
+      stbi__vertical_flip(frame, w, h, bytes_per_pixel);
+      frame += frame_bytes;
+   }
+}
+#endif
+
+// Load an image through stbi__load_main and normalise the result to 8 bits
+// per channel, then apply the vertical flip if it is enabled.
+// x, y, comp and req_comp are forwarded to stbi__load_main unchanged.
+// Returns the pixel buffer, or NULL if loading or the 16->8 bit conversion
+// failed.
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 8) {
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      // the conversion allocates a new buffer and returns NULL when that
+      // fails; stop here so the flip below never touches a NULL image
+      if (result == NULL)
+         return NULL;
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+
+   return (unsigned char *) result;
+}
+
+// Load an image through stbi__load_main and normalise the result to 16 bits
+// per channel, then apply the vertical flip if it is enabled.
+// x, y, comp and req_comp are forwarded to stbi__load_main unchanged.
+// Returns the pixel buffer, or NULL if loading or the 8->16 bit conversion
+// failed.
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 16) {
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      // the conversion allocates a new buffer and returns NULL when that
+      // fails; stop here so the flip below never touches a NULL image
+      if (result == NULL)
+         return NULL;
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+
+   return (stbi__uint16 *) result;
+}
+
+#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
+// Post-process a float image: applies the vertical flip when it is enabled
+// and `result` is not NULL.
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   int channels;
+
+   if (!stbi__vertically_flip_on_load || result == NULL)
+      return;
+
+   if (req_comp)
+      channels = req_comp;
+   else
+      channels = *comp;
+   stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
+}
+#endif
+
+#ifndef STBI_NO_STDIO
+
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+#endif
+
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+// Convert the NUL-terminated wide string `input` to UTF-8 in `buffer`
+// (capacity `bufferlen` bytes); returns WideCharToMultiByte's result.
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+   int capacity = (int) bufferlen;
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, capacity, NULL, NULL);
+}
+#endif
+
+// Open `filename` with the given fopen-style `mode`; returns NULL (0) on
+// failure. Three build variants:
+//   - Windows with STBI_WINDOWS_UTF8: filename and mode are treated as UTF-8,
+//     converted to wide strings and opened with _wfopen_s / _wfopen
+//   - MSVC 2005 and later: fopen_s
+//   - everything else: plain fopen
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   // a zero return from the conversion means failure
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+
+// Load an 8-bit-per-channel image from the file named `filename`.
+// Returns the pixel buffer, or NULL (failure reason set) when the file
+// cannot be opened or decoded.
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpuc("can't fopen", "Unable to open file");
+
+   pixels = stbi_load_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Load an 8-bit-per-channel image from an already open stream. On success
+// the stream is moved back over the bytes that were buffered but not
+// consumed by the decoder.
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   unsigned char *pixels;
+
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_and_postprocess_8bit(&ctx,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // 'unget' whatever is still sitting unread in the IO buffer
+      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
+      fseek(f, -unread, SEEK_CUR);
+   }
+   return pixels;
+}
+
+// Load a 16-bit-per-channel image from an already open stream. On success
+// the stream is moved back over the bytes that were buffered but not
+// consumed by the decoder.
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   stbi__uint16 *pixels;
+
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_and_postprocess_16bit(&ctx,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // 'unget' whatever is still sitting unread in the IO buffer
+      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
+      fseek(f, -unread, SEEK_CUR);
+   }
+   return pixels;
+}
+
+// Load a 16-bit-per-channel image from the file named `filename`.
+// Returns the pixel buffer, or NULL (failure reason set) when the file
+// cannot be opened or decoded.
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+
+   pixels = stbi_load_from_file_16(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+
+#endif //!STBI_NO_STDIO
+
+// Decode a 16-bit-per-channel image from the `len` bytes at `buffer`.
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context ctx;
+   stbi_us *pixels;
+
+   stbi__start_mem(&ctx,buffer,len);
+   pixels = stbi__load_and_postprocess_16bit(&ctx,x,y,channels_in_file,desired_channels);
+   return pixels;
+}
+
+// Decode a 16-bit-per-channel image whose bytes are supplied by the
+// callbacks in `clbk`; `user` is passed through to every callback.
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context ctx;
+   stbi_us *pixels;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *)clbk, user);
+   pixels = stbi__load_and_postprocess_16bit(&ctx,x,y,channels_in_file,desired_channels);
+   return pixels;
+}
+
+// Decode an 8-bit-per-channel image from the `len` bytes at `buffer`.
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   stbi_uc *pixels;
+
+   stbi__start_mem(&ctx,buffer,len);
+   pixels = stbi__load_and_postprocess_8bit(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+// Decode an 8-bit-per-channel image whose bytes are supplied by the
+// callbacks in `clbk`; `user` is passed through to every callback.
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   stbi_uc *pixels;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
+   pixels = stbi__load_and_postprocess_8bit(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+#ifndef STBI_NO_GIF
+// Decode all frames of a GIF held in memory via stbi__load_gif_main.
+//   buffer, len      encoded GIF data
+//   delays, x, y, z, comp, req_comp   forwarded to stbi__load_gif_main
+// Returns the frames stored back to back, or NULL on failure. When the
+// vertical flip is enabled every frame is flipped individually.
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   // only flip an image that actually loaded: on failure `result` is NULL
+   // and the output parameters cannot be trusted
+   if (result != NULL && stbi__vertically_flip_on_load) {
+      // bytes per pixel follow the requested channel count when one was
+      // given, exactly as in the other load paths; *comp describes the file,
+      // not necessarily the returned buffer.
+      // NOTE(review): relies on stbi__load_gif_main returning req_comp
+      // channels when req_comp != 0 - it is defined outside this section.
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip_slices( result, *x, *y, *z, channels );
+   }
+
+   return result;
+}
+#endif
+
+#ifndef STBI_NO_LINEAR
+// Load an image as floats. HDR input (when HDR support is compiled in) is
+// loaded directly by stbi__hdr_load and post-processed; anything else is
+// loaded as 8-bit data and converted with stbi__ldr_to_hdr.
+// Returns the float buffer, or NULL when loading failed.
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *data;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
+      if (hdr_data)
+         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
+      return hdr_data;
+   }
+   #endif
+   // the 8-bit path has already applied its own post-processing
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+   if (data)
+      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
+   // note: this replaces whatever failure reason the loader recorded
+   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Decode an image from the `len` bytes at `buffer` as float data.
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+
+   stbi__start_mem(&ctx,buffer,len);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+// Decode an image supplied through the callbacks in `clbk` as float data;
+// `user` is passed through to every callback.
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+#ifndef STBI_NO_STDIO
+// Load the file named `filename` as float data. Returns NULL (failure
+// reason set) when the file cannot be opened or decoded.
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpf("can't fopen", "Unable to open file");
+
+   pixels = stbi_loadf_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Load an image from an already open stream as float data.
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+
+   stbi__start_file(&ctx,f);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// the is-hdr-or-not queries below are always defined, for API simplicity,
+// regardless of which features are compiled in; if STBI_NO_HDR is defined,
+// they always report false!
+
+// Run the HDR format test on the `len` bytes at `buffer` and return its
+// result. Always returns 0 when HDR support is compiled out (STBI_NO_HDR).
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_STDIO
+// Run the HDR format test on the file named `filename`; returns 0 when the
+// file cannot be opened.
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{
+   int is_hdr;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (!fp)
+      return 0;
+
+   is_hdr = stbi_is_hdr_from_file(fp);
+   fclose(fp);
+   return is_hdr;
+}
+
+// Run the HDR format test on an open stream and return its result. The
+// stream position is saved before the test and restored afterwards.
+// Always returns 0 when HDR support is compiled out (STBI_NO_HDR).
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
+{
+   #ifndef STBI_NO_HDR
+   long pos = ftell(f);   // remember where the caller left the stream
+   int res;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);   // undo the reads performed by the test
+   return res;
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
+}
+#endif // !STBI_NO_STDIO
+
+// Run the HDR format test on data supplied by the callbacks in `clbk`
+// (`user` is passed through to them) and return its result.
+// Always returns 0 when HDR support is compiled out (STBI_NO_HDR).
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_LINEAR
+// LDR->HDR conversion settings: gamma defaults to 2.2, scale to 1.0.
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
+
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+#endif
+
+// HDR->LDR conversion settings, stored as reciprocals (hence the _i suffix):
+// the defaults correspond to gamma 2.2 and scale 1.0.
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
+// the setters take the plain gamma/scale and store 1/value
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Common code used by all image loaders
+//
+
+// Scan modes, numbered 0..2.
+// NOTE(review): presumably passed to the format parsers to select how far
+// to parse (full load / type check only / header only) - the users of these
+// values are outside this section; confirm there.
+enum
+{
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
+};
+
+// Refill the context's staging buffer through the read callback and reset
+// the read cursor to the start of that buffer.
+//
+// The bytes consumed from the previous fill are added to
+// callback_already_read. When the callback delivers no data, the context
+// stops using callbacks (read_from_callbacks = 0) and is left pointing at a
+// single zero byte so the cursor always refers to valid memory.
+static void stbi__refill_buffer(stbi__context *s)
+{
+   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
+   // n < 0 is how some read callbacks signal an error; treat it like
+   // end-of-data, otherwise img_buffer_end would land before buffer_start
+   if (n <= 0) {
+      // at end of file, treat same as if from memory, but need to handle case
+      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+      s->read_from_callbacks = 0;
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start+1;
+      *s->img_buffer = 0;
+   } else {
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start + n;
+   }
+}
+
+// Return the next input byte, refilling the buffer through the callbacks
+// when it is empty. Returns 0 once the input is exhausted.
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   if (s->img_buffer >= s->img_buffer_end) {
+      // buffer drained: either pull more data or report a zero byte
+      if (!s->read_from_callbacks)
+         return 0;
+      stbi__refill_buffer(s);
+   }
+   return *s->img_buffer++;
+}
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+// Returns non-zero when there is no more input to read.
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read != NULL) {
+      int source_done = (s->io.eof)(s->io_user_data);
+      if (!source_done)
+         return 0;
+      // the source reports EOF; if callbacks are already switched off, the
+      // buffer only holds the special 0 byte placed at the end
+      if (!s->read_from_callbacks)
+         return 1;
+   }
+
+   // otherwise we are at EOF exactly when the buffer has been used up
+   return s->img_buffer >= s->img_buffer_end;
+}
+#endif
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Advance the read position by n bytes without returning them.
+//   n == 0 : nothing to do
+//   n <  0 : treated as "skip everything buffered"; the window is marked empty
+//   n >  0 : consume from the buffer; when the buffer holds fewer than n
+//            bytes it is drained and, for callback-backed contexts, the
+//            remainder is forwarded to the user's skip callback
+// The read pointer is never moved beyond img_buffer_end: forming a pointer
+// past the end of the buffer is undefined behaviour and, if the addition
+// wraps, would make later "img_buffer < img_buffer_end" checks pass on
+// memory outside the buffer.
+static void stbi__skip(stbi__context *s, int n)
+{
+   int avail;
+
+   if (n == 0) return;  // already there!
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+      return;
+   }
+
+   avail = (int) (s->img_buffer_end - s->img_buffer);
+   if (avail < n) {
+      // not enough buffered data: drain the window, then let the callback
+      // (if any) skip whatever is still owed
+      s->img_buffer = s->img_buffer_end;
+      if (s->io.read)
+         (s->io.skip)(s->io_user_data, n - avail);
+      return;
+   }
+
+   s->img_buffer += n;
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
+// nothing
+#else
+// Copy exactly n bytes from the stream into buffer.
+// Returns 1 when all n bytes were delivered, 0 otherwise (negative n, an
+// in-memory source holding fewer than n bytes, or a short read from the
+// callback).
+// The bounds test compares lengths instead of computing img_buffer+n, because
+// forming a pointer beyond the end of the buffer is undefined behaviour and
+// can wrap for large n.
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   int avail = (int) (s->img_buffer_end - s->img_buffer);
+
+   if (n < 0) return 0;  // a negative count would become a huge memcpy size
+
+   if (n <= avail) {
+      // fully satisfied from the current window
+      memcpy(buffer, s->img_buffer, n);
+      s->img_buffer += n;
+      return 1;
+   }
+
+   if (s->io.read) {
+      int count;
+
+      // hand over what is buffered, then read the rest straight into the
+      // destination, bypassing the staging buffer
+      memcpy(buffer, s->img_buffer, avail);
+      count = (s->io.read)(s->io_user_data, (char*) buffer + avail, n - avail);
+      s->img_buffer = s->img_buffer_end;
+      return count == (n - avail);
+   }
+
+   return 0;
+}
+#endif
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Read two bytes and combine them as a big-endian 16-bit value.
+static int stbi__get16be(stbi__context *s)
+{
+   int hi = stbi__get8(s);
+   int lo = stbi__get8(s);
+   return (hi << 8) + lo;
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Read four bytes and combine them as a big-endian 32-bit value.
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 hi = stbi__get16be(s);
+   stbi__uint32 lo = stbi__get16be(s);
+   return (hi << 16) + lo;
+}
+#endif
+
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
+// Read two bytes and combine them as a little-endian 16-bit value.
+static int stbi__get16le(stbi__context *s)
+{
+   int lo = stbi__get8(s);
+   int hi = stbi__get8(s);
+   return lo + (hi << 8);
+}
+#endif
+
+#ifndef STBI_NO_BMP
+// Read four bytes and combine them as a little-endian 32-bit value.
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 lo = stbi__get16le(s);
+   stbi__uint32 hi = (stbi__uint32)stbi__get16le(s);
+   return lo + (hi << 16);
+}
+#endif
+
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // keep only the low 8 bits of x and cast to stbi_uc; the explicit mask truncates an int to a byte without compiler warnings
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+//////////////////////////////////////////////////////////////////////////////
+//
+//  generic converter from built-in img_n to req_comp
+//    individual types do this automatically as much as possible (e.g. jpeg
+//    does all cases internally since it needs to colorspace convert anyway,
+//    and it never has alpha, so very few cases ). png can automatically
+//    interleave an alpha=255 channel, but falls back to this for other cases
+//
+//  assume data buffer is malloced, so malloc a new one and free that one
+//  only failure mode is malloc failing
+
+// Collapse an RGB triple to a single luma byte using integer weights
+// 77/150/29 (they sum to 256, so the >> 8 renormalizes the result).
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   int weighted = (r*77) + (g*150) + (29*b);
+   return (stbi_uc) (weighted >> 8);
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{  /* Convert an interleaved 8-bit image of x*y pixels from img_n channels per
+    * pixel to req_comp channels per pixel.
+    *   - req_comp == img_n: data is returned unchanged (no copy).
+    *   - otherwise a new buffer is allocated and filled one scanline at a
+    *     time; data is freed on every such path (success, allocation failure,
+    *     unsupported combination).
+    * Conversions: gray -> RGB replicates the gray value, an added alpha
+    * channel is set to 255, RGB -> gray goes through stbi__compute_y, and a
+    * dropped alpha channel is simply discarded.
+    * On failure the value of stbi__errpuc(...) is returned.
+    */
+   int i,j;
+   unsigned char *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);  // size computed via the mad3 helper rather than a raw multiply
+   if (good == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);  // the source buffer is consumed; the caller now owns good
+   return good;
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+// 16-bit counterpart of stbi__compute_y: weighted sum of r/g/b with integer
+// weights 77/150/29, renormalized by >> 8.
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   int weighted = (r*77) + (g*150) + (29*b);
+   return (stbi__uint16) (weighted >> 8);
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+// Convert an interleaved 16-bit image of x*y pixels from img_n channels per
+// pixel to req_comp channels per pixel.
+//   - req_comp == img_n: data is returned unchanged (no copy).
+//   - otherwise a new buffer is allocated and filled one scanline at a time;
+//     data is freed on every such path (success, allocation failure,
+//     unsupported combination).
+// Conversions mirror stbi__convert_format: gray -> RGB replicates the gray
+// value, an added alpha channel is set to 0xffff, RGB -> gray goes through
+// stbi__compute_y_16, and a dropped alpha channel is discarded.
+// On failure the value of stbi__errpuc(...) is returned (cast to stbi__uint16*).
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   // Size the buffer through the same mad3 helper the 8-bit converter uses,
+   // instead of an unchecked req_comp * x * y * 2 that could wrap and
+   // under-allocate. req_comp is 1..4, so req_comp * 2 (bytes per pixel)
+   // cannot overflow.
+   good = (stbi__uint16 *) stbi__malloc_mad3(req_comp * 2, x, y, 0);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);  // the source buffer is consumed; the caller now owns good
+   return good;
+}
+#endif
+
+#ifndef STBI_NO_LINEAR
+// Expand an 8-bit image (x*y pixels, comp interleaved channels) to float.
+// Colour channels are mapped through pow(v/255, stbi__l2h_gamma) *
+// stbi__l2h_scale; when comp is even the last channel is treated as alpha and
+// only divided by 255. data is freed unless it is NULL; a NULL input yields
+// NULL, and an allocation failure yields the value of stbi__errpf(...).
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{
+   int px, ch, color_n;
+   float *output;
+
+   if (data == NULL) return NULL;
+
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+   if (output == NULL) {
+      STBI_FREE(data);
+      return stbi__errpf("outofmem", "Out of memory");
+   }
+
+   // number of non-alpha components: all of them when comp is odd
+   color_n = (comp & 1) ? comp : comp-1;
+
+   for (px=0; px < x*y; ++px) {
+      for (ch=0; ch < color_n; ++ch)
+         output[px*comp + ch] = (float) (pow(data[px*comp+ch]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+      // alpha (if present) stays linear
+      if (color_n < comp)
+         output[px*comp + color_n] = data[px*comp + color_n]/255.0f;
+   }
+
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+#ifndef STBI_NO_HDR
+// plain truncating float -> int conversion
+#define stbi__float2int(x)   ((int) (x))
+
+// Compress a float image (x*y pixels, comp interleaved channels) to 8 bits.
+// Colour channels are mapped through pow(v * stbi__h2l_scale_i,
+// stbi__h2l_gamma_i) * 255 + 0.5 and clamped to 0..255; when comp is even the
+// last channel is treated as alpha and only scaled by 255 (plus 0.5) before
+// clamping. data is freed unless it is NULL; a NULL input yields NULL, and an
+// allocation failure yields the value of stbi__errpuc(...).
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{
+   int px, ch, color_n;
+   stbi_uc *output;
+
+   if (data == NULL) return NULL;
+
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
+   if (output == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   // number of non-alpha components: all of them when comp is odd
+   color_n = (comp & 1) ? comp : comp-1;
+
+   for (px=0; px < x*y; ++px) {
+      for (ch=0; ch < color_n; ++ch) {
+         float v = (float) pow(data[px*comp+ch]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         if (v < 0) v = 0;
+         else if (v > 255) v = 255;
+         output[px*comp + ch] = (stbi_uc) stbi__float2int(v);
+      }
+      // ch == color_n here; a remaining channel is the linear alpha
+      if (ch < comp) {
+         float a = data[px*comp+ch] * 255 + 0.5f;
+         if (a < 0) a = 0;
+         else if (a > 255) a = 255;
+         output[px*comp + ch] = (stbi_uc) stbi__float2int(a);
+      }
+   }
+
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  "baseline" JPEG/JFIF decoder
+//
+//    simple implementation
+//      - doesn't support delayed output of y-dimension
+//      - simple interface (only one output format: 8-bit interleaved RGB)
+//      - doesn't try to recover corrupt jpegs
+//      - doesn't allow partial loading, loading multiple at once
+//      - still fast on x86 (copying globals into locals doesn't help x86)
+//      - allocates lots of intermediate memory (full size of all components)
+//        - non-interleaved case requires this anyway
+//        - allows good upsampling (see next)
+//    high-quality
+//      - upsampled channels are bilinearly interpolated, even across blocks
+//      - quality integer IDCT derived from IJG's 'slow'
+//    performance
+//      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
+//      - uses a lot of intermediate memory, could cache poorly
+
+#ifndef STBI_NO_JPEG
+
+// huffman decoding acceleration: codes of at most FAST_BITS bits are resolved with one table lookup
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+typedef struct
+{  // one Huffman table, as built by stbi__build_huffman
+   stbi_uc  fast[1 << FAST_BITS];   // indexed by the next FAST_BITS bits of the stream: symbol index, or 255 if the code is longer than FAST_BITS
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256];          // code[k]: Huffman code assigned to symbol index k
+   stbi_uc  values[256];            // values[k]: symbol returned by the decoder for index k (not filled in by stbi__build_huffman)
+   stbi_uc  size[257];              // size[k]: code length in bits of symbol index k; the list is terminated by a 0 entry
+   unsigned int maxcode[18];        // maxcode[j]: largest code of length j, plus one, shifted left to 16 bits; entry 17 is the 0xffffffff sentinel
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+} stbi__huffman;
+
+typedef struct
+{  // complete decoder state for one JPEG image
+   stbi__context *s;                        // input stream the entropy decoder pulls bytes from
+   stbi__huffman huff_dc[4];                // DC Huffman tables
+   stbi__huffman huff_ac[4];                // AC Huffman tables
+   stbi__uint16 dequant[4][64];             // dequantization tables, 64 entries each
+   stbi__int16 fast_ac[4][1 << FAST_BITS];  // combined code+value lookup tables in the format written by stbi__build_fast_ac
+
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
+
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred;      // running DC predictor: each decoded DC difference is added to it
+
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
+
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
+
+   int            progressive;
+   int            spec_start;  // first zigzag index handled by the current progressive scan
+   int            spec_end;    // last zigzag index handled by the current progressive scan
+   int            succ_high;   // 0 for a first scan of the coefficients, nonzero for a refinement scan
+   int            succ_low;    // bit position (left shift) applied to values decoded in the current scan
+   int            eob_run;     // number of upcoming blocks that are already known to be at end-of-band
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
+
+   int scan_n, order[4];
+   int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+} stbi__jpeg;
+
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{  /* Fill in the size, code, delta, maxcode and fast tables of h.
+    * count[i] (i = 0..15) is the number of codes that are i+1 bits long.
+    * h->values is not touched here.
+    * Returns 1 on success; on a malformed length list returns the value of
+    * stbi__err(...).
+    */
+   int i,j,k=0;
+   unsigned int code;
+   // build size list for each symbol (from JPEG spec)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
+         h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");  // size[] has 257 slots and the last must stay free for the terminator
+      }
+   }
+   h->size[k] = 0;  // terminator: stops the "size[k] == j" scans below
+
+   // compute actual symbols (from jpeg spec)
+   code = 0;
+   k = 0;
+   for(j=1; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");  // the last code assigned must still fit in j bits
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   h->maxcode[j] = 0xffffffff;  // j == 17 here: sentinel that always terminates the decoder's length search
+
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         int c = h->code[i] << (FAST_BITS-s);  // first FAST_BITS-wide pattern that starts with this code
+         int m = 1 << (FAST_BITS-s);           // number of patterns sharing that prefix
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1;
+}
+
+// build a table that decodes both magnitude and value of small ACs in
+// one go.
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{  /* For every FAST_BITS-wide bit pattern i, fast_ac[i] is either 0 (no
+    * shortcut available) or a packed entry:
+    *     value * 256 + run * 16 + (code length + magnitude bits)
+    * An entry is only produced when the Huffman code and its magnitude bits
+    * both fit inside FAST_BITS and the decoded value fits in -128..127.
+    */
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;   // high nibble: zero run length
+         int magbits = rs & 15;      // low nibble: number of magnitude bits that follow the code
+         int len = h->size[fast];
+
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            if (k < m) k += (~0U << magbits) + 1;  // leading bit clear: the value is negative, apply the extend bias
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
+         }
+      }
+   }
+}
+
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{  /* Top up j->code_buffer one byte at a time until it holds more than 24
+    * valid bits. A 0xff byte followed by 0x00 is taken as a literal 0xff data
+    * byte; 0xff followed by anything else nonzero is a marker: it is recorded
+    * in j->marker, j->nomore is set and filling stops. Once nomore is set,
+    * zero bytes are fed in instead of reading the stream.
+    */
+   do {
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
+      if (b == 0xff) {
+         int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
+         if (c != 0) {
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+      }
+      j->code_buffer |= b << (24 - j->code_bits);  // new bits go directly below the valid ones (valid bits are kept at the top of the word)
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
+}
+
+// stbi__bmask[n] == (1 << n) - 1: mask selecting the low n bits, for n = 0..16
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+
+// decode a jpeg huffman value from the bitstream
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{  /* Returns the decoded symbol (an entry of h->values), or -1 when no code
+    * matches, when the matched code is longer than the number of valid bits
+    * buffered, or when the resulting symbol index is outside 0..255.
+    * The matched code's bits are consumed from j->code_buffer on success.
+    */
+   unsigned int temp;
+   int c,k;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits)
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
+
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   for (k=FAST_BITS+1 ; ; ++k)  // no explicit bound: maxcode[17] == 0xffffffff always stops the search
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits)
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1, for n = 0..15; added by stbi__extend_receive when the leading bit of an n-bit value is clear
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{  /* Consume n bits from the entropy buffer and return them as a signed
+    * value: when the first bit is 1 the raw n-bit number is returned, when it
+    * is 0 stbi__jbias[n] is added to it. Returns 0 without consuming anything
+    * if fewer than n bits are available even after a refill.
+    */
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);       // rotate the n wanted bits down into the low end
+   j->code_buffer = k & ~stbi__bmask[n];   // what remains is the buffer shifted left by n
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));  // (sgn - 1) is all ones when sgn == 0, so the bias applies only when the MSB was clear
+}
+
+// Consume n bits from the entropy buffer and return them as an unsigned
+// value. Returns 0 without consuming anything if fewer than n bits are
+// available even after a refill.
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int rotated, value;
+
+   if (j->code_bits < n) {
+      stbi__grow_buffer_unsafe(j);
+      // ran out of bits from stream, return 0s instead of continuing
+      if (j->code_bits < n) return 0;
+   }
+
+   // rotate the wanted bits into the low end, then split the word into the
+   // extracted value and the remaining (left-shifted) buffer
+   rotated = stbi_lrot(j->code_buffer, n);
+   value = rotated & stbi__bmask[n];
+   j->code_buffer = rotated & ~stbi__bmask[n];
+   j->code_bits -= n;
+   return value;
+}
+
+// Consume a single bit from the entropy buffer. The result is nonzero
+// (0x80000000) when the bit is set and 0 when it is clear; 0 is also returned
+// when no bit is available even after a refill.
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int top;
+
+   if (j->code_bits < 1) {
+      stbi__grow_buffer_unsafe(j);
+      // ran out of bits from stream, return 0s instead of continuing
+      if (j->code_bits < 1) return 0;
+   }
+
+   top = j->code_buffer & 0x80000000;
+   j->code_buffer <<= 1;
+   j->code_bits -= 1;
+   return top;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end: 15 padding entries, all mapping to the last coefficient (63)
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{  /* Decode the DC and AC coefficients of one block for component b into
+    * data[] (row-major order), multiplying each by its dequant[] entry.
+    * hdc/hac are the DC/AC Huffman tables, fac the fast-AC table for hac.
+    * Returns 1 on success, otherwise the value of stbi__err(...).
+    */
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;  // t is the bit length of the DC difference
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;  // DC is coded as a difference from the previous block of this component
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];  // packed value/run/length entry, 0 when there is no shortcut
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;   // low nibble: magnitude bit count
+         r = rs >> 4;   // high nibble: zero run length
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;  // rs == 0xf0: skip 16 zero coefficients
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];  // k can exceed 63 on corrupt input; the table has 15 padding entries for that
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{  /* Decode the DC coefficient of one block of component b in a progressive
+    * scan. A first scan (succ_high == 0) clears the block and stores the
+    * predicted DC value shifted left by succ_low; a refinement scan adds one
+    * more bit at position succ_low. Scans with spec_end != 0 are rejected.
+    * Returns 1 on success, otherwise the value of stbi__err(...).
+    */
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      diff = t ? stbi__extend_receive(j, t) : 0;
+
+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      data[0] = (short) (dc * (1 << j->succ_low));
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{  /* Decode the AC coefficients spec_start..spec_end (zigzag indices) of one
+    * block in a progressive scan. A first scan (succ_high == 0) writes new
+    * coefficients shifted left by succ_low; a refinement scan adds one bit at
+    * position succ_low to coefficients that are already nonzero and inserts
+    * new +/-bit coefficients. j->eob_run counts further blocks that contain
+    * no new coefficients. Scans with spec_start == 0 are rejected.
+    * Returns 1 on success, otherwise the value of stbi__err(...).
+    */
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      int shift = j->succ_low;
+
+      if (j->eob_run) {
+         --j->eob_run;  // this block is part of a pending end-of-band run: nothing to decode
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * (1 << shift));
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  // end-of-band run: length is 2^r plus r extra bits, minus one for the current block
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;  // r == 15, s == 0: skip 16 zero coefficients
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         // inside an end-of-band run: no new coefficients, only correction bits for existing nonzero ones
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  // already-nonzero coefficients do not count toward the run; each consumes one correction bit
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;  // run exhausted: this zero slot receives the new coefficient
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// Saturate an int to the 0..255 range and return it as a byte: negative
+// inputs give 0, inputs above 255 give 255, everything else is unchanged.
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   if (x < 0) return 0;
+   if (x > 255) return 255;
+   return (stbi_uc) x;
+}
+
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))  // floating-point constant -> fixed point with 12 fractional bits (scaled by 4096, +0.5 before truncation)
+#define stbi__fsh(x)  ((x) * 4096)                   // scale an integer by the same 1<<12 factor
+
+// derived from jidctint -- DCT_ISLOW; one 8-point 1D IDCT pass: declares its own locals and leaves the results in x0..x3 (even part) and t0..t3 (odd part) for the caller to combine
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{  /* Integer 2D inverse DCT of one 8x8 coefficient block.
+    * data holds the 64 coefficients in row-major order; the result is written
+    * as 8 rows of 8 clamped bytes starting at out, with consecutive rows
+    * out_stride bytes apart. The transform runs as a column pass into val[]
+    * followed by a row pass.
+    */
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
+
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         int dcterm = d[0]*4;  // *4 keeps the same 2 extra bits of precision as the general path below
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;  // 512 == half of 1<<10: rounds the >> 10 below
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {  // rows
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// sse2 integer IDCT. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent".
+//   out        - destination; 8 bytes are stored per row, for 8 rows
+//   out_stride - distance in bytes between consecutive output rows
+//   data       - 64 coefficients; read with _mm_load_si128, so the pointer
+//                must be 16-byte aligned
+// All helper macros below are local to this function and #undef'd at its end.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   // x and y are interleaved so that one _mm_madd_epi16 yields each 32-bit sum;
+   // results are declared as the _l/_h (low/high four lanes) pairs out0/out1.
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+   // unpacking with zero in the low half places "in" in the upper 16 bits
+   // (in << 16); the arithmetic shift right by 4 then leaves in << 12, signed.
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   // out0 = (a + bias + b) >> s, out1 = (a + bias - b) >> s, packed to 16-bit
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   // one 1D IDCT over row0..row7 (eight lanes at once), results written back
+   // into row0..row7 after adding "bias" and shifting right by "shift"
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   // rotation constants, pre-combined into the (even,odd) pairs dct_rot expects
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load (aligned loads: one row of 8 coefficients per register)
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack (16-bit -> 8-bit with unsigned saturation, two rows per register)
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store: each register holds two output rows; _mm_storel_epi64 writes the
+      // low 8 bytes, and shuffle 0x4e swaps the 64-bit halves to reach the other row
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT. should produce bit-identical
+// results to the generic C version.
+//   out        - destination; 8 bytes are stored per row, for 8 rows
+//   out_stride - distance in bytes between consecutive output rows
+//   data       - the 64 input coefficients, loaded 8 at a time with vld1q_s16
+// All helper macros below are local to this function and #undef'd before it ends.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+   // fixed-point rotation constants, one per multiply in dct_pass
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+// out = inq * coeff  (inq 16-bit x8, out 32-bit as an _l/_h pair of x4)
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+// out = acc + inq * coeff  (widening multiply-accumulate)
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+// out = inq << 12  (inq 16-bit, out 32-bit)
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+// one 1D IDCT over row0..row7 (eight lanes at once); results are narrowed
+// with shiftop(..., shift) and written back into row0..row7
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias
+   // only lane 0 of row0 (the DC coefficient) gets +1024. dct_widen turns that
+   // into 1024<<12, the column pass shifts it down by 10 and the row pass widens
+   // it by 12 again, giving 128<<17 -- presumably the same +128 level shift the
+   // generic version adds before its final >>17.
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+   // column pass
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round
+      // (the remaining rounding shift by 1, with saturation to unsigned 8-bit)
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store: one 8-byte output row per register
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none  0xff
+// Return the next marker byte.
+// A marker left pending by the entropy decoder (j->marker) takes priority:
+// it is handed back and the pending slot is cleared. Otherwise one byte is
+// read from the stream; unless it is 0xff there is no marker here and
+// STBI__MARKER_none (0xff, never a valid marker value) is returned. After a
+// 0xff, any further 0xff fill bytes are skipped and the first byte that
+// differs is returned.
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc m = j->marker;
+   if (m != STBI__MARKER_none) {
+      j->marker = STBI__MARKER_none;
+      return m;
+   }
+
+   m = stbi__get8(j->s);
+   if (m != 0xff)
+      return STBI__MARKER_none;
+
+   // m is 0xff here, so at least one more byte is always fetched
+   do {
+      m = stbi__get8(j->s); // consume repeated 0xff fill bytes
+   } while (m == 0xff);
+   return m;
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]
+//
+// STBI__RESTART(x) is nonzero when marker byte x lies in 0xd0..0xd7,
+// i.e. it is one of the eight restart markers (RST0..RST7).
+#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+
+// Reset the entropy decoder state at the start of a scan and after every
+// restart interval: empty the bit buffer, clear the "no more data" flag,
+// the pending marker and the EOB run, zero the DC predictor of all four
+// component slots, and reload the MCU countdown.
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   int c;
+
+   // bit reader
+   j->code_bits = 0;
+   j->code_buffer = 0;
+   j->nomore = 0;
+
+   // DC prediction starts over for every component
+   for (c = 0; c < 4; ++c)
+      j->img_comp[c].dc_pred = 0;
+
+   j->marker = STBI__MARKER_none;
+
+   // MCUs left until the next restart marker is expected. With no restart
+   // interval the counter is set to 0x7fffffff; that is plenty safe, since
+   // we don't even allow 1<<30 pixels.
+   if (j->restart_interval)
+      j->todo = j->restart_interval;
+   else
+      j->todo = 0x7fffffff;
+
+   j->eob_run = 0;
+}
+
+// Decode the entropy-coded data of one scan.
+// Four cases, chosen by z->progressive and z->scan_n:
+//   baseline,    one component : decode + IDCT each block in scanline order
+//   baseline,    interleaved   : decode + IDCT each component's h*v blocks per MCU
+//   progressive, one component : accumulate DC or AC data into the coeff buffer
+//   progressive, interleaved   : accumulate DC data into the coeff buffer
+// In the progressive cases no IDCT is run here (stbi__jpeg_finish does that).
+// After every MCU the restart countdown z->todo is decremented; when it runs
+// out, the decoder is reset if a restart marker is pending, otherwise the scan
+// ends early.
+// Returns 0 if a block fails to decode, 1 otherwise (including the early exit
+// when an expected restart marker is missing).
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{
+   stbi__jpeg_reset(z);
+   if (!z->progressive) {
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               // block (i,j) lands at pixel offset (i*8, j*8) in the component plane
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        // x2,y2: pixel position of this block within the component plane
+                        int x2 = (i*z->img_comp[n].h + x)*8;
+                        int y2 = (j*z->img_comp[n].v + y)*8;
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   } else {
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               // coefficients of block (i,j): 64 shorts per block, coeff_w blocks per row
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               // spec_start == 0 selects the DC decoder, anything else the AC decoder
+               if (z->spec_start == 0) {
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        // x2,y2 are block (not pixel) coordinates here
+                        int x2 = (i*z->img_comp[n].h + x);
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        // this branch always uses the DC decoder; spec_start is not consulted
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   }
+}
+
+// Multiply each of the 64 coefficients in data[] in place by the entry at
+// the same index of the quantization table dequant[].
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
+{
+   short *end = data + 64;
+   while (data != end) {
+      *data *= *dequant;
+      ++data;
+      ++dequant;
+   }
+}
+
+// Final step for progressive images: every block of every component is
+// dequantized in place in the coefficient buffer and then run through the
+// IDCT kernel into the component's pixel plane. Does nothing for
+// non-progressive images.
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   int comp, bx, by;
+
+   if (!z->progressive)
+      return;
+
+   for (comp = 0; comp < z->s->img_n; ++comp) {
+      // number of 8x8 blocks needed to cover the component's real pixels
+      int blocks_w = (z->img_comp[comp].x + 7) >> 3;
+      int blocks_h = (z->img_comp[comp].y + 7) >> 3;
+      for (by = 0; by < blocks_h; ++by) {
+         for (bx = 0; bx < blocks_w; ++bx) {
+            // 64 coefficients per block, coeff_w blocks per row
+            short *block = z->img_comp[comp].coeff + 64 * (bx + by * z->img_comp[comp].coeff_w);
+            stbi__jpeg_dequantize(block, z->dequant[z->img_comp[comp].tq]);
+            z->idct_block_kernel(z->img_comp[comp].data+z->img_comp[comp].w2*by*8+bx*8, z->img_comp[comp].w2, block);
+         }
+      }
+   }
+}
+
+// Handle one marker segment that is not a frame or scan header.
+//   m - the marker byte (as returned by stbi__get_marker)
+// Handles DRI (0xDD), DQT (0xDB), DHT (0xC4), APP0..APP15 (0xE0..0xEF) and
+// COM (0xFE). Returns 1 on success; returns 0 on a malformed segment, and for
+// STBI__MARKER_none or any other marker value reports an error via stbi__err.
+static int stbi__process_marker(stbi__jpeg *z, int m)
+{
+   int L;
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
+
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
+
+      case 0xDB: // DQT - define quantization table
+         // L counts the payload bytes left after the 2-byte length field;
+         // one segment may carry several tables
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            int q = stbi__get8(z->s);
+            // high nibble: 0 = 8-bit entries, 1 = 16-bit entries; low nibble: table index
+            int p = q >> 4, sixteen = (p != 0);
+            int t = q & 15,i;
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
+            // entries arrive in zigzag order; store them de-zigzagged
+            for (i=0; i < 64; ++i)
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            // 1 header byte + 64 entries of 2 bytes or 1 byte each
+            L -= (sixteen ? 129 : 65);
+         }
+         // the tables must account for the segment length exactly
+         return L==0;
+
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
+            int q = stbi__get8(z->s);
+            // high nibble: table class (0 selects huff_dc, otherwise huff_ac);
+            // low nibble: table index
+            int tc = q >> 4;
+            int th = q & 15;
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            // 16 counts, one per code length; n is the total number of symbols
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i];
+            }
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
+            // 1 header byte + 16 count bytes consumed so far
+            L -= 17;
+            if (tc == 0) {
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
+            } else {
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
+            }
+            // the n symbol values follow the counts
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+         }
+         return L==0;
+   }
+
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      // from here on L is the number of payload bytes not yet consumed
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         // all 5 bytes are always consumed, even after a mismatch
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            // 6 more bytes: version, two 16-bit flag words, color transform
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      // skip whatever is left of the segment
+      stbi__skip(z->s, L);
+      return 1;
+   }
+
+   return stbi__err("unknown marker","Corrupt JPEG");
+}
+
+// after we see SOS
+// Parse the scan header: the number of components in the scan, and for each
+// one its component id and DC/AC huffman table selectors, followed by the
+// spectral selection range and the successive-approximation bits.
+// Fills z->scan_n, z->order[], the hd/ha fields of the selected components,
+// z->spec_start, z->spec_end, z->succ_high and z->succ_low.
+// Returns 1 on success; on any malformed field it reports the problem
+// through stbi__err and returns 0.
+static int stbi__process_scan_header(stbi__jpeg *z)
+{
+   int i;
+   int Ls = stbi__get16be(z->s);
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   // 2 length bytes + 1 count byte + 2 bytes per component + 3 trailing bytes
+   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
+   for (i=0; i < z->scan_n; ++i) {
+      int id = stbi__get8(z->s), which;
+      int q = stbi__get8(z->s);
+      // find the frame component this scan entry refers to
+      for (which = 0; which < z->s->img_n; ++which)
+         if (z->img_comp[which].id == id)
+            break;
+      // no match: report it like every other failure here, so the caller does
+      // not end up with a stale or missing failure reason
+      if (which == z->s->img_n) return stbi__err("bad SOS component","Corrupt JPEG");
+      // high nibble selects the DC huffman table, low nibble the AC table
+      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[i] = which;
+   }
+
+   {
+      int aa;
+      z->spec_start = stbi__get8(z->s);
+      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+      aa = stbi__get8(z->s);
+      z->succ_high = (aa >> 4);
+      z->succ_low  = (aa & 15);
+      if (z->progressive) {
+         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+            return stbi__err("bad SOS", "Corrupt JPEG");
+      } else {
+         // non-progressive scans must start at coefficient 0 with no successive
+         // approximation; spec_end is forced to 63 whatever the file said
+         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         z->spec_end = 63;
+      }
+   }
+
+   return 1;
+}
+
+// Free the pixel, coefficient and line buffers of the first ncomp components
+// and clear every pointer that referred to them. `why` is returned unchanged,
+// so a caller can free and return its status code in one statement.
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{
+   int c = 0;
+   while (c < ncomp) {
+      // data points into raw_data, so both are cleared together
+      if (z->img_comp[c].raw_data != NULL) {
+         STBI_FREE(z->img_comp[c].raw_data);
+         z->img_comp[c].raw_data = NULL;
+         z->img_comp[c].data = NULL;
+      }
+      // likewise coeff points into raw_coeff
+      if (z->img_comp[c].raw_coeff != NULL) {
+         STBI_FREE(z->img_comp[c].raw_coeff);
+         z->img_comp[c].raw_coeff = NULL;
+         z->img_comp[c].coeff = NULL;
+      }
+      if (z->img_comp[c].linebuf != NULL) {
+         STBI_FREE(z->img_comp[c].linebuf);
+         z->img_comp[c].linebuf = NULL;
+      }
+      ++c;
+   }
+   return why;
+}
+
+// Parse the frame header (SOF): sample precision, image size, component
+// count, and each component's id, sampling factors and quantization table.
+//   scan - when it is not STBI__SCAN_load, the function returns right after
+//          the header has been validated, before anything is allocated
+// For a full load it also derives the MCU layout and allocates the
+// per-component pixel planes (plus coefficient buffers when z->progressive).
+// Returns 1 on success; on failure reports via stbi__err and returns 0,
+// freeing the components allocated so far if an allocation fails.
+static int stbi__process_frame_header(stbi__jpeg *z, int scan)
+{
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   c = stbi__get8(s);
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
+   s->img_n = c;
+   // clear the buffer pointers of the components in use before anything can fail
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
+
+   // 8 fixed bytes (length, precision, height, width, count) + 3 per component
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
+
+   // z->rgb counts how many of the three component ids are 'R','G','B' in order
+   z->rgb = 0;
+   for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+      z->img_comp[i].id = stbi__get8(s);
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
+      q = stbi__get8(s);
+      // high nibble: horizontal sampling factor, low nibble: vertical; both must be 1..4
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
+
+   // header-only callers stop here; nothing has been allocated yet
+   if (scan != STBI__SCAN_load) return 1;
+
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
+
+   // largest sampling factors over all components
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
+   // (image size in MCUs, rounded up)
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      // presumably w2*h2 bytes plus 15 spare bytes for the alignment below --
+      // confirm against stbi__malloc_mad2
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+      // align blocks for idct using mmx/sse
+      // (round the raw pointer up to the next multiple of 16)
+      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+      if (z->progressive) {
+         // w2, h2 are multiples of 8 (see above)
+         // coeff_w, coeff_h: size of the coefficient buffer in 8x8 blocks
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      }
+   }
+
+   return 1;
+}
+
+// use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define stbi__DNL(x)         ((x) == 0xdc) // DNL: define number of lines
+#define stbi__SOI(x)         ((x) == 0xd8) // SOI: start of image
+#define stbi__EOI(x)         ((x) == 0xd9) // EOI: end of image
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) // SOF0/SOF1/SOF2: the frame headers this decoder accepts
+#define stbi__SOS(x)         ((x) == 0xda) // SOS: start of scan
+
+#define stbi__SOF_progressive(x)   ((x) == 0xc2) // SOF2 is the progressive frame type; note every macro here evaluates x more than once
+
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{
+   // Reads the stream from SOI up to and including the frame header (SOF).
+   // 'scan' is one of the STBI__SCAN_* modes: STBI__SCAN_type stops as soon
+   // as SOI has been verified, every other mode is passed on to
+   // stbi__process_frame_header. Returns 1 on success, 0 on failure.
+   int marker;
+
+   // reset the per-file header state before reading anything
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
+   z->marker = STBI__MARKER_none; // no marker cached yet
+
+   // the stream has to open with SOI
+   marker = stbi__get_marker(z);
+   if (!stbi__SOI(marker))
+      return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type)
+      return 1;
+
+   // process every segment that precedes the frame header
+   for (marker = stbi__get_marker(z); !stbi__SOF(marker); ) {
+      if (!stbi__process_marker(z, marker))
+         return 0;
+      // some files have extra padding after their blocks, so keep asking
+      // for a marker until one shows up or the input runs out
+      for (marker = stbi__get_marker(z); marker == STBI__MARKER_none; marker = stbi__get_marker(z)) {
+         if (stbi__at_eof(z->s))
+            return stbi__err("no SOF", "Corrupt JPEG");
+      }
+   }
+
+   z->progressive = stbi__SOF_progressive(marker);
+   return stbi__process_frame_header(z, scan) ? 1 : 0;
+}
+
+static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{
+   // Some JPEGs carry junk after the entropy-coded data. Skip over it, but
+   // if something that looks like a valid marker turns up, return its code
+   // so decoding can resume there. Returns STBI__MARKER_none when the input
+   // ends first.
+   for (;;) {
+      int byte;
+      if (stbi__at_eof(j->s))
+         break;
+      byte = stbi__get8(j->s);
+      if (byte != 0xff)
+         continue; // plain junk byte
+
+      // 0xff might introduce a marker; repeated 0xff bytes are lead-in
+      // fill, so keep reading until something else arrives
+      do {
+         if (stbi__at_eof(j->s))
+            return STBI__MARKER_none;
+         byte = stbi__get8(j->s);
+      } while (byte == 0xff);
+
+      // 0xff 0x00 is a stuffed zero, not a marker: go back to scanning.
+      // anything else looks like an actual marker.
+      if (byte != 0x00)
+         return byte;
+   }
+   return STBI__MARKER_none;
+}
+
+// decode image to YCbCr format
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{
+   // Decodes the header and then every segment up to EOI, leaving the
+   // decoded planes in j->img_comp[]. Returns 1 on success and 0 on failure;
+   // nothing is freed here on either path.
+   int marker, comp;
+
+   // null the plane pointers before anything that can fail
+   for (comp = 0; comp < 4; ++comp) {
+      j->img_comp[comp].raw_data = NULL;
+      j->img_comp[comp].raw_coeff = NULL;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
+      return 0;
+
+   marker = stbi__get_marker(j);
+   while (!stbi__EOI(marker)) {
+      const int is_scan = stbi__SOS(marker);
+      if (is_scan) {
+         if (!stbi__process_scan_header(j)) return 0;
+         if (!stbi__parse_entropy_coded_data(j)) return 0;
+         // the entropy decoder left no marker cached: skip trailing junk.
+         // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
+         if (j->marker == STBI__MARKER_none)
+            j->marker = stbi__skip_jpeg_junk_at_end(j);
+      } else if (stbi__DNL(marker)) {
+         // DNL payload: both 16-bit fields are read before either is checked
+         const int Ld = stbi__get16be(j->s);
+         const stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+      } else {
+         // a marker that cannot be processed ends decoding here, but the
+         // image decoded so far is still reported as a success
+         if (!stbi__process_marker(j, marker)) return 1;
+      }
+
+      marker = stbi__get_marker(j);
+      // a restart marker directly following a scan is skipped
+      if (is_scan && STBI__RESTART(marker))
+         marker = stbi__get_marker(j);
+   }
+
+   if (j->progressive)
+      stbi__jpeg_finish(j);
+   return 1;
+}
+
+// static jfif-centered resampling (across block boundaries)
+
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, // row upsampler: in0 is the nearer source row, in1 the farther one; returns the row to read (out, or an input row when no work is needed)
+                                    int w, int hs); // w: samples in the source row, hs: horizontal expansion factor
+
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) // divide by 4 and narrow to a byte; callers add 2 beforehand to round
+
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // No expansion in either axis: the nearer input row is used as-is and
+   // nothing is written to 'out'.
+   STBI_NOTUSED(hs);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(out);
+   return in_near;
+}
+
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // Vertical 2x expansion: each output sample is a 3:1 blend of the nearer
+   // and the farther input row, rounded. Writes w samples and returns 'out'.
+   int x = 0;
+   STBI_NOTUSED(hs);
+   while (x < w) {
+      const int blended = 3*in_near[x] + in_far[x] + 2;
+      out[x] = stbi__div4(blended);
+      ++x;
+   }
+   return out;
+}
+
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // Horizontal 2x expansion: every input sample yields two outputs, each a
+   // 3:1 blend with the neighbour on that side. The two outermost outputs
+   // copy the edge samples. Writes 2*w samples and returns 'out'.
+   const stbi_uc *src = in_near;
+   int x;
+
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
+
+   if (w == 1) {
+      // a single sample has no neighbour to interpolate with
+      out[0] = out[1] = src[0];
+      return out;
+   }
+
+   // left edge
+   out[0] = src[0];
+   out[1] = stbi__div4(src[0]*3 + src[1] + 2);
+
+   // interior samples
+   x = 1;
+   while (x < w-1) {
+      const int center = 3*src[x] + 2;
+      out[x*2+0] = stbi__div4(center + src[x-1]);
+      out[x*2+1] = stbi__div4(center + src[x+1]);
+      ++x;
+   }
+
+   // right edge (x is the index of the last sample here)
+   out[x*2+0] = stbi__div4(src[w-2]*3 + src[w-1] + 2);
+   out[x*2+1] = src[w-1];
+
+   return out;
+}
+
+// divide by 16 and narrow to a byte; callers add 8 beforehand to round
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
+
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // 2x expansion in both axes. Each column is first blended vertically
+   // (3*near + far), then neighbouring columns are blended 3:1 horizontally.
+   // Writes 2*w samples and returns 'out'.
+   int x, prev, cur;
+
+   STBI_NOTUSED(hs);
+
+   cur = 3*in_near[0] + in_far[0];
+   if (w == 1) {
+      // only one column: no horizontal blend possible
+      out[0] = out[1] = stbi__div4(cur + 2);
+      return out;
+   }
+
+   // left edge uses the vertical blend alone
+   out[0] = stbi__div4(cur + 2);
+   for (x = 1; x < w; ++x) {
+      prev = cur;
+      cur = 3*in_near[x] + in_far[x];
+      out[x*2-1] = stbi__div16(3*prev + cur + 8);
+      out[x*2  ] = stbi__div16(3*cur + prev + 8);
+   }
+   // right edge, likewise
+   out[w*2-1] = stbi__div4(cur + 2);
+
+   return out;
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // SIMD variant of the 2x2 upsampler: generates 2*w output samples in 'out' from two input rows (in_near weighted 3, in_far weighted 1) and returns 'out'; hs is unused
+   int i=0,t0,t1; // t1 holds the vertically filtered value (3*near + far) of the most recent pixel, t0 the one before it
+
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0];
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) {
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in (index i+8 is at most w-1 because of the loop bound).
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in (index i+8 is at most w-1 because of the loop bound).
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   t0 = t1;
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8); // first pixel left to scalar code: out[i*2-1] was already stored by the vector loop; when i == 0, t0 == t1 and this equals stbi__div4(t1+2), the left-edge value
+
+   for (++i; i < w; ++i) {
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2); // right edge: vertical blend only
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif // STBI_SSE2 || STBI_NEON
+
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // Fallback for any other expansion factor: nearest-neighbour, i.e. each
+   // of the w input samples is repeated hs times. Returns 'out'.
+   stbi_uc *dst = out;
+   int x, rep;
+   STBI_NOTUSED(in_far);
+   for (x = 0; x < w; ++x) {
+      for (rep = 0; rep < hs; ++rep)
+         *dst++ = in_near[x];
+   }
+   return out;
+}
+
+// this is a reduced-precision calculation of YCbCr-to-RGB introduced
+// to make sure the code produces the same results in both SIMD and scalar
+// coefficient -> fixed point: 12 fractional bits (rounded), then shifted up a further 8
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   // Converts 'count' YCbCr samples to RGB. Every pixel stores four bytes
+   // (R, G, B, 255) and then 'out' moves on by 'step' bytes.
+   int i;
+   for (i=0; i < count; ++i, out += step) {
+      // luma in 20-bit fixed point, with half a unit added for rounding
+      const int luma = (y[i] << 20) + (1<<19);
+      // chroma re-centred around zero
+      const int cr = pcr[i] - 128;
+      const int cb = pcb[i] - 128;
+      int r = luma +  cr* stbi__float2fixed(1.40200f);
+      // the Cb term is masked down to its upper 16 bits before being added
+      int g = luma + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      int b = luma                                     +   cb* stbi__float2fixed(1.77200f);
+
+      // drop the fraction
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+
+      // clamp each channel to 0..255
+      if (r < 0) r = 0; else if (r > 255) r = 255;
+      if (g < 0) g = 0; else if (g > 255) g = 255;
+      if (b < 0) b = 0; else if (b > 255) b = 255;
+
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+   }
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{ // SIMD YCbCr->RGB for one row: the vector paths handle step == 4 eight pixels at a time; the scalar loop at the end converts whatever remains (everything when step != 4)
+   int i = 0; // index of the next pixel still to convert; shared by the vector loops and the scalar tail
+
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) {
+         // load 8 samples of each plane
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte (saturating), set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store 8 pixels * 4 bytes
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32;
+      }
+   }
+#endif
+
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) {
+         // load 8 samples of each plane
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
+
+   for (; i < count; ++i) { // scalar tail: same arithmetic as stbi__YCbCr_to_RGB_row
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // stored for every pixel, whatever 'step' is
+      out += step;
+   }
+}
+#endif // STBI_SSE2 || STBI_NEON
+
+// set up the kernels
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   // Installs the three kernel function pointers on the decoder.
+
+   // the plain C kernels are the default
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->idct_block_kernel = stbi__idct_block;
+
+#ifdef STBI_SSE2
+   // SSE2 build: the SIMD kernels are used only if the runtime check passes
+   if (stbi__sse2_available()) {
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      j->idct_block_kernel = stbi__idct_simd;
+   }
+#endif
+
+#ifdef STBI_NEON
+   // NEON build: the SIMD kernels are installed unconditionally
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   j->idct_block_kernel = stbi__idct_simd;
+#endif
+}
+
+// clean up the temporary component buffers
+static void stbi__cleanup_jpeg(stbi__jpeg *j)
+{
+   // Hands every component of the image (count taken from j->s->img_n) to
+   // stbi__free_jpeg_components; its return value is not needed here.
+   const int ncomp = j->s->img_n;
+   stbi__free_jpeg_components(j, ncomp, 0);
+}
+
+typedef struct // per-component upsampling state used while converting decoded planes to output rows
+{
+   resample_row_func resample; // row kernel chosen from hs/vs
+   stbi_uc *line0,*line1; // two adjacent source rows: line0 the earlier one, line1 the later one (both start at the first row)
+   int hs,vs;   // expansion factor in each axis
+   int w_lores; // horizontal pixels pre-expansion
+   int ystep;   // how far through vertical expansion we are
+   int ypos;    // which pre-expansion row we're on
+} stbi__resample;
+
+// fast 0..255 * 0..255 => 0..255 rounded multiplication
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   // Approximates x*y/255 with rounding, using only adds and shifts.
+   unsigned int prod = (unsigned int) (x*y + 128);
+   prod += prod >> 8;
+   return (stbi_uc) (prod >> 8);
+}
+
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
+{ // Decodes the JPEG behind z and returns a newly allocated image with n bytes per pixel (n = req_comp, or 3/1 when req_comp is 0), or NULL on failure; *out_x/*out_y receive the size, *comp (if non-NULL) 3 or 1 for the source
+   int n, decode_n, is_rgb; // n: output bytes per pixel; decode_n: number of planes that get resampled; is_rgb: the three planes already hold R,G,B
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+   // treat the data as RGB if z->rgb reached 3, or if the APP14 colour transform is 0 and the jfif flag is clear
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
+      decode_n = 1; // grey output from a non-RGB 3-component image only needs the first plane
+   else
+      decode_n = z->s->img_n;
+
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; // per-plane row returned by the resampler for the current output row
+
+      stbi__resample res_comp[4];
+
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
+
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data;
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // allocate the output image; this is the last step in this function that can fail
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); // NOTE(review): the trailing 1 presumably pads for the out[3] store made on the final pixel when n < 4 - confirm
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j;
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            int y_bot = r->ystep >= (r->vs >> 1); // second half of the vertical expansion step: line1 becomes the nearer row
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) {
+               r->ystep = 0;
+               r->line0 = r->line1;
+               if (++r->ypos < z->img_comp[k].y) // line1 only advances while source rows remain
+                  r->line1 += z->img_comp[k].w2;
+            }
+         }
+         if (n >= 3) {
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else { // n is 1 or 2: grey, optionally followed by an opaque alpha byte
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { // CMYK: scale each channel by the fourth plane, then reduce to grey
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255; // stored even when n == 1
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { // YCCK: inverted first plane scaled by the fourth plane
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255; // stored even when n == 1
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+            }
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+      return output;
+   }
+}
+
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   // Loader entry point: builds a temporary zeroed decoder state on the
+   // heap, decodes through load_jpeg_image and frees the state again.
+   // Returns the pixel buffer, or NULL on failure. 'ri' is not used.
+   unsigned char *pixels;
+   stbi__jpeg *jpeg;
+
+   STBI_NOTUSED(ri);
+
+   jpeg = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (jpeg == NULL)
+      return stbi__errpuc("outofmem", "Out of memory");
+
+   memset(jpeg, 0, sizeof(stbi__jpeg));
+   jpeg->s = s;
+   stbi__setup_jpeg(jpeg);
+
+   pixels = load_jpeg_image(jpeg, x, y, comp, req_comp);
+
+   STBI_FREE(jpeg);
+   return pixels;
+}
+
+static int stbi__jpeg_test(stbi__context *s)
+{
+   // Format probe: runs the header decoder in STBI__SCAN_type mode, which
+   // only checks for SOI, then rewinds the stream. Returns that result
+   // (non-zero means the data starts like a JPEG), or the stbi__err value
+   // if the temporary state cannot be allocated.
+   int is_jpeg;
+   stbi__jpeg *probe = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (probe == NULL)
+      return stbi__err("outofmem", "Out of memory");
+
+   memset(probe, 0, sizeof(stbi__jpeg));
+   probe->s = s;
+   stbi__setup_jpeg(probe);
+
+   is_jpeg = stbi__decode_jpeg_header(probe, STBI__SCAN_type);
+
+   stbi__rewind(s);
+   STBI_FREE(probe);
+   return is_jpeg;
+}
+
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   // Parses the header only and reports width, height and channel count
+   // (3 or 1) through whichever of x, y, comp are non-NULL. Returns 1 on
+   // success; on failure the stream is rewound and 0 is returned.
+   if (stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+      if (x != NULL) *x = j->s->img_x;
+      if (y != NULL) *y = j->s->img_y;
+      if (comp != NULL) *comp = (j->s->img_n >= 3) ? 3 : 1;
+      return 1;
+   }
+
+   stbi__rewind( j->s );
+   return 0;
+}
+
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Wrapper around stbi__jpeg_info_raw that supplies a temporary, zeroed
+   // decoder state. Returns what the raw query returns, or the stbi__err
+   // value if the state cannot be allocated.
+   int ok;
+   stbi__jpeg *jpeg = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (jpeg == NULL)
+      return stbi__err("outofmem", "Out of memory");
+
+   memset(jpeg, 0, sizeof(stbi__jpeg));
+   jpeg->s = s;
+
+   ok = stbi__jpeg_info_raw(jpeg, x, y, comp);
+
+   STBI_FREE(jpeg);
+   return ok;
+}
+#endif
+
+// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
+//    simple implementation
+//      - all input must be provided in an upfront buffer
+//      - all output is written to a single output buffer (can malloc/realloc)
+//    performance
+//      - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// fast-way is faster to check than jpeg huffman, but slow way is slower
+#define STBI__ZFAST_BITS  9 // codes of at most this many bits get entries in the 'fast' lookup table; 9 accelerates all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1) // mask with the low STBI__ZFAST_BITS bits set
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
+
+// zlib-style huffman encoding
+// (JPEG packs bits from the left, zlib from the right, so the two can't share code)
+typedef struct // one Huffman table, filled in by stbi__zbuild_huffman
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS]; // direct lookup for short codes, indexed by bit-reversed code; entry is (code length << 9) | symbol, 0 if unset
+   stbi__uint16 firstcode[16]; // first code value of each code length
+   int maxcode[17]; // per length: one past the last code, pre-shifted up to 16 bits; [16] is a 0x10000 sentinel
+   stbi__uint16 firstsymbol[16]; // index into size[]/value[] of the first symbol of each code length
+   stbi_uc  size[STBI__ZNSYMS]; // code length of each entry, in code order
+   stbi__uint16 value[STBI__ZNSYMS]; // symbol of each entry, in code order
+} stbi__zhuffman;
+
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  // Reverses the order of the low 16 bits of n; anything above bit 15 is
+  // discarded by the first step. Done as four swap passes of growing width.
+  n = ((n & 0x5555) << 1) | ((n & 0xAAAA) >>  1);  // swap neighbouring bits
+  n = ((n & 0x3333) << 2) | ((n & 0xCCCC) >>  2);  // swap bit pairs
+  n = ((n & 0x0F0F) << 4) | ((n & 0xF0F0) >>  4);  // swap nibbles
+  n = ((n & 0x00FF) << 8) | ((n & 0xFF00) >>  8);  // swap bytes
+  return n;
+}
+
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   // Reverses the low 'bits' bits of v (bits must not exceed 16): reverse
+   // all 16, then shift the surplus away, e.g. for 11 bits shift away 5.
+   int reversed16;
+   STBI_ASSERT(bits <= 16);
+   reversed16 = stbi__bitreverse16(v);
+   return reversed16 >> (16-bits);
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
+{
+   // Builds the decoding tables in *z from a list of per-symbol code
+   // lengths (sizelist[0..num-1], 0 meaning "symbol unused"), following the
+   // DEFLATE rules for assigning codes. Returns 1 on success, or the result
+   // of stbi__err when the lengths cannot form a valid code.
+   int sym, len, symbols_so_far = 0;
+   int code, next_code[16], sizes[17];
+
+   // histogram of code lengths; length 0 does not count
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast));
+   for (sym = 0; sym < num; ++sym)
+      sizes[sizelist[sym]] += 1;
+   sizes[0] = 0;
+   for (len = 1; len < 16; ++len) {
+      if (sizes[len] > (1 << len))
+         return stbi__err("bad sizes", "Corrupt PNG");
+   }
+
+   // hand out the first code of every length
+   code = 0;
+   for (len = 1; len < 16; ++len) {
+      const int count = sizes[len];
+      next_code[len] = code;
+      z->firstcode[len] = (stbi__uint16) code;
+      z->firstsymbol[len] = (stbi__uint16) symbols_so_far;
+      code += count;
+      if (count && code-1 >= (1 << len))
+         return stbi__err("bad codelengths","Corrupt PNG");
+      z->maxcode[len] = code << (16-len); // preshift for inner loop
+      code <<= 1;
+      symbols_so_far += count;
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+
+   // assign a code to every used symbol and fill the tables
+   for (sym = 0; sym < num; ++sym) {
+      const int s = sizelist[sym];
+      int slot;
+      if (s == 0)
+         continue;
+      slot = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+      z->size [slot] = (stbi_uc     ) s;
+      z->value[slot] = (stbi__uint16) sym;
+      if (s <= STBI__ZFAST_BITS) {
+         // short code: replicate it into every fast-table slot whose low
+         // s bits equal the bit-reversed code
+         const stbi__uint16 fastv = (stbi__uint16) ((s << 9) | sym);
+         int j;
+         for (j = stbi__bit_reverse(next_code[s],s); j < (1 << STBI__ZFAST_BITS); j += (1 << s))
+            z->fast[j] = fastv;
+      }
+      ++next_code[s];
+   }
+   return 1;
+}
+
+// zlib-from-memory implementation for PNG reading
+//    because PNG allows splitting the zlib stream arbitrarily,
+//    and it's annoying structurally to have PNG call ZLIB call PNG,
+//    we require PNG read all the IDATs and combine them into a single
+//    memory buffer
+
+// State of one in-memory inflate operation.
+typedef struct
+{
+   stbi_uc *zbuffer, *zbuffer_end; // next input byte to read / one past the last input byte
+   int num_bits;                   // number of bits currently held in code_buffer
+   stbi__uint32 code_buffer;       // bit accumulator; the next bit to consume is the lowest one
+
+   char *zout;                     // current write position in the output buffer
+   char *zout_start;               // start of the output buffer
+   char *zout_end;                 // one past the end of the output buffer
+   int   z_expandable;             // nonzero if stbi__zexpand may reallocate the output buffer
+
+   stbi__zhuffman z_length, z_distance; // decode tables for literal/length and distance codes
+} stbi__zbuf;
+
+// Nonzero once every input byte has been consumed.
+stbi_inline static int stbi__zeof(stbi__zbuf *z)
+{
+   return (z->zbuffer_end <= z->zbuffer);
+}
+
+// Read the next input byte and advance; yields 0 once the input is exhausted.
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   if (stbi__zeof(z))
+      return 0;
+   return *z->zbuffer++;
+}
+
+// Top up the bit accumulator one byte at a time until it holds more than 24 bits.
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   for (;;) {
+      // bits at or above num_bits must be clear; otherwise the state is inconsistent
+      if (z->code_buffer >= (1U << z->num_bits)) {
+        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+        return;
+      }
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
+      z->num_bits += 8;
+      if (z->num_bits > 24)
+         break;
+   }
+}
+
+// Consume n bits from the stream and return them (first bit read is the lowest bit).
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int value;
+   if (z->num_bits < n)
+      stbi__fill_bits(z);
+   value = z->code_buffer & ((1 << n) - 1);
+   z->num_bits -= n;
+   z->code_buffer >>= n;
+   return value;
+}
+
+// Decode one symbol whose code is longer than STBI__ZFAST_BITS bits.
+// Returns the symbol and consumes its bits, or returns -1 if no valid code matches.
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   // find the shortest length whose preshifted upper bound exceeds k;
+   // maxcode[16] is a sentinel larger than any 16-bit k, so this terminates
+   for (s=STBI__ZFAST_BITS+1; ; ++s)
+      if (k < z->maxcode[s])
+         break;
+   if (s >= 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
+   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
+   a->code_buffer >>= s;
+   a->num_bits -= s;
+   return z->value[b];
+}
+
+// Decode the next Huffman symbol from the stream using table z.
+// Returns the symbol, or -1 on unexpected end of data or an invalid code.
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int entry, codelen;
+   if (a->num_bits < 16) {
+      if (stbi__zeof(a))
+         return -1;   /* report error for unexpected end of data. */
+      stbi__fill_bits(a);
+   }
+   entry = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   // a zero entry means the code is too long for the fast table
+   if (!entry)
+      return stbi__zhuffman_decode_slowpath(a, z);
+   // fast entry: code length in the high bits, symbol in the low 9 bits
+   codelen = entry >> 9;
+   a->code_buffer >>= codelen;
+   a->num_bits -= codelen;
+   return entry & 511;
+}
+
+// Grow the output buffer so at least n more bytes fit after zout.
+//   z    : inflate state; zout_start/zout/zout_end are updated on success
+//   zout : the caller's current write position (stored into z->zout first)
+//   n    : number of additional bytes required
+// The capacity is doubled until it is large enough. Returns 1 on success,
+// otherwise the result of stbi__err() (buffer not expandable, size overflow,
+// or allocation failure).
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
+{
+   char *q;
+   unsigned int cur, limit, old_limit;
+   z->zout = zout;
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (unsigned int) (z->zout - z->zout_start);
+   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
+   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
+   // a zero-sized buffer never grows by doubling (0*2 == 0), which would spin
+   // forever below; start from one byte instead
+   if (limit == 0) limit = 1;
+   while (cur + n > limit) {
+      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
+      limit *= 2;
+   }
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory");
+   // the block may have moved; rebase all three output pointers
+   z->zout_start = q;
+   z->zout       = q + cur;
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+// Base match length for each length symbol, indexed by (symbol - 257).
+static const int stbi__zlength_base[31] = {
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 };
+
+// Number of extra bits to read and add to the base length, same indexing.
+static const int stbi__zlength_extra[31]=
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+// Base match distance for each distance symbol.
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+
+// Number of extra bits to read and add to the base distance.
+// Only 30 values are listed; the last two entries are implicitly zero.
+static const int stbi__zdist_extra[32] =
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+// Decode one Huffman-compressed DEFLATE block into the output buffer, using
+// the tables already built in a->z_length and a->z_distance.
+// Literals are copied straight to the output; length/distance pairs copy
+// previously written bytes. Returns 1 when the end-of-block symbol (256) is
+// reached, otherwise the result of stbi__err() or 0 if the output could not
+// be expanded.
+static int stbi__parse_huffman_block(stbi__zbuf *a)
+{
+   char *zout = a->zout;
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) {
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout; // the buffer may have moved
+         }
+         *zout++ = (char) z;
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) {
+            a->zout = zout;
+            return 1;
+         }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
+         z -= 257;
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
+         // compare lengths rather than forming zout+len, which could point
+         // past the end of the buffer (undefined behaviour)
+         if (len > a->zout_end - zout) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout; // the buffer may have moved
+         }
+         p = (stbi_uc *) (zout - dist);
+         // byte-by-byte copy: source and destination may overlap when dist < len
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            if (len) { do *zout++ = *p++; while (--len); }
+         }
+      }
+   }
+}
+
+// Read the header of a dynamic-Huffman block and build a->z_length and
+// a->z_distance from it.
+// The header carries the code lengths of a small "code length" alphabet
+// (19 symbols), which is then used to decode the run-length-encoded code
+// lengths of the literal/length and distance alphabets.
+// Returns 1 on success, 0 or the result of stbi__err() on failure.
+static int stbi__compute_huffman_codes(stbi__zbuf *a)
+{
+   // order in which the code-length code lengths are stored in the stream
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+
+   int hlit  = stbi__zreceive(a,5) + 257; // number of literal/length code lengths
+   int hdist = stbi__zreceive(a,5) + 1;   // number of distance code lengths
+   int hclen = stbi__zreceive(a,4) + 4;   // number of code-length code lengths stored
+   int ntot  = hlit + hdist;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes));
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < ntot) {
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c; // symbols 0..15 are literal code lengths
+      else {
+         // symbols 16..18 are repeat codes; c is reused as the repeat count
+         stbi_uc fill = 0;
+         if (c == 16) {
+            // repeat the previous length 3..6 times
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) {
+            // repeat zero 3..10 times
+            c = stbi__zreceive(a,3)+3;
+         } else if (c == 18) {
+            // repeat zero 11..138 times
+            c = stbi__zreceive(a,7)+11;
+         } else {
+            return stbi__err("bad codelengths", "Corrupt PNG");
+         }
+         // a run must not extend past the declared number of lengths
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
+         n += c;
+      }
+   }
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
+   // literal/length lengths come first, distance lengths follow
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
+   return 1;
+}
+
+// Copy one stored (uncompressed) DEFLATE block to the output.
+// The block starts at the next byte boundary with a 4-byte header:
+// LEN (16-bit little endian) followed by its one's complement NLEN.
+// Returns 1 on success, otherwise the result of stbi__err() or 0 if the
+// output could not be expanded.
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
+{
+   stbi_uc header[4];
+   int len,nlen,k;
+   if (a->num_bits & 7)
+      stbi__zreceive(a, a->num_bits & 7); // discard
+   // drain the bit-packed data into header
+   k = 0;
+   while (a->num_bits > 0) {
+      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+      a->num_bits -= 8;
+   }
+   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
+   // now fill header the normal way
+   while (k < 4)
+      header[k++] = stbi__zget8(a);
+   len  = header[1] * 256 + header[0];
+   nlen = header[3] * 256 + header[2];
+   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
+   // compare lengths instead of forming pointer+len, which could point past
+   // the end of the buffer (undefined behaviour)
+   if (len > a->zbuffer_end - a->zbuffer) return stbi__err("read past buffer","Corrupt PNG");
+   if (len > a->zout_end - a->zout)
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
+   memcpy(a->zout, a->zbuffer, len);
+   a->zbuffer += len;
+   a->zout += len;
+   return 1;
+}
+
+// Read and validate the two-byte zlib stream header.
+// Returns 1 if it is acceptable, otherwise the result of stbi__err().
+static int stbi__parse_zlib_header(stbi__zbuf *a)
+{
+   int cmf, flags, method;
+   cmf    = stbi__zget8(a);
+   flags  = stbi__zget8(a);
+   method = cmf & 15;
+   /* cinfo = cmf >> 4 gives the window size (1 << (8 + cinfo)); unused because output is fully buffered */
+   if (stbi__zeof(a))
+      return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if ((cmf*256+flags) % 31 != 0)
+      return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if (flags & 32)
+      return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if (method != 8)
+      return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   return 1;
+}
+
+// Code lengths of the fixed literal/length Huffman code (block type 1),
+// one entry per symbol.
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+// Code lengths of the fixed distance Huffman code: every symbol uses 5 bits.
+static const stbi_uc stbi__zdefault_distance[32] =
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+The two tables above are the precomputed result of this init algorithm:
+{
+   int i;   // use <= to match clearly with spec
+   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
+   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
+   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
+   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
+
+   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
+}
+*/
+
+// Inflate a complete stream from a's input buffer into its output buffer.
+//   parse_header : nonzero to expect and validate a zlib header first;
+//                  zero for a raw DEFLATE stream
+// Blocks are processed until one with the "final" bit set has been decoded.
+// Returns 1 on success, 0 on failure (a failure reason is recorded through
+// stbi__err by whichever step failed).
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
+{
+   int final, type;
+   if (parse_header)
+      if (!stbi__parse_zlib_header(a)) return 0;
+   a->num_bits = 0;
+   a->code_buffer = 0;
+   do {
+      final = stbi__zreceive(a,1);
+      type = stbi__zreceive(a,2);
+      if (type == 0) {
+         // stored block
+         if (!stbi__parse_uncompressed_block(a)) return 0;
+      } else if (type == 3) {
+         // reserved block type: report it like every other corruption
+         // instead of failing without a reason
+         return stbi__err("bad block type","Corrupt PNG");
+      } else {
+         if (type == 1) {
+            // use fixed code lengths
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+         } else {
+            // type 2: code lengths are transmitted in the block header
+            if (!stbi__compute_huffman_codes(a)) return 0;
+         }
+         if (!stbi__parse_huffman_block(a)) return 0;
+      }
+   } while (!final);
+   return 1;
+}
+
+// Point the inflate state at an output buffer of olen bytes and run the decoder.
+//   exp          : nonzero if the output buffer may be reallocated to grow
+//   parse_header : forwarded to stbi__parse_zlib
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   a->z_expandable = exp;
+   a->zout_start   = obuf;
+   a->zout_end     = obuf + olen;
+   a->zout         = a->zout_start;
+   return stbi__parse_zlib(a, parse_header);
+}
+
+// Inflate a zlib stream (with header) into a newly allocated buffer whose
+// initial capacity is initial_size bytes. On success returns the buffer and,
+// if outlen is non-NULL, stores the decoded size there. Returns NULL on failure.
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer + len;
+
+   if (!stbi__do_zlib(&state, out, initial_size, 1, 1)) {
+      // free through the state: the buffer may have been reallocated while decoding
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Inflate a zlib stream (with header) into a newly allocated buffer, starting
+// from a 16384-byte output buffer. See stbi_zlib_decode_malloc_guesssize.
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
+{
+   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+}
+
+// Inflate into a newly allocated buffer whose initial capacity is initial_size
+// bytes; parse_header selects whether a zlib header is expected (nonzero) or
+// the input is a raw DEFLATE stream (zero). On success returns the buffer and,
+// if outlen is non-NULL, stores the decoded size there. Returns NULL on failure.
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer + len;
+
+   if (!stbi__do_zlib(&state, out, initial_size, 1, parse_header)) {
+      // free through the state: the buffer may have been reallocated while decoding
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Inflate a zlib stream (with header) into the caller's fixed-size buffer.
+// Returns the number of bytes written, or -1 on failure.
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   stbi__zbuf state;
+   state.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   state.zbuffer     = (stbi_uc *) ibuffer;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 1))
+      return -1;
+   return (int) (state.zout - state.zout_start);
+}
+
+// Inflate a raw DEFLATE stream (no zlib header) into a newly allocated buffer,
+// starting from a 16384-byte output buffer. On success returns the buffer and,
+// if outlen is non-NULL, stores the decoded size there. Returns NULL on failure.
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(16384);
+   if (out == NULL) return NULL;
+
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer+len;
+
+   if (!stbi__do_zlib(&state, out, 16384, 1, 0)) {
+      // free through the state: the buffer may have been reallocated while decoding
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Inflate a raw DEFLATE stream (no zlib header) into the caller's fixed-size
+// buffer. Returns the number of bytes written, or -1 on failure.
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   stbi__zbuf state;
+   state.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   state.zbuffer     = (stbi_uc *) ibuffer;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 0))
+      return -1;
+   return (int) (state.zout - state.zout_start);
+}
+#endif
+
+// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+//    simple implementation
+//      - only 8-bit samples
+//      - no CRC checking
+//      - allocates lots of intermediate memory
+//        - avoids problem of streaming data between subsystems
+//        - avoids explicit window management
+//    performance
+//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+#ifndef STBI_NO_PNG
+// Header of one PNG chunk as read by stbi__get_chunk_header.
+typedef struct
+{
+   stbi__uint32 length; // first 32-bit big-endian field of the chunk header
+   stbi__uint32 type;   // second field: chunk type, compared against STBI__PNG_TYPE(...) values
+} stbi__pngchunk;
+
+// Read the next chunk header from the stream: length first, then type,
+// both as 32-bit big-endian values.
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
+{
+   stbi__uint32 length = stbi__get32be(s);
+   stbi__uint32 type   = stbi__get32be(s);
+   stbi__pngchunk hdr;
+   hdr.length = length;
+   hdr.type   = type;
+   return hdr;
+}
+
+// Consume up to 8 bytes from the stream and compare them with the PNG signature.
+// Returns 1 on a match; stops at the first mismatching byte and returns
+// the result of stbi__err().
+static int stbi__check_png_header(stbi__context *s)
+{
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   const stbi_uc *expected = png_sig;
+   const stbi_uc *sig_end  = png_sig + 8;
+   while (expected != sig_end) {
+      if (stbi__get8(s) != *expected) return stbi__err("bad png sig","Not a PNG");
+      ++expected;
+   }
+   return 1;
+}
+
+// Working state for decoding one PNG image.
+typedef struct
+{
+   stbi__context *s;                // input stream and image dimensions/channel counts
+   stbi_uc *idata, *expanded, *out; // buffers owned by the decoder; 'out' holds the decoded pixels
+   int depth;                       // NOTE(review): presumably bits per sample - confirm against stbi__parse_png_file
+} stbi__png;
+
+
+// Scanline filter types. Values 0..4 are the filter bytes that can appear in
+// the file; the remaining two are internal substitutes.
+enum {
+   STBI__F_none=0,
+   STBI__F_sub=1,
+   STBI__F_up=2,
+   STBI__F_avg=3,
+   STBI__F_paeth=4,
+   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+// Maps a file filter type (0..4) to the filter to apply on the first scanline,
+// where there is no previous row: "up" degenerates to "none", while "avg" and
+// "paeth" use their synthetic first-row variants.
+static stbi_uc first_row_filter[5] =
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none,
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+// Paeth predictor: of a, b and c, return the one closest to a + b - c.
+// Ties are resolved in favour of a, then b, then c.
+static int stbi__paeth(int a, int b, int c)
+{
+   int estimate = a + b - c;
+   int dist_a = abs(estimate - a);
+   int dist_b = abs(estimate - b);
+   int dist_c = abs(estimate - c);
+   if (dist_a > dist_b || dist_a > dist_c)
+      return (dist_b <= dist_c) ? b : c;
+   return a;
+}
+
+// Multiplier, indexed by bit depth, that stretches a grayscale sample of that
+// depth to the 0..255 range (e.g. 1-bit: x255, 2-bit: x85, 4-bit: x17).
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+// create the png data from post-deflated data
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{
+   int bytes = (depth == 16? 2 : 1);
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
+   stbi__uint32 img_len, img_width_bytes;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
+
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
+
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+   img_len = (img_width_bytes + 1) * y;
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
+
+   for (j=0; j < y; ++j) {
+      stbi_uc *cur = a->out + stride*j;
+      stbi_uc *prior;
+      int filter = *raw++;
+
+      if (filter > 4)
+         return stbi__err("invalid filter","Corrupt PNG");
+
+      if (depth < 8) {
+         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
+         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+         filter_bytes = 1;
+         width = img_width_bytes;
+      }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // handle first byte explicitly
+      for (k=0; k < filter_bytes; ++k) {
+         switch (filter) {
+            case STBI__F_none       : cur[k] = raw[k]; break;
+            case STBI__F_sub        : cur[k] = raw[k]; break;
+            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
+            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
+            case STBI__F_avg_first  : cur[k] = raw[k]; break;
+            case STBI__F_paeth_first: cur[k] = raw[k]; break;
+         }
+      }
+
+      if (depth == 8) {
+         if (img_n != out_n)
+            cur[img_n] = 255; // first pixel
+         raw += img_n;
+         cur += out_n;
+         prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
+      } else {
+         raw += 1;
+         cur += 1;
+         prior += 1;
+      }
+
+      // this is a little gross, so that we don't switch per-pixel or per-component
+      if (depth < 8 || img_n == out_n) {
+         int nk = (width - 1)*filter_bytes;
+         #define STBI__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+         switch (filter) {
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+         raw += nk;
+      } else {
+         STBI_ASSERT(img_n+1 == out_n);
+         #define STBI__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+         switch (filter) {
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
+         }
+      }
+   }
+
+   // we make a separate pass to expand bits to pixels; for performance,
+   // this could run two scanlines behind the above code, so it won't
+   // intefere with filtering but will still be in the cache.
+   if (depth < 8) {
+      for (j=0; j < y; ++j) {
+         stbi_uc *cur = a->out + stride*j;
+         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+         // note that the final byte might overshoot and write more data than desired.
+         // we can allocate enough data that this never writes out of memory, but it
+         // could also overwrite the next scanline. can it overwrite non-empty data
+         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+         // so we need to explicitly clamp the final ones
+
+         if (depth == 4) {
+            for (k=x*img_n; k >= 2; k-=2, ++in) {
+               *cur++ = scale * ((*in >> 4)       );
+               *cur++ = scale * ((*in     ) & 0x0f);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 4)       );
+         } else if (depth == 2) {
+            for (k=x*img_n; k >= 4; k-=4, ++in) {
+               *cur++ = scale * ((*in >> 6)       );
+               *cur++ = scale * ((*in >> 4) & 0x03);
+               *cur++ = scale * ((*in >> 2) & 0x03);
+               *cur++ = scale * ((*in     ) & 0x03);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 6)       );
+            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+         } else if (depth == 1) {
+            for (k=x*img_n; k >= 8; k-=8, ++in) {
+               *cur++ = scale * ((*in >> 7)       );
+               *cur++ = scale * ((*in >> 6) & 0x01);
+               *cur++ = scale * ((*in >> 5) & 0x01);
+               *cur++ = scale * ((*in >> 4) & 0x01);
+               *cur++ = scale * ((*in >> 3) & 0x01);
+               *cur++ = scale * ((*in >> 2) & 0x01);
+               *cur++ = scale * ((*in >> 1) & 0x01);
+               *cur++ = scale * ((*in     ) & 0x01);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 7)       );
+            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+         }
+         if (img_n != out_n) {
+            int q;
+            // insert alpha = 255
+            cur = a->out + stride*j;
+            if (img_n == 1) {
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
+               }
+            }
+         }
+      }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
+   }
+
+   return 1;
+}
+
+// Build the final image in a->out from inflated scanline data.
+// Non-interlaced images are decoded directly by stbi__create_png_image_raw.
+// Interlaced images are decoded as seven sub-images (passes), whose pixels
+// are then scattered to their positions in the full-size image.
+// Returns 1 on success, 0 or the result of stbi__err() on failure.
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes; // bytes per output pixel
+   stbi_uc *final;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+   // de-interlacing
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
+   for (p=0; p < 7; ++p) {
+      // per-pass origin and spacing of the pixels within the full image
+      int xorig[] = { 0,4,0,2,0,1,0 };
+      int yorig[] = { 0,0,4,0,2,0,1 };
+      int xspc[]  = { 8,8,4,4,2,2,1 };
+      int yspc[]  = { 8,8,8,4,4,2,2 };
+      int i,j,x,y;
+      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+      // dimensions of this pass's sub-image (rounded up)
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
+      if (x && y) {
+         // bytes of input consumed by this pass: y scanlines, each with one filter byte
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
+            return 0;
+         }
+         // scatter the sub-image's pixels into the full image
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
+            }
+         }
+         STBI_FREE(a->out);
+         image_data += img_len;
+         image_data_len -= img_len;
+      }
+   }
+   a->out = final;
+
+   return 1;
+}
+
+// Apply color-key transparency to an 8-bit image in z->out.
+//   tc    : the transparent color (only tc[0] is used when out_n == 2)
+//   out_n : channels per pixel, 2 or 4; the last channel is alpha
+// For 2 channels the alpha is rewritten for every pixel; for 4 channels only
+// matching pixels get alpha 0, the rest are assumed to already hold 255.
+// Always returns 1.
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0]) ? 0 : 255;
+      return 1;
+   }
+
+   for (; remaining > 0; --remaining, px += 4) {
+      if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2])
+         continue;
+      px[3] = 0;
+   }
+   return 1;
+}
+
+// Apply color-key transparency to a 16-bit image in z->out.
+//   tc    : the transparent color (only tc[0] is used when out_n == 2)
+//   out_n : channels per pixel, 2 or 4; the last channel is alpha
+// For 2 channels the alpha is rewritten for every pixel; for 4 channels only
+// matching pixels get alpha 0, the rest are assumed to already hold 65535.
+// Always returns 1.
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0]) ? 0 : 65535;
+      return 1;
+   }
+
+   for (; remaining > 0; --remaining, px += 4) {
+      if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2])
+         continue;
+      px[3] = 0;
+   }
+   return 1;
+}
+
+// Replace the palette-index image in a->out with an expanded color image.
+//   palette   : table of 4-byte entries, indexed by pixel value
+//   len       : unused
+//   pal_img_n : output channels per pixel; 3 copies the first three bytes of
+//               each entry, any other value copies all four
+// The old a->out is freed and replaced by the new buffer.
+// Returns 1 on success, otherwise the result of stbi__err().
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *dst, *expanded, *indices = a->out;
+
+   expanded = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+   if (expanded == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // no early exits from here until the old buffer is freed, or 'expanded' would leak
+   dst = expanded;
+
+   if (pal_img_n == 3) {
+      for (i=0; i < pixel_count; ++i, dst += 3) {
+         const stbi_uc *entry = palette + indices[i]*4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i, dst += 4) {
+         const stbi_uc *entry = palette + indices[i]*4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+         dst[3] = entry[3];
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = expanded;
+
+   STBI_NOTUSED(len);
+
+   return 1;
+}
+
+// Process-wide settings consulted by the PNG decoder; both default to off.
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
+
+// Set the process-wide "unpremultiply on load" flag.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+}
+
+// Set the process-wide "convert iPhone PNG to RGB" flag.
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
+}
+
+#ifndef STBI_THREAD_LOCAL
+// without thread-local storage, the settings are simply the globals
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+// per-thread overrides: the *_set variable records whether the thread has set its own value
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+// Set the "unpremultiply on load" flag for the calling thread only.
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+// Set the "convert iPhone PNG to RGB" flag for the calling thread only.
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+// the thread's own value wins if it was ever set; otherwise fall back to the global
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
+// Convert the decoded image in z->out from BGR(A) channel order to RGB(A),
+// in place. For 4-channel images, when unpremultiply-on-load is enabled, the
+// color channels are also divided by alpha (with rounding) wherever alpha is
+// nonzero.
+static void stbi__de_iphone(stbi__png *z)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 n, total = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   if (s->img_out_n == 3) {
+      // 3 channels: swap first and third
+      for (n=0; n < total; ++n, px += 3) {
+         stbi_uc first = px[0];
+         px[0] = px[2];
+         px[2] = first;
+      }
+      return;
+   }
+
+   STBI_ASSERT(s->img_out_n == 4);
+   if (!stbi__unpremultiply_on_load) {
+      // 4 channels, swap only; alpha is left untouched
+      for (n=0; n < total; ++n, px += 4) {
+         stbi_uc first = px[0];
+         px[0] = px[2];
+         px[2] = first;
+      }
+      return;
+   }
+
+   // 4 channels: swap and unpremultiply
+   for (n=0; n < total; ++n, px += 4) {
+      stbi_uc alpha = px[3];
+      stbi_uc first = px[0];
+      if (alpha) {
+         stbi_uc half = alpha / 2; // added before dividing so the result rounds to nearest
+         px[0] = (px[2] * 255 + half) / alpha;
+         px[1] = (px[1] * 255 + half) / alpha;
+         px[2] = (first * 255 + half) / alpha;
+      } else {
+         // alpha of zero: nothing to divide by, just swap
+         px[0] = px[2];
+         px[2] = first;
+      }
+   }
+}
+
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) // packs four chunk-name bytes into one 32-bit tag, first byte most significant
+
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{  // Walks the PNG chunk stream. scan == STBI__SCAN_type stops after the signature, STBI__SCAN_header stops once the component count is known, STBI__SCAN_load decodes into z->out. Returns 1 on success, 0 on failure.
+   stbi_uc palette[1024], pal_img_n=0;                  // palette: 4 bytes per entry; pal_img_n: 0 if not paletted, else 3 (PLTE only) or 4 (PLTE + tRNS)
+   stbi_uc has_trans=0, tc[3]={0};                      // has_trans: a tRNS colour key was read for a non-paletted image; tc: that key for depths below 16
+   stbi__uint16 tc16[3];                                // the same key when depth is 16
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;    // ioff: IDAT bytes gathered so far; idata_limit: capacity of z->idata; pal_len: palette entries read
+   int first=1,k,interlace=0, color=0, is_iphone=0;     // first: no IHDR seen yet; is_iphone: a CgBI chunk was seen
+   stbi__context *s = z->s;
+
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
+
+   if (!stbi__check_png_header(s)) return 0;
+
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         case STBI__PNG_TYPE('C','g','B','I'): // payload is skipped; only the chunk's presence is remembered
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
+            first = 0;
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
+            s->img_x = stbi__get32be(s);
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
+            if (!pal_img_n) {
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); // colour bit 2 selects 3 channels over 1, bit 4 adds one more
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); // keeps x*y*img_n within 2^30, using division so the check itself cannot overflow
+            } else {
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); // same limit, sized for 4 bytes per expanded pixel
+            }
+            // even with SCAN_header, have to scan to see if we have a tRNS
+            break;
+         }
+
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255; // fourth byte starts at 255; a later tRNS chunk may overwrite it
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
+            if (pal_img_n) {
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } // header scan: paletted + tRNS is reported as 4 components
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
+            } else {
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); // img_n of 2 or 4 means the file already carries alpha
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); // one 16-bit value per component
+               has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
+            if (scan == STBI__SCAN_header) {
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
+            }
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
+            if ((int)(ioff + c.length) < (int)ioff) return 0; // running total would no longer fit in a positive int
+            if (ioff + c.length > idata_limit) { // grow z->idata by doubling until this chunk fits
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
+            }
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
+            ioff += c.length;
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); // the header flag is off for CgBI files; raw_len is overwritten with the decoded size
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) // one extra output channel: caller asked for exactly img_n+1 (2 or 4), or a colour key must be applied
+               s->img_out_n = s->img_n+1;
+            else
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+               stbi__de_iphone(z);
+            if (pal_img_n) {
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
+            }
+            STBI_FREE(z->expanded); z->expanded = NULL;
+            // end of PNG chunk, read and skip CRC
+            stbi__get32be(s);
+            return 1;
+         }
+
+         default:
+            // if critical, fail
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if ((c.type & (1 << 29)) == 0) { // bit 29 of the tag is bit 5 of the first name byte; clear means the chunk cannot be skipped
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
+}
+
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
+{  // Fully decodes a PNG and returns the pixel buffer (NULL on failure). *x/*y get the size, *n (optional) the file's own component count, ri->bits_per_channel 8 or 16.
+   void *result=NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth <= 8)
+         ri->bits_per_channel = 8;
+      else if (p->depth == 16)
+         ri->bits_per_channel = 16;
+      else
+         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
+      result = p->out;
+      p->out = NULL; // ownership moves to result, so the cleanup at the bottom does not free it
+      if (req_comp && req_comp != p->s->img_out_n) { // decoded channel count differs from the request: convert
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         p->s->img_out_n = req_comp;
+         if (result == NULL) return result;
+      }
+      *x = p->s->img_x;
+      *y = p->s->img_y;
+      if (n) *n = p->s->img_n;
+   }
+   STBI_FREE(p->out);      p->out      = NULL; // release whatever the parser still holds (all three on a failed parse)
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+
+   return result;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png decoder; // scratch decoder state; only the stream pointer is filled in here
+   decoder.s = s;
+   return stbi__do_png(&decoder, x, y, comp, req_comp, ri);
+}
+
+static int stbi__png_test(stbi__context *s)
+{
+   // signature probe: the verdict is returned only after the read position is restored
+   int looks_like_png = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return looks_like_png;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (stbi__parse_png_file(p, STBI__SCAN_header, 0)) { // header-only scan succeeded
+      if (x)    *x    = p->s->img_x; // every out-pointer is optional
+      if (y)    *y    = p->s->img_y;
+      if (comp) *comp = p->s->img_n;
+      return 1;
+   }
+   stbi__rewind(p->s); // failure: restore the read position before reporting it
+   return 0;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png probe; // scratch parser state wrapped around the caller's stream
+   probe.s = s;
+   return stbi__png_info_raw(&probe, x, y, comp);
+}
+
+static int stbi__png_is16(stbi__context *s)
+{
+   // 1 only for a parseable PNG whose IHDR bit depth is 16
+   stbi__png probe;
+   probe.s = s;
+   if (stbi__png_info_raw(&probe, NULL, NULL, NULL)) {
+      if (probe.depth == 16)
+         return 1;
+      stbi__rewind(probe.s); // valid PNG but not 16-bit: restore the read position
+   }
+   return 0;
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   // Consumes the 14-byte file header plus the 4-byte info-header size and
+   // reports whether that size is one this decoder knows. Does not rewind.
+   int header_size;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return 0; // magic "BM"; the second byte is read only if the first matched
+   stbi__get32le(s); // file size, ignored
+   stbi__get16le(s); // reserved, ignored
+   stbi__get16le(s); // reserved, ignored
+   stbi__get32le(s); // pixel data offset, ignored
+   header_size = stbi__get32le(s);
+   // accepted info-header sizes
+   return (header_size == 12 || header_size == 40 || header_size == 56 || header_size == 108 || header_size == 124);
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int looks_like_bmp = stbi__bmp_test_raw(s); // probe consumes header bytes...
+   stbi__rewind(s);                            // ...so put the stream back before answering
+   return looks_like_bmp;
+}
+
+
+// returns the index (0..31) of the highest set bit of z, or -1 when z is 0
+static int stbi__high_bit(unsigned int z)
+{
+   int n=0;
+   if (z == 0) return -1;
+   if (z >= 0x10000) { n += 16; z >>= 16; } // binary search: each step halves the width still to be inspected
+   if (z >= 0x00100) { n +=  8; z >>=  8; }
+   if (z >= 0x00010) { n +=  4; z >>=  4; }
+   if (z >= 0x00004) { n +=  2; z >>=  2; }
+   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ } // final shift omitted: z is not read again
+   return n;
+}
+
+static int stbi__bitcount(unsigned int a) // number of set bits in a, computed by summing neighbouring bit fields in parallel
+{
+   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
+   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
+   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
+   a = (a + (a >> 8)); // max 16 per 8 bits
+   a = (a + (a >> 16)); // max 32 per 8 bits
+   return a & 0xff; // only the low byte holds the total; higher bytes contain partial sums
+}
+
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to the full 0..255 range.
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
+{
+   static unsigned int mul_table[9] = { // indexed by bits: multiplying repeats the bits-wide value side by side
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static unsigned int shift_table[9] = { // indexed by bits: right shift that trims the product back to 8 bits
+      0, 0,0,1,0,2,4,6,0,
+   };
+   if (shift < 0) // shift is (high bit of the mask) - 7, so a negative value means the field sits below bit 7
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256);
+   v >>= (8-bits); // drop the unused low bits so v holds just the bits-wide value
+   STBI_ASSERT(bits >= 0 && bits <= 8); // NOTE(review): this range check runs after bits was already used in the shift above
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
+}
+
+typedef struct // header fields handed from stbi__bmp_parse_header to stbi__bmp_load
+{
+   int bpp, offset, hsz;              // bits per pixel; file offset of the pixel data; size of the info header in bytes
+   unsigned int mr,mg,mb,ma, all_a;   // red/green/blue/alpha bit masks; all_a seeds the OR of every alpha value seen while loading
+   int extra_read;                    // bytes consumed besides the hsz-byte info header: 14 for the file header, +12 when three trailing masks are read
+} stbi__bmp_data;
+
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   // returns 1 when info's masks are usable afterwards, 0 for a compression value that has no defaults
+   if (compress == 3)
+      return 1; // explicit-mask mode (BI_BITFIELDS): keep whatever the header supplied
+   if (compress != 0) return 0;
+   switch (info->bpp) {
+      case 16: // 5 bits per colour channel; ma is left untouched
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+         break;
+      case 32: // 8 bits per channel, alpha in the top byte
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // still 0 after loading means every alpha byte in the file was 0
+         break;
+      default: // any other depth: clear all four masks
+         info->mr = info->mg = info->mb = info->ma = 0;
+         break;
+   }
+   return 1;
+}
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{  // Reads the BMP file header and info header, filling *info and s->img_x / s->img_y. Returns (void *) 1 on success; failures return the result of stbi__errpuc, which the caller treats as NULL.
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+   info->extra_read = 14; // the 14 bytes of file header that precede the info header
+
+   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
+
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   if (hsz == 12) { // the 12-byte header stores 16-bit dimensions
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else {
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s); // 0 = uncompressed, 3 = explicit bit masks; everything else is rejected below
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         if (hsz == 56) { // the 56-byte variant carries 16 more bytes, which are discarded
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {
+            if (compress == 0) {
+               stbi__bmp_set_mask_defaults(info, compress);
+            } else if (compress == 3) {
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               info->extra_read += 12; // the three masks just read are not counted in hsz
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");
+         }
+      } else {
+         // V4/V5 header
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   return (void *) 1;
+}
+
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{  // Decodes a BMP into a newly allocated 8-bit buffer with top row first. *x/*y get the size, *comp (optional) 3 or 4 as found in the file. Returns NULL on failure.
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4] = {{0}}; // zero-filled so that pixel indices beyond the stored palette decode as black instead of uninitialised stack bytes
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0; // positive height: rows are stored bottom-up and get flipped at the end
+   s->img_y = flip_vertically ? s->img_y : 0u - s->img_y; // magnitude via unsigned negation: well defined even for 0x80000000, which the size check below rejects
+
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   if (info.hsz == 12) { // palette entry count is derived from the gap between the headers and the pixel data
+      if (info.bpp < 24)
+         psize = (info.offset - info.extra_read - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - info.extra_read - info.hsz) >> 2;
+   }
+   if (psize == 0) {
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+         stbi__skip(s, info.offset - bytes_read_so_far);
+      }
+   }
+
+   if (info.bpp == 24 && ma == 0xff000000)
+      s->img_n = 3;
+   else
+      s->img_n = ma ? 4 : 3;
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) { // paletted: 1, 4 or 8 bits per pixel
+      int z=0;
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      for (i=0; i < psize; ++i) { // entries are stored blue, green, red (plus one ignored byte unless hsz is 12)
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
+         pal[i][3] = 255;
+      }
+      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      pad = (-width)&3; // bytes needed to round each row up to a multiple of 4
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s); // pixels are taken from the most significant bit downwards
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
+            }
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) { // two pixels per iteration: one byte at 4 bpp, two bytes at 8 bpp
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
+         }
+      }
+   } else { // direct colour: 16, 24 or 32 bits per pixel
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      int easy=0; // 1: plain 24-bit BGR, 2: 32-bit BGRA with the standard masks, 0: decode through the masks
+      stbi__skip(s, info.offset - info.extra_read - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               out[z+2] = stbi__get8(s);
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               unsigned int a;
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
+   if (flip_vertically) { // swap row j with its mirror row, byte by byte
+      stbi_uc t;
+      for (j=0; j < (int) s->img_y>>1; ++j) {
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+   return out;
+}
+#endif
+
+// Targa Truevision - TGA
+// by Jonathan Dummer
+#ifndef STBI_NO_TGA
+// returns the STBI_* component constant for a TGA pixel size, or 0 when the size is not supported
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed
+   if (is_rgb16) *is_rgb16 = 0; // optional out-flag: set to 1 below for 15-bit and non-grey 16-bit pixels
+   switch(bits_per_pixel) {
+      case 8:  return STBI_grey;
+      case 16: if(is_grey) return STBI_grey_alpha;
+               // fallthrough
+      case 15: if(is_rgb16) *is_rgb16 = 1;
+               return STBI_rgb;
+      case 24: // fallthrough
+      case 32: return bits_per_pixel/8;
+      default: return 0;
+   }
+}
+
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{   // Validates a TGA header and reports width, height and component count through the optional out-pointers. Returns 1 on success; every failure path rewinds the stream and returns 0.
+    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
+    int sz, tga_colormap_type;
+    stbi__get8(s);                   // discard Offset
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if( tga_colormap_type > 1 ) {
+        stbi__rewind(s);
+        return 0;      // only RGB or indexed allowed
+    }
+    tga_image_type = stbi__get8(s); // image type
+    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s);    //   check bits per palette color entry
+        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip image x and y origin
+        tga_colormap_bpp = sz;
+    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
+            stbi__rewind(s);
+            return 0; // only RGB or grey allowed, +/- RLE
+        }
+        stbi__skip(s,9); // skip colormap specification and image x/y origin
+        tga_colormap_bpp = 0; // 0 doubles as the "no colormap" marker tested further down
+    }
+    tga_w = stbi__get16le(s);
+    if( tga_w < 1 ) {
+        stbi__rewind(s);
+        return 0;   // test width
+    }
+    tga_h = stbi__get16le(s);
+    if( tga_h < 1 ) {
+        stbi__rewind(s);
+        return 0;   // test height
+    }
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s); // ignore alpha bits
+    if (tga_colormap_bpp != 0) {
+        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+            // when using a colormap, tga_bits_per_pixel is the size of the indexes
+            // I don't think anything but 8 or 16bit indexes makes sense
+            stbi__rewind(s);
+            return 0;
+        }
+        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); // components come from the palette entry size, not the index size
+    } else {
+        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); // image types 3 and 11 are the grey ones
+    }
+    if(!tga_comp) {
+      stbi__rewind(s);
+      return 0;
+    }
+    if (x) *x = tga_w;
+    if (y) *y = tga_h;
+    if (comp) *comp = tga_comp;
+    return 1;                   // seems to have passed everything
+}
+
+static int stbi__tga_test(stbi__context *s)
+{  // Returns 1 if the header fields look like a supported TGA, 0 otherwise. The stream is rewound on every path.
+   int res = 0;
+   int sz, tga_color_type; // sz is reused for each single-byte field as it is read
+   stbi__get8(s);      //   discard Offset
+   tga_color_type = stbi__get8(s);   //   color type
+   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
+   sz = stbi__get8(s);   //   image type
+   if ( tga_color_type == 1 ) { // colormapped (paletted) image
+      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+      sz = stbi__get8(s);    //   check bits per palette color entry
+      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+      stbi__skip(s,4);       // skip image x and y origin
+   } else { // "normal" image w/o colormap
+      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9); // skip colormap specification and image x/y origin
+   }
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
+   sz = stbi__get8(s);   //   bits per pixel
+   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
+   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+
+   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+
+errorEnd: // shared exit, also reached on success: res tells the two apart
+   stbi__rewind(s);
+   return res;
+}
+
+// Read one little-endian 16-bit pixel and expand it to three 8-bit channels.
+// The three 5-bit fields are written to out[0..2] starting from the highest
+// field (bits 10..14), i.e. already in RGB order, so no swap is needed later.
+// The top bit is ignored: all 15 and 16bit TGAs are treated as RGB with no alpha.
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   unsigned int packed = (stbi__uint16) stbi__get16le(s);
+   int shift = 10;
+   int ch;
+
+   for (ch = 0; ch < 3; ++ch, shift -= 5) {
+      int five_bit = (int) ((packed >> shift) & 31);
+      // rescale 0..31 to 0..255
+      out[ch] = (stbi_uc) ((five_bit * 255) / 31);
+   }
+}
+
+// Decode a TGA image from `s` into a newly allocated buffer of 8-bit components.
+//   x, y     - receive the image width and height
+//   comp     - if non-NULL, receives the component count found in the file
+//   req_comp - if non-zero and different from the file's count, the pixels are
+//              passed through stbi__convert_format before being returned
+//   ri       - unused by this loader
+// Returns the pixel buffer; error paths return the value of stbi__errpuc(...).
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   //   read in the TGA header stuff
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
+
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   //   do a tiny bit of preprocessing
+   //   image types 8 and above are the RLE variants of (type - 8)
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   //   bit 5 of the descriptor byte clear => the decoded rows are flipped vertically below
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   // validate width*height*comp before it is used for allocation and indexing
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   // fast path: raw, non-paletted, non-16bit data is read one whole row at a time,
+   // directly into its final (possibly flipped) row position
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed)
+      {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            // 16-bit palette entries are expanded to 3 bytes each as they are read
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         //   if I'm in RLE mode, do I need to get a RLE chunk?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               //   yep, get the next byte as a RLE command
+               //   low 7 bits hold (pixel count - 1); the high bit marks a repeated pixel
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            //   load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
+            }
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
+
+         // copy data (for a repeating RLE run this re-emits the last pixel read)
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
+
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted )
+      {
+         // swap row j with row (height - 1 - j), byte by byte
+         for (j = 0; j*2 < tga_height; ++j)
+         {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i)
+            {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
+            }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL )
+      {
+         STBI_FREE( tga_palette );
+      }
+   }
+
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
+   {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
+
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   STBI_NOTUSED(tga_palette_start);
+   //   OK, done
+   return tga_data;
+}
+#endif
+
+// *************************************************************************************************
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+
+#ifndef STBI_NO_PSD
+// Report whether the stream begins with the PSD signature "8BPS".
+// The stream is rewound before returning.
+static int stbi__psd_test(stbi__context *s)
+{
+   stbi__uint32 signature = stbi__get32be(s);
+   stbi__rewind(s);
+   return (signature == 0x38425053) ? 1 : 0;
+}
+
+// Decode one RLE-compressed channel of `pixelCount` values from `s`.
+// Decoded bytes are stored at p[0], p[4], p[8], ... (one channel of a
+// 4-component interleaved buffer).
+// Returns 1 on success, 0 if a run would write past `pixelCount` values.
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   int written = 0;
+
+   while (written < pixelCount) {
+      int remaining = pixelCount - written;
+      int code = stbi__get8(s);
+      int run, k;
+
+      // 128 is a no-op
+      if (code == 128)
+         continue;
+
+      if (code < 128) {
+         // literal run: copy the next code+1 bytes
+         run = code + 1;
+         if (run > remaining) return 0; // corrupt data
+         for (k = 0; k < run; ++k, p += 4)
+            *p = stbi__get8(s);
+      } else {
+         // replicate run: code is a negative 8-bit count, the next byte is
+         // repeated 257-code times
+         stbi_uc fill;
+         run = 257 - code;
+         if (run > remaining) return 0; // corrupt data
+         fill = stbi__get8(s);
+         for (k = 0; k < run; ++k, p += 4)
+            *p = fill;
+      }
+
+      written += run;
+   }
+
+   return 1;
+}
+
+// Decode the composite image of a PSD file from `s` into a 4-component buffer.
+//   x, y     - receive the image width and height on success
+//   comp     - if non-NULL, receives 4 (the loader always reports 4 components)
+//   req_comp - if non-zero and not 4, the pixels are converted to this count
+//   ri       - bits_per_channel is set to 16 when a 16-bit buffer is produced
+//   bpc      - bits per channel requested by the caller; 16-bit output is only
+//              produced for uncompressed 16-bit files when bpc == 16
+// Returns the pixel buffer; error paths return the value of stbi__errpuc(...).
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;
+   int channelCount, compression;
+   int channel, i;
+   int bitdepth;
+   int w,h;
+   stbi_uc *out;
+   STBI_NOTUSED(ri);
+
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
+
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
+
+   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   // Make sure the depth is 8 or 16 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+   // Each of the next three sections starts with a 32-bit length, which is
+   // read and then used as the number of bytes to skip.
+
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
+
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
+   // Create the destination image.
+   // 8 bytes per pixel (4 x 16-bit) only for uncompressed 16-bit input with a
+   // 16-bit request; every other combination gets 4 bytes per pixel.
+
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   pixelCount = w*h;
+
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
+
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
+
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
+
+      // Read the RLE data by channel.
+      // Only the first 4 channels are decoded; the output is always 4 components.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            // (opaque alpha for channel 3, zero for missing color channels)
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);
+         } else {
+            // Read the RLE data.
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
+         } else {
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  // 16-bit input, 8-bit output: keep only the high byte
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD
+   // (only pixels whose alpha is neither 0 nor the maximum are adjusted)
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
+         }
+      }
+   }
+
+   // convert to desired output format
+   if (req_comp && req_comp != 4) {
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   if (comp) *comp = 4;
+   *y = h;
+   *x = w;
+
+   return out;
+}
+#endif
+
+// *************************************************************************************************
+// Softimage PIC loader
+// by Tom Seddon
+//
+// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
+// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
+
+#ifndef STBI_NO_PIC
+// Compare the next bytes of the stream with the first 4 bytes of `str`.
+// Returns 1 if all four match, 0 at the first mismatch (no further bytes
+// are consumed after a mismatch).
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   const char *end = str + 4;
+
+   while (str != end) {
+      stbi_uc got = stbi__get8(s);
+      if (got != (stbi_uc) *str++)
+         return 0;
+   }
+
+   return 1;
+}
+
+// Check the PIC header: a 4-byte magic number, 84 bytes that are skipped,
+// then the 4-byte tag "PICT". Returns 1 on a match, 0 otherwise.
+// Does not rewind the stream.
+static int stbi__pic_test_core(stbi__context *s)
+{
+   int to_skip = 84;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
+      return 0;
+
+   while (to_skip-- > 0)
+      stbi__get8(s);
+
+   return stbi__pic_is4(s,"PICT") ? 1 : 0;
+}
+
+// One channel packet descriptor, filled in by stbi__pic_load_core.
+typedef struct
+{
+   // size:    bits per channel value (the loader rejects anything but 8)
+   // type:    compression type (0 = uncompressed, 1 = pure RLE, 2 = mixed RLE)
+   // channel: bitmask of the channels in this packet; bit (0x80 >> i) selects
+   //          byte i of a pixel, and 0x10 is treated as the alpha channel
+   stbi_uc size,type,channel;
+} stbi__pic_packet;
+
+// Read one byte from `s` for every channel selected in `channel` and store
+// it in the matching slot of dest[0..3] (bit 0x80 -> dest[0], 0x40 -> dest[1],
+// 0x20 -> dest[2], 0x10 -> dest[3]). Unselected slots are left untouched.
+// Returns dest, or the value of stbi__errpuc(...) if the stream ends first.
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   int slot;
+
+   for (slot = 0; slot < 4; ++slot) {
+      if (!(channel & (0x80 >> slot)))
+         continue;
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+      dest[slot] = stbi__get8(s);
+   }
+
+   return dest;
+}
+
+// Copy from src[0..3] to dest[0..3] only the slots selected in `channel`
+// (bit 0x80 -> slot 0, 0x40 -> slot 1, 0x20 -> slot 2, 0x10 -> slot 3).
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   int bit = 0x80;
+   int slot = 0;
+
+   while (slot < 4) {
+      if (channel & bit)
+         dest[slot] = src[slot];
+      bit >>= 1;
+      ++slot;
+   }
+}
+
+// Read the PIC packet list and then decode `height` scanlines of `width`
+// pixels into `result`, which is addressed as 4 bytes per pixel.
+//   comp   - receives 4 if any packet carries channel bit 0x10, otherwise 3
+//   result - caller-allocated buffer of at least width*height*4 bytes
+// Returns `result` on success; error paths return 0 or the value of
+// stbi__errpuc(...). `result` is never freed here.
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{
+   int act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+    // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      // refuse to read more packet descriptors than the local array can hold
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      // a non-zero `chained` byte means another packet descriptor follows
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      // accumulate the union of all channel masks seen
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   // every scanline stores the data of each packet in turn
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  // each run is a count byte followed by one pixel value
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     // runs longer than the rest of the scanline are clamped, not rejected
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     // 128 means the real count follows as a 16-bit big-endian value;
+                     // otherwise the run length is count-127
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     // count+1 literal pixels follow
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+// Load a Softimage PIC image from `s` as 8-bit pixels.
+//   px, py   - receive the image width and height on success
+//   comp     - if non-NULL, receives 3 or 4 (4 when an alpha channel is present)
+//   req_comp - desired component count; 0 means "use the count found in the file"
+//   ri       - unused by this loader
+// Returns the buffer produced by stbi__convert_format, or a null result on
+// failure (0, or the value of stbi__errpuc(...)).
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   // stbi__pic_load_core always writes *comp, so give it somewhere to write
+   if (!comp) comp = &internal_comp;
+
+   // skip the 92 header bytes that stbi__pic_test_core inspects
+   // (4-byte magic, 84 skipped bytes, 4-byte "PICT" tag)
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   // pre-fill with 0xff so channels that no packet writes (e.g. alpha) read as 255
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      // Bail out right away: falling through would hand a null buffer (and a
+      // possibly unset *comp) to stbi__convert_format below.
+      STBI_FREE(result);
+      return 0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+// Non-destructive PIC detection: run the header check, then rewind the
+// stream. Returns the result of stbi__pic_test_core.
+static int stbi__pic_test(stbi__context *s)
+{
+   int is_pic;
+
+   is_pic = stbi__pic_test_core(s);
+   stbi__rewind(s);
+
+   return is_pic;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
+
+#ifndef STBI_NO_GIF
+// One entry of the GIF LZW code table.
+typedef struct
+{
+   stbi__int16 prefix;  // code that precedes this one in the chain; -1 for a root entry
+   stbi_uc first;       // first byte of the string this code expands to
+   stbi_uc suffix;      // last byte of the string; used as the palette index when emitted
+} stbi__gif_lzw;
+
+// Decoder state for one GIF stream: header fields, palettes, the LZW code
+// table, frame buffers and the current write cursor.
+typedef struct
+{
+   int w,h;                      // image width and height, read by stbi__gif_header
+   stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history;             // one byte per pixel; set to 1 by stbi__out_gif_code for each pixel it visits
+   // flags, bgindex and ratio come straight from the header; transparent starts at -1;
+   // bits 2..4 of eflags select how the previous frame is disposed of
+   int flags, bgindex, ratio, transparent, eflags;
+   stbi_uc  pal[256][4];         // palette loaded by stbi__gif_header when (flags & 0x80)
+   stbi_uc lpal[256][4];         // presumably the per-image (local) palette - filled outside this view, TODO confirm
+   stbi__gif_lzw codes[8192];    // LZW code table
+   stbi_uc *color_table;         // palette used when emitting pixels (4 bytes per entry)
+   int parse, step;              // used by stbi__out_gif_code to advance rows: step is a byte stride, parse counts remaining passes (looks like interlacing - confirm)
+   int lflags;                   // presumably the image descriptor flags - set outside this view, TODO confirm
+   // the x/y members below are byte offsets into `out` (x is multiplied by 4, y by line_size)
+   int start_x, start_y;
+   int max_x, max_y;
+   int cur_x, cur_y;
+   int line_size;                // bytes per output row (w * 4)
+   int delay;                    // presumably the frame delay - set outside this view, TODO confirm
+} stbi__gif;
+
+// Check for a GIF signature: "GIF8", then '7' or '9', then 'a'.
+// Returns 1 on a match, 0 at the first mismatching byte. Does not rewind.
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   const char *prefix = "GIF8";
+   int k, version;
+
+   for (k = 0; k < 4; ++k)
+      if (stbi__get8(s) != prefix[k])
+         return 0;
+
+   version = stbi__get8(s);
+   if (!(version == '9' || version == '7'))
+      return 0;
+
+   return (stbi__get8(s) == 'a') ? 1 : 0;
+}
+
+// Non-destructive GIF detection: check the signature, then rewind the
+// stream. Returns the result of stbi__gif_test_raw.
+static int stbi__gif_test(stbi__context *s)
+{
+   int is_gif;
+
+   is_gif = stbi__gif_test_raw(s);
+   stbi__rewind(s);
+
+   return is_gif;
+}
+
+// Read `num_entries` 3-byte palette entries from `s` into `pal`.
+// The three bytes of each entry are stored in reverse order (slots 2, 1, 0);
+// slot 3 is 0 for the entry whose index equals `transp` and 255 for all others.
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   int entry;
+
+   for (entry = 0; entry < num_entries; ++entry) {
+      stbi_uc *slot = pal[entry];
+      int c;
+
+      for (c = 2; c >= 0; --c)
+         slot[c] = stbi__get8(s);
+
+      if (entry == transp)
+         slot[3] = 0;
+      else
+         slot[3] = 255;
+   }
+}
+
+// Parse the GIF signature and the header fields that follow it into `g`.
+//   comp    - if non-NULL, receives 4
+//   is_info - when non-zero, return right after the size checks without
+//             reading the palette
+// Returns 1 on success, or the value of stbi__err(...) on failure.
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{
+   // stbi__gif_test_raw reads the same six signature bytes, in the same
+   // order and with the same early exit, as an inline check would
+   if (!stbi__gif_test_raw(s))
+      return stbi__err("not GIF", "Corrupt GIF");
+
+   stbi__g_failure_reason = "";
+
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1;
+
+   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+
+   // can't actually tell whether it's 3 or 4 until we parse the comments
+   if (comp != 0)
+      *comp = 4;
+
+   // flag bit 0x80 set: a palette of 2 << (flags & 7) entries follows
+   if (!is_info && (g->flags & 0x80))
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+
+   return 1;
+}
+
+// Read only the GIF header and report the image size.
+//   x, y - if non-NULL, receive the width and height on success
+//   comp - forwarded to stbi__gif_header
+// Returns 1 on success; on a header failure the stream is rewound and 0 is
+// returned. Allocation failure returns the value of stbi__err(...).
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   int header_ok;
+   stbi__gif *hdr = (stbi__gif *) stbi__malloc(sizeof(*hdr));
+
+   if (hdr == NULL) return stbi__err("outofmem", "Out of memory");
+
+   header_ok = stbi__gif_header(s, hdr, comp, 1);
+   if (header_ok) {
+      if (x) *x = hdr->w;
+      if (y) *y = hdr->h;
+   }
+
+   // the temporary state is released on both paths
+   STBI_FREE(hdr);
+
+   if (!header_ok) {
+      stbi__rewind( s );
+      return 0;
+   }
+   return 1;
+}
+
+// Emit the pixels for LZW code `code` into g->out at the current cursor.
+// The prefix chain is expanded first (recursively), then this entry's own
+// suffix is written and the cursor is advanced.
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   const stbi__gif_lzw *entry = &g->codes[code];
+   const stbi_uc *color;
+   stbi_uc *pixel;
+   int offset;
+
+   // recurse to decode the prefixes, since the linked-list is backwards,
+   // and working backwards through an interleaved image would be nasty
+   if (entry->prefix >= 0)
+      stbi__out_gif_code(g, (stbi__uint16) entry->prefix);
+
+   // cursor is already past the last row of the image rectangle: drop the pixel
+   if (g->cur_y >= g->max_y) return;
+
+   // cur_x and cur_y are byte offsets, so their sum addresses the pixel directly
+   offset = g->cur_x + g->cur_y;
+   g->history[offset / 4] = 1;
+   pixel = g->out + offset;
+
+   color = g->color_table + entry->suffix * 4;
+   if (color[3] > 128) { // don't render transparent pixels;
+      pixel[0] = color[2];
+      pixel[1] = color[1];
+      pixel[2] = color[0];
+      pixel[3] = color[3];
+   }
+
+   g->cur_x += 4;
+   if (g->cur_x < g->max_x)
+      return;
+
+   // end of row: go back to the left edge and step down
+   g->cur_x = g->start_x;
+   g->cur_y += g->step;
+
+   // ran off the bottom with passes remaining: start the next pass
+   while (g->cur_y >= g->max_y && g->parse > 0) {
+      g->step = (1 << g->parse) * g->line_size;
+      g->cur_y = g->start_y + (g->step >> 1);
+      --g->parse;
+   }
+}
+
+// Decode the LZW-compressed raster data of one GIF image from `s`, emitting
+// pixels through stbi__out_gif_code.
+// Returns g->out when the data ends (zero-length block or end-of-stream code),
+// NULL if the initial code size is above 12, or the value of
+// stbi__errpuc(...) for corrupt data.
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{
+   stbi_uc lzw_cs;
+   stbi__int32 len, init_code;
+   stbi__uint32 first;
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
+
+   lzw_cs = stbi__get8(s);
+   if (lzw_cs > 12) return NULL;
+   clear = 1 << lzw_cs;
+   first = 1;
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   // root entries: every code below `clear` expands to itself
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
+
+   // support no starting clear code
+   // (clear and clear+1 are reserved, so the first free table slot is clear+2)
+   avail = clear+2;
+   oldcode = -1;
+
+   len = 0;
+   for(;;) {
+      if (valid_bits < codesize) {
+         // not enough bits buffered for a full code: pull in one more byte,
+         // starting a new data sub-block when the current one is used up
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
+         valid_bits += 8;
+      } else {
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            // skip the rest of this sub-block and any that follow, up to the zero-length terminator
+            stbi__skip(s, len);
+            while ((len = stbi__get8(s)) > 0)
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            // a data code before any clear code has been seen is rejected
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
+
+            if (oldcode >= 0) {
+               // add a table entry: the previous string plus one byte
+               p = &g->codes[avail++];
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               // when the code refers to the entry being created, the new byte
+               // is the first byte of the previous string
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;
+            } else if (code == avail)
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+
+            stbi__out_gif_code(g, (stbi__uint16) code);
+
+            // widen the code size once the table fills the current width (codes never exceed 12 bits)
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
+               codesize++;
+               codemask = (1 << codesize) - 1;
+            }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
+}
+
+// Decodes the next frame of a GIF stream into g's output buffer.
+// this function is designed to support animated gifs, although stb_image doesn't support it
+// two back is the image from two frames ago, used for a very specific disposal format
+//
+//   s        - input stream
+//   g        - decoder state; when g->out is 0 the header is parsed and the
+//              out/background/history buffers are allocated here
+//   comp     - forwarded to stbi__gif_header
+//   req_comp - unused by this function
+//   two_back - RGBA image to revert to for disposal method 3; may be 0
+//
+// Returns the result of stbi__process_gif_raster for the decoded frame,
+// (stbi_uc *) s when the stream termination code (0x3B) is read, or the
+// failure value of stbi__errpuc / a failed callee otherwise.
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
+{
+   int dispose;
+   int first_frame;
+   int pi;
+   int pcount;
+   STBI_NOTUSED(req_comp);
+
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0;
+   if (g->out == 0) {
+      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
+      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
+         return stbi__errpuc("too large", "GIF image is too large");
+      pcount = g->w * g->h;
+      // out and background hold 4 bytes per pixel, history holds 1 byte per pixel
+      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->history = (stbi_uc *) stbi__malloc(pcount);
+      // NOTE: buffers that were allocated are not freed here; both callers in this
+      // file free g->out / g->history / g->background after a failed call.
+      if (!g->out || !g->background || !g->history)
+         return stbi__errpuc("outofmem", "Out of memory");
+
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame.
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      // the disposal method lives in bits 2..4 of the graphic control flags
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
+
+      if (dispose == 3) { // use previous graphic
+         // only pixels touched by the previous frame (history != 0) are reverted
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
+            }
+         }
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
+   }
+
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
+   // walk the blocks of the stream until a frame is decoded, the stream ends, or an error occurs
+   for (;;) {
+      int tag = stbi__get8(s);
+      switch (tag) {
+         case 0x2C: /* Image Descriptor */
+         {
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            // the frame rectangle must lie inside the logical screen
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            // all positions below are byte offsets into the 4-bytes-per-pixel output
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            // if the width of the specified rectangle is 0, that means
+            // we may not see *any* pixels or the image is malformed;
+            // to make sure this is caught, move the current y down to
+            // max_y (which is what out_gif_code checks).
+            if (w == 0)
+               g->cur_y = g->max_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            // pick the colour table: one read from this frame (bit 0x80 of lflags) wins,
+            // otherwise fall back to g->pal when bit 0x80 of g->flags is set
+            if (g->lflags & 0x80) {
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (!o) return NULL;
+
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
+            }
+
+            return o;
+         }
+
+         case 0x21: // Extension block; the next byte selects which kind.
+         {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255;
+                  }
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0;
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1);
+                     g->transparent = -1;
+                  }
+               } else {
+                  // unexpected block size: skip the payload and go on to the next tag
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            // skip the remaining sub-blocks; a zero length byte ends the extension
+            while ((len = stbi__get8(s)) != 0) {
+               stbi__skip(s, len);
+            }
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+}
+
+// Out-of-memory cleanup shared by the failure paths of stbi__load_gif_main.
+//   g      - decoder state whose out/history/background buffers are released
+//   out    - accumulated frame buffer; released when non-null
+//   delays - optional caller-owned output; the array it points at is released
+//            and the caller's pointer is reset so it cannot dangle
+// Returns the result of stbi__errpuc("outofmem", ...) so callers can
+// `return` this call directly.
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+
+   if (out) STBI_FREE(out);
+   if (delays && *delays) {
+      STBI_FREE(*delays);
+      *delays = 0; // the caller still holds this pointer; don't leave it pointing at freed memory
+   }
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
+// Loads every frame of a GIF into one contiguous RGBA buffer (frames stored
+// back to back, g.w * g.h * 4 bytes each).
+//   s        - input stream
+//   delays   - optional; when non-null receives an array with one delay per
+//              frame (the value stbi__gif_load_next stores in g.delay)
+//   x, y     - receive the image width and height once a frame has been decoded
+//   z        - receives the number of frames decoded
+//   comp     - forwarded to stbi__gif_load_next
+//   req_comp - if non-zero and not 4, the whole buffer is converted with
+//              stbi__convert_format after loading
+// Returns the frame buffer, or the result of stbi__errpuc when the stream is
+// not a GIF or an allocation fails.
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0;
+      stbi_uc *u = 0;
+      stbi_uc *out = 0;
+      stbi_uc *two_back = 0;
+      stbi__gif g;
+      int stride;
+      int out_size = 0;
+      int delays_size = 0;
+
+      // only consumed by STBI_REALLOC_SIZED; silence unused warnings otherwise
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+
+            // grow (or create) the frame buffer and the delay array by one entry
+            if (out) {
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                   out = (stbi_uc*) tmp;
+                   out_size = layers * stride;
+               }
+
+               if (delays) {
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) {
+               // The frame decoded next may need to revert to the image that
+               // preceded the frame just stored, i.e. the stored frame at index
+               // layers - 2. Recomputed every iteration because the realloc
+               // above may have moved `out`. (The previous `out - 2 * stride`
+               // pointed before the start of the allocation.)
+               two_back = out + (layers - 2) * stride;
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);
+
+      // free temp buffer;
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      // do the final conversion after loading everything;
+      // the frames are treated as one image that is layers * g.w wide
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
+}
+
+// Loads a single image from a GIF stream by decoding one frame with
+// stbi__gif_load_next. On success *x / *y receive the size and the pixels are
+// converted to req_comp channels when req_comp is non-zero and not 4.
+// Returns the pixel buffer, or 0 when no frame could be decoded.
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__gif gif;
+   stbi_uc *pixels;
+   STBI_NOTUSED(ri);
+
+   memset(&gif, 0, sizeof(gif));
+
+   pixels = stbi__gif_load_next(s, &gif, comp, req_comp, 0);
+   if (pixels == (stbi_uc *) s)
+      pixels = 0;  // the stream terminator came before any frame
+
+   if (!pixels) {
+      // nothing to hand back: release the image buffer if one was allocated
+      if (gif.out)
+         STBI_FREE(gif.out);
+   } else {
+      *x = gif.w;
+      *y = gif.h;
+
+      // conversion happens only after a successful load
+      if (req_comp != 0 && req_comp != 4)
+         pixels = stbi__convert_format(pixels, 4, req_comp, gif.w, gif.h);
+   }
+
+   // these are only needed while decoding frames
+   STBI_FREE(gif.history);
+   STBI_FREE(gif.background);
+
+   return pixels;
+}
+
+// Info probe for GIF: simply forwards to stbi__gif_info_raw and returns its result.
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int found = stbi__gif_info_raw(s, x, y, comp);
+   return found;
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR loader
+// originally by Nicolas Schulz
+#ifndef STBI_NO_HDR
+// Compares the next bytes of the stream against `signature`.
+// Returns 1 (after rewinding the stream) when every byte matches; returns 0
+// at the first mismatch WITHOUT rewinding - the caller does that.
+static int stbi__hdr_test_core(stbi__context *s, const char *signature)
+{
+   const char *expected = signature;
+   while (*expected != '\0') {
+      if (stbi__get8(s) != *expected)
+         return 0;
+      ++expected;
+   }
+   stbi__rewind(s);
+   return 1;
+}
+
+// Returns non-zero when the stream starts with one of the two accepted HDR
+// signatures ("#?RADIANCE\n" or "#?RGBE\n"). The stream is rewound before
+// returning in every case.
+static int stbi__hdr_test(stbi__context* s)
+{
+   int matched;
+
+   if (stbi__hdr_test_core(s, "#?RADIANCE\n")) {
+      stbi__rewind(s);
+      return 1;
+   }
+   stbi__rewind(s);
+
+   // first signature failed, try the alternative one
+   matched = stbi__hdr_test_core(s, "#?RGBE\n");
+   stbi__rewind(s);
+   return matched;
+}
+
+#define STBI__HDR_BUFLEN  1024
+// Reads one line (up to, but not including, '\n') from the stream into
+// `buffer` and NUL-terminates it. At most STBI__HDR_BUFLEN-1 characters are
+// stored; the remainder of an over-long line is consumed and discarded.
+// Returns `buffer`.
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int used = 0;
+   char ch = (char) stbi__get8(z);
+
+   for (;;) {
+      if (stbi__at_eof(z)) break;
+      if (ch == '\n') break;
+
+      buffer[used] = ch;
+      ++used;
+
+      if (used == STBI__HDR_BUFLEN-1) {
+         // buffer is full: drop everything up to the end of this line
+         for (;;) {
+            if (stbi__at_eof(z)) break;
+            if (stbi__get8(z) == '\n') break;
+         }
+         break;
+      }
+
+      ch = (char) stbi__get8(z);
+   }
+
+   buffer[used] = 0;
+   return buffer;
+}
+
+// Expands one 4-byte RGBE pixel (`input`) into `req_comp` floats at `output`.
+//   req_comp 1: grey            req_comp 2: grey + 1.0
+//   req_comp 3: r, g, b         req_comp 4: r, g, b + 1.0
+// A zero exponent byte (input[3]) produces zero colour values.
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   float scale;
+
+   if (input[3] == 0) {
+      // zero exponent: colour channels are 0, the extra channel (if any) is 1
+      if (req_comp == 4) {
+         output[3] = 1;
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 3) {
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 2) {
+         output[1] = 1;
+         output[0] = 0;
+      } else if (req_comp == 1) {
+         output[0] = 0;
+      }
+      return;
+   }
+
+   // shared scale factor derived from the exponent byte
+   scale = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
+
+   if (req_comp <= 2) {
+      // grey output: average of the three mantissas
+      output[0] = (input[0] + input[1] + input[2]) * scale / 3;
+      if (req_comp == 2) output[1] = 1;
+   } else {
+      output[0] = input[0] * scale;
+      output[1] = input[1] * scale;
+      output[2] = input[2] * scale;
+      if (req_comp == 4) output[3] = 1;
+   }
+}
+
+// Loads a Radiance RGBE (.hdr) image as floats.
+//   s        - input stream
+//   x, y     - receive the width and height parsed from the header
+//   comp     - if non-null, receives 3
+//   req_comp - channels per output pixel; 0 is treated as 3
+//   ri       - unused
+// Returns a buffer of width * height * req_comp floats, or the result of
+// stbi__errpf on a malformed/unsupported file or allocation failure.
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+   const char *headerToken;
+   STBI_NOTUSED(ri);
+
+   // Check identifier
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
+
+   // Parse header
+   // header lines are read until an empty line; only the FORMAT line is inspected
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   // the only accepted layout is "-Y <height> +X <width>"
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scanlines
+   // widths below 8 or of 32768 and above are always read as flat (non-RLE) pixels
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           // entry point for the RLE branch below when the data turns out to be flat
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            // pixel 0 is done; continue with the flat reader at pixel 1 of row 0
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         // the next two bytes give the scanline length, high byte first; it must equal the width
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         // the scanline buffer (4 bytes per pixel) is allocated once and reused for every row
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         // the four byte planes (k = 0..3) are stored one after another, each RLE-coded
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         // convert the reassembled RGBE scanline into floats for row j
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+// Info probe for Radiance HDR files.
+// On success stores the height in *y, the width in *x and 3 in *comp (each
+// pointer may be NULL) and returns 1. On any failure the stream is rewound
+// and 0 is returned.
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *line;
+   int format_ok = 0;
+   int dummy;
+
+   // let the code below write unconditionally
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (stbi__hdr_test(s) != 0) {
+      // scan header lines up to the first empty one, looking for the FORMAT line
+      line = stbi__hdr_gettoken(s,buffer);
+      while (line[0] != 0) {
+         if (strcmp(line, "FORMAT=32-bit_rle_rgbe") == 0) format_ok = 1;
+         line = stbi__hdr_gettoken(s,buffer);
+      }
+
+      if (format_ok) {
+         // resolution line must read "-Y <height> +X <width>"
+         line = stbi__hdr_gettoken(s,buffer);
+         if (strncmp(line, "-Y ", 3) == 0) {
+            line += 3;
+            *y = (int) strtol(line, &line, 10);
+            while (*line == ' ') ++line;
+            if (strncmp(line, "+X ", 3) == 0) {
+               line += 3;
+               *x = (int) strtol(line, NULL, 10);
+               *comp = 3;
+               return 1;
+            }
+         }
+      }
+   }
+
+   // every failure path ends here
+   stbi__rewind( s );
+   return 0;
+}
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_BMP
+// Info probe for BMP files, built on stbi__bmp_parse_header.
+// On success stores s->img_x / s->img_y in *x / *y and 3 or 4 in *comp (each
+// pointer may be NULL) and returns 1; if the header cannot be parsed the
+// stream is rewound and 0 is returned.
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__bmp_data hdr;
+
+   hdr.all_a = 255;
+   if (stbi__bmp_parse_header(s, &hdr) == NULL) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   if (x != NULL) *x = s->img_x;
+   if (y != NULL) *y = s->img_y;
+   if (comp != NULL) {
+      // 4 channels only when an alpha mask is present, except for the
+      // 24-bpp / 0xff000000 mask combination, which is reported as 3
+      int has_alpha = (hdr.ma != 0) && !(hdr.bpp == 24 && hdr.ma == 0xff000000);
+      *comp = has_alpha ? 4 : 3;
+   }
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PSD
+// Info probe for PSD files.
+// Accepts only: signature 0x38425053, version 1, 0..16 channels, depth 8 or
+// 16 and a final 16-bit field equal to 3. On success *comp is set to 4 and 1
+// is returned; otherwise the stream is rewound and 0 is returned.
+// Note that *y and *x are written as soon as they are read, i.e. before the
+// depth and final field are validated.
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int channels, bits, dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   // signature, then version; && keeps the reads in order and stops at the first mismatch
+   if (stbi__get32be(s) == 0x38425053 && stbi__get16be(s) == 1) {
+      stbi__skip(s, 6);
+      channels = stbi__get16be(s);
+      if (channels >= 0 && channels <= 16) {
+         *y = stbi__get32be(s);
+         *x = stbi__get32be(s);
+         bits = stbi__get16be(s);
+         if ((bits == 8 || bits == 16) && stbi__get16be(s) == 3) {
+            *comp = 4;
+            return 1;
+         }
+      }
+   }
+
+   // common failure exit
+   stbi__rewind( s );
+   return 0;
+}
+
+// Probes whether the stream is a PSD file with 16 bits per channel.
+// Checks the same leading fields as stbi__psd_info (signature 0x38425053,
+// version 1, 0..16 channels) and then requires a depth of 16.
+// Returns 1 on a match; otherwise rewinds the stream and returns 0.
+static int stbi__psd_is16(stbi__context *s)
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   // Consume the two 32-bit fields that stbi__psd_info reads as height and
+   // width. They are read with a plain (void) cast instead of being wrapped in
+   // STBI_NOTUSED(): that macro is defined elsewhere and may expand to a
+   // sizeof expression, which would not evaluate the call and would leave the
+   // stream 8 bytes short of the depth field.
+   (void) stbi__get32be(s);
+   (void) stbi__get32be(s);
+   depth = stbi__get16be(s);
+   if (depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PIC
+// Info probe for PIC files.
+// On success stores the width/height in *x / *y, 4 or 3 in *comp (4 when any
+// packet has channel bit 0x10 set) and returns 1; each pointer may be NULL.
+// On any failure the stream is rewound and 0 is returned.
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained,dummy;
+   stbi__pic_packet packets[10];
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   // magic number
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
+
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   // reject images with more than 2^28 pixels
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   stbi__skip(s, 8);
+
+   // read the chain of channel packets; a zero `chained` byte marks the last one
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0])) {
+         // too many packets: rewind like every other failure path so the
+         // format probes that run after this one start at the beginning
+         stbi__rewind( s );
+         return 0;
+      }
+
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3);
+
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Does not support comments in the header section
+//    Does not support ASCII image data (formats P2 and P3)
+
+#ifndef STBI_NO_PNM
+
+// Returns 1 when the stream starts with "P5" or "P6" (stream left positioned
+// after those two bytes); otherwise rewinds the stream and returns 0.
+static int      stbi__pnm_test(stbi__context *s)
+{
+   char magic = (char) stbi__get8(s);
+   char kind  = (char) stbi__get8(s);
+
+   if (magic == 'P' && (kind == '5' || kind == '6'))
+      return 1;
+
+   stbi__rewind( s );
+   return 0;
+}
+
+// Loads a binary PGM/PPM image.
+//   s        - input stream; img_x / img_y / img_n are filled from the header
+//   x, y     - receive the width and height
+//   comp     - if non-null, receives the channel count found in the file
+//   req_comp - if non-zero and different from the file's channel count, the
+//              pixels are converted to this many channels
+//   ri       - bits_per_channel is set from the result of stbi__pnm_info
+// Returns the pixel buffer, 0 when the header is rejected, or the result of
+// stbi__errpuc for oversized / truncated files and allocation failure.
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *pixels;
+   int bytes_per_channel;
+   STBI_NOTUSED(ri);
+
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
+      return 0;
+
+   if (s->img_y > STBI_MAX_DIMENSIONS || s->img_x > STBI_MAX_DIMENSIONS)
+      return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+
+   bytes_per_channel = ri->bits_per_channel / 8;
+
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, bytes_per_channel, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   pixels = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, bytes_per_channel, 0);
+   if (!pixels) return stbi__errpuc("outofmem", "Out of memory");
+
+   // the raster follows the header as raw bytes
+   if (!stbi__getn(s, pixels, s->img_n * s->img_x * s->img_y * bytes_per_channel)) {
+      STBI_FREE(pixels);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
+
+   // no conversion requested, or the file already has the requested layout
+   if (req_comp == 0 || req_comp == s->img_n)
+      return pixels;
+
+   // the conversion result is returned as is, including a null result
+   if (ri->bits_per_channel == 16)
+      return (stbi_uc *) stbi__convert_format16((stbi__uint16 *) pixels, s->img_n, req_comp, s->img_x, s->img_y);
+   return stbi__convert_format(pixels, s->img_n, req_comp, s->img_x, s->img_y);
+}
+
+// Returns 1 for space, tab, newline, vertical tab, form feed and carriage
+// return; 0 for any other character.
+static int      stbi__pnm_isspace(char c)
+{
+   switch (c) {
+      case ' ':
+      case '\t':
+      case '\n':
+      case '\v':
+      case '\f':
+      case '\r':
+         return 1;
+      default:
+         return 0;
+   }
+}
+
+// Advances past whitespace and '#' comments in a PNM header.
+// *c is the current look-ahead character; on return it holds the first
+// character that is neither whitespace nor part of a comment (or whatever was
+// last read when the end of the stream is hit).
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   for (;;) {
+      // swallow a run of whitespace
+      for (;;) {
+         if (stbi__at_eof(s)) break;
+         if (!stbi__pnm_isspace(*c)) break;
+         *c = (char) stbi__get8(s);
+      }
+
+      if (stbi__at_eof(s)) return;
+      if (*c != '#') return;
+
+      // swallow a comment up to the end of its line, then look for more whitespace
+      for (;;) {
+         if (stbi__at_eof(s)) break;
+         if (*c == '\n' || *c == '\r') break;
+         *c = (char) stbi__get8(s);
+      }
+   }
+}
+
+// Returns 1 when c is one of '0'..'9', 0 otherwise.
+static int      stbi__pnm_isdigit(char c)
+{
+   if (c < '0') return 0;
+   if (c > '9') return 0;
+   return 1;
+}
+
+// Parses an unsigned decimal integer from a PNM header.
+// *c is the current look-ahead character and is advanced past the digits.
+// Returns the parsed value (0 when *c is not a digit), or the result of
+// stbi__err when the number is judged too large for a 32-bit int.
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   int acc = 0;
+
+   for (;;) {
+      if (stbi__at_eof(s)) break;
+      if (!stbi__pnm_isdigit(*c)) break;
+
+      acc = acc*10 + (*c - '0');
+      *c = (char) stbi__get8(s);
+
+      // 214748364 is INT_MAX / 10: beyond it, or at it with a next character above '7', give up
+      if (acc >= 214748364) {
+         if (acc > 214748364 || *c > '7')
+            return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
+      }
+   }
+
+   return acc;
+}
+
+// Parses the header of a binary PGM ("P5") or PPM ("P6") file.
+//   s          - input stream; rewound before parsing starts
+//   x, y, comp - receive the width, the height and the channel count (1 for
+//                P5, 3 for P6); each may be NULL
+// Returns the bits per channel implied by the max value (8 or 16), or 0 when
+// the stream is not P5/P6 (stream rewound) or the header is invalid (result
+// of stbi__err; the stream is NOT rewound on those paths).
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv, dummy;
+   char c, p, t;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind(s);
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   // stbi__pnm_getinteger yields 0 both for a literal zero and for an overflow
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid height", "PPM image header had zero or overflowing height");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
+   else
+      return 8;
+}
+
+// Returns 1 when stbi__pnm_info reports 16 bits per channel for the stream,
+// 0 otherwise.
+static int stbi__pnm_is16(stbi__context *s)
+{
+   int bits = stbi__pnm_info(s, NULL, NULL, NULL);
+   return bits == 16;
+}
+#endif
+
+// Format dispatcher for the stbi_info* entry points: tries each compiled-in
+// format's info probe in a fixed order and returns 1 as soon as one of them
+// recognises the stream (that probe fills in x, y and comp).
+// Returns the result of stbi__err("unknown image type", ...) when none match.
+// Each probe is compiled out when the corresponding STBI_NO_* macro is defined.
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
+
+   // note: stbi__pnm_info returns the bit depth (8 or 16) on success, which is
+   // simply treated as "true" here
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
+   #endif
+
+   // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+       return 1;
+   #endif
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Format dispatcher for the stbi_is_16_bit* entry points: returns 1 when one
+// of the compiled-in PNG, PSD or PNM 16-bit probes accepts the stream, else 0.
+// Only these three formats are probed.
+static int stbi__is_16_main(stbi__context *s)
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
+   return 0;
+}
+
+#ifndef STBI_NO_STDIO
+// Reports the dimensions and channel count of the image file `filename`.
+// Returns the result of stbi_info_from_file, or the result of stbi__err when
+// the file cannot be opened.
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{
+    int found;
+    FILE *fp = stbi__fopen(filename, "rb");
+
+    if (fp != NULL) {
+        found = stbi_info_from_file(fp, x, y, comp);
+        fclose(fp);
+        return found;
+    }
+    return stbi__err("can't fopen", "Unable to open file");
+}
+
+// Reports the dimensions and channel count of the image read from `f`.
+// The file position found on entry is restored before returning.
+// Returns the result of stbi__info_main.
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+   long start = ftell(f);   // remembered so the caller's position survives the probing
+
+   stbi__start_file(&ctx, f);
+   found = stbi__info_main(&ctx, x, y, comp);
+   fseek(f, start, SEEK_SET);
+
+   return found;
+}
+
+// Reports whether the image file `filename` is a 16-bit-per-channel image.
+// Returns the result of stbi_is_16_bit_from_file, or the result of stbi__err
+// when the file cannot be opened.
+STBIDEF int stbi_is_16_bit(char const *filename)
+{
+    int is16;
+    FILE *fp = stbi__fopen(filename, "rb");
+
+    if (fp != NULL) {
+        is16 = stbi_is_16_bit_from_file(fp);
+        fclose(fp);
+        return is16;
+    }
+    return stbi__err("can't fopen", "Unable to open file");
+}
+
+// Reports whether the image read from `f` is a 16-bit-per-channel image.
+// The file position found on entry is restored before returning.
+// Returns the result of stbi__is_16_main.
+STBIDEF int stbi_is_16_bit_from_file(FILE *f)
+{
+   stbi__context ctx;
+   int is16;
+   long start = ftell(f);   // remembered so the caller's position survives the probing
+
+   stbi__start_file(&ctx, f);
+   is16 = stbi__is_16_main(&ctx);
+   fseek(f, start, SEEK_SET);
+
+   return is16;
+}
+#endif // !STBI_NO_STDIO
+
+// Reports the dimensions and channel count of an image held in memory
+// (`len` bytes at `buffer`). Returns the result of stbi__info_main.
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+
+   stbi__start_mem(&ctx, buffer, len);
+   found = stbi__info_main(&ctx, x, y, comp);
+   return found;
+}
+
+// Reports the dimensions and channel count of an image read through the
+// user-supplied I/O callbacks `c` (with `user` as their context pointer).
+// Returns the result of stbi__info_main.
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   found = stbi__info_main(&ctx, x, y, comp);
+   return found;
+}
+
+// Reports whether an image held in memory (`len` bytes at `buffer`) is a
+// 16-bit-per-channel image. Returns the result of stbi__is_16_main.
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
+{
+   stbi__context ctx;
+   int is16;
+
+   stbi__start_mem(&ctx, buffer, len);
+   is16 = stbi__is_16_main(&ctx);
+   return is16;
+}
+
+// Reports whether an image read through the user-supplied I/O callbacks `c`
+// (with `user` as their context pointer) is a 16-bit-per-channel image.
+// Returns the result of stbi__is_16_main.
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context ctx;
+   int is16;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   is16 = stbi__is_16_main(&ctx);
+   return is16;
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50  (2006-11-19)
+              first released version
+*/
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/darknet-master/3rdparty/stb/include/stb_image_write.h b/darknet-master/3rdparty/stb/include/stb_image_write.h
new file mode 100644
index 0000000..e4b32ed
--- /dev/null
+++ b/darknet-master/3rdparty/stb/include/stb_image_write.h
@@ -0,0 +1,1724 @@
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
+
+   Before #including,
+
+       #define STB_IMAGE_WRITE_IMPLEMENTATION
+
+   in the file that you want to have the implementation.
+
+   Will probably not work correctly with strict-aliasing optimizations.
+
+ABOUT:
+
+   This header file is a library for writing images to C stdio or a callback.
+
+   The PNG output is not optimal; it is 20-50% larger than the file
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
+
+BUILDING:
+
+   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
+   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
+   malloc,realloc,free.
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
+
+UNICODE:
+
+   If compiling for Windows and you wish to use Unicode filenames, compile
+   with
+       #define STBIW_WINDOWS_UTF8
+   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
+   Windows wchar_t filenames to utf8.
+
+USAGE:
+
+   There are five functions, one for each image file format:
+
+     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
+
+   Each function returns 0 on failure and non-0 on success.
+
+   The functions create an image file defined by the parameters. The image
+   is a rectangle of pixels stored from left-to-right, top-to-bottom.
+   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
+   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
+   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
+   The *data pointer points to the first byte of the top-left-most pixel.
+   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
+   a row of pixels to the first byte of the next row of pixels.
+
+   PNG creates output files with the same number of components as the input.
+   The BMP format expands Y to RGB in the file format and does not
+   output alpha.
+
+   PNG supports writing rectangles of data even when the bytes storing rows of
+   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
+   by supplying the stride between the beginning of adjacent rows. The other
+   formats do not. (Thus you cannot write a native-format BMP through the BMP
+   writer, both because it is in BGR order and because it may have padding
+   at the end of the line.)
+
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
+   HDR expects linear float data. Since the format is always 32-bit rgb(e)
+   data, alpha (if provided) is discarded, and for monochrome data it is
+   replicated across all three channels.
+
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
+CREDITS:
+
+
+   Sean Barrett           -    PNG/BMP/TGA
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
+   bugfixes:
+      github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+      Andrew Kensler
+
+LICENSE
+
+  See end of file for license information.
+
+*/
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+
+#include <stdlib.h>
+
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
+#ifdef __cplusplus
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
+#endif
+
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
+
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
+
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+
+#ifndef STBIW_MALLOC
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#endif
+
+
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+
+
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+static int stbi__flip_vertically_on_write = 0;
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag) // sets the vertical-flip toggle; non-zero requests flipped output
+{
+   stbi__flip_vertically_on_write = flag; // file-level toggle consulted by the writers (e.g. stbiw__write_pixels, stbi_write_tga_core)
+}
+
+typedef struct // output state shared by the writers in this file
+{
+   stbi_write_func *func; // callback that receives every chunk of output bytes
+   void *context; // opaque pointer handed back to func as its first argument
+   unsigned char buffer[64]; // staging area filled by stbiw__write1 / stbiw__write3
+   int buf_used; // number of bytes currently pending in buffer
+} stbi__write_context;
+
+// initialize a callback-based context; only func and context are set here (buffer/buf_used are left as the caller initialized them)
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->func    = c;
+   s->context = context;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size) // stbi_write_func implementation backed by stdio
+{
+   fwrite(data,1,size,(FILE*) context); // context is the FILE* installed by stbi__start_write_file; the fwrite result is not checked
+}
+
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) // converts a wchar_t string to UTF-8 in buffer
+{
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); // result of WideCharToMultiByte is returned unchanged; -1 as the input length means NUL-terminated per the Win32 API
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode) // fopen wrapper; yields 0 when the file cannot be opened
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) // UTF-8 filename -> wide string
+      return 0; // a failed conversion is reported the same way as a failed open
+
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) // same conversion for the mode string
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0; // normalize a non-zero error code to a null FILE*
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0; // normalize a non-zero error code to a null FILE*
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename) // opens filename in "wb" mode and binds s to stdio output; returns non-zero on success
+{
+   FILE *f = stbiw__fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); // runs even when f is NULL, so callers must check the return value before writing
+   return f != NULL;
+}
+
+static void stbi__end_write_file(stbi__write_context *s) // counterpart of stbi__start_write_file
+{
+   fclose((FILE *)s->context); // s->context holds the FILE* stored by stbi__start_write_file
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
+typedef unsigned int stbiw_uint32;
+typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
+
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) // writes one int argument per format character: '1','2','4' = byte width, ' ' is skipped
+{
+   while (*fmt) {
+      switch (*fmt++) {
+         case ' ': break; // spaces only group the format string visually
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); // low 8 bits of the argument
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int); // 2 bytes, least-significant byte first
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int); // 4 bytes, least-significant byte first
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
+         default: // any other format character is a programming error
+            STBIW_ASSERT(0);
+            return;
+      }
+   }
+}
+
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) // variadic front end for stbiw__writefv
+{
+   va_list v;
+   va_start(v, fmt);
+   stbiw__writefv(s, fmt, v);
+   va_end(v);
+}
+
+static void stbiw__write_flush(stbi__write_context *s) // hands any pending buffered bytes to the callback
+{
+   if (s->buf_used) { // nothing is sent when the buffer is empty
+      s->func(s->context, &s->buffer, s->buf_used);
+      s->buf_used = 0; // buffer is empty again
+   }
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c) // writes a single byte
+{
+   s->func(s->context, &c, 1); // goes straight to the callback; s->buffer is not used
+}
+
+static void stbiw__write1(stbi__write_context *s, unsigned char a) // appends one byte to the staging buffer
+{
+   if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) // would not fit: flush first
+      stbiw__write_flush(s);
+   s->buffer[s->buf_used++] = a;
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) // appends three bytes (a, b, c in that order) to the staging buffer
+{
+   int n;
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) // all three bytes must fit together, otherwise flush first
+      stbiw__write_flush(s);
+   n = s->buf_used; // read after the possible flush, which resets buf_used
+   s->buf_used = n+3;
+   s->buffer[n+0] = a;
+   s->buffer[n+1] = b;
+   s->buffer[n+2] = c;
+}
+
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) // buffers one pixel read from d (comp channels)
+{
+   unsigned char bg[3] = { 255, 0, 255}, px[3]; // bg: background used when alpha is composited away
+   int k;
+
+   if (write_alpha < 0) // negative: alpha is written before the color bytes
+      stbiw__write1(s, d[comp - 1]);
+
+   switch (comp) {
+      case 2: // 2 channels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            stbiw__write1(s, d[0]);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; // d[3] == 255 gives d[k], d[3] == 0 gives bg[k]
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // rgb_dir = -1 emits d[2],d[1],d[0]; rgb_dir = +1 emits d[0],d[1],d[2]
+         break;
+   }
+   if (write_alpha > 0) // positive: alpha is written after the color bytes
+      stbiw__write1(s, d[comp - 1]);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) // writes y rows of x pixels from data, row by row
+{
+   stbiw_uint32 zero = 0; // source of the padding bytes written after each row
+   int i,j, j_end;
+
+   if (y <= 0) // nothing to write
+      return;
+
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1; // the global flip toggle reverses the row order
+
+   if (vdir < 0) { // negative step: start at the last row and walk towards row 0
+      j_end = -1; j = y-1;
+   } else {
+      j_end =  y; j = 0;
+   }
+
+   for (; j != j_end; j += vdir) {
+      for (i=0; i < x; ++i) {
+         unsigned char *d = (unsigned char *) data + (j*x+i)*comp; // rows are tightly packed: x*comp bytes each
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
+      }
+      stbiw__write_flush(s); // push the buffered row out before writing padding directly
+      s->func(s->context, &zero, scanline_pad); // scanline_pad bytes taken from the 4-byte zero value
+   }
+}
+
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) // writes a header described by fmt and the varargs, then the pixel data
+{
+   if (y < 0 || x < 0) { // negative dimensions: fail without writing anything
+      return 0;
+   } else {
+      va_list v;
+      va_start(v, fmt);
+      stbiw__writefv(s, fmt, v); // header fields
+      va_end(v);
+      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
+      return 1; // always 1 once the dimension check has passed
+   }
+}
+
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) // writes a BMP: 24 bits per pixel unless comp == 4, then 32 bits with alpha
+{
+   if (comp != 4) {
+      // write RGB bitmap
+      int pad = (-x*3) & 3; // bytes needed to round each 3*x-byte row up to a multiple of 4
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   }
+}
+
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // BMP output through a user-supplied callback
+{
+   stbi__write_context s = { 0 }; // zero-initialized so buf_used starts at 0
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_bmp_core(&s, x, y, comp, data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) // BMP output to a named file
+{
+   stbi__write_context s = { 0 }; // zero-initialized so buf_used starts at 0
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_bmp_core(&s, x, y, comp, data);
+      stbi__end_write_file(&s); // file is closed whether or not the core writer succeeded
+      return r;
+   } else
+      return 0; // the file could not be opened
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) // writes a TGA; RLE packets are used unless stbi_write_tga_with_rle is 0
+{
+   int has_alpha = (comp == 2 || comp == 4);
+   int colorbytes = has_alpha ? comp-1 : comp;
+   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+
+   if (y < 0 || x < 0) // negative dimensions: fail without writing anything
+      return 0;
+
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+      int jend, jdir;
+
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); // same header as the non-RLE path except the image type is format+8
+
+      if (stbi__flip_vertically_on_write) { // flipped: rows are emitted first to last
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else { // default: rows are emitted last to first
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len;
+
+         for (i = 0; i < x; i += len) { // each iteration emits one packet covering len pixels
+            unsigned char *begin = row + i * comp;
+            int diff = 1; // non-zero: raw packet, zero: run packet
+            len = 1;
+
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp); // do the first two pixels of the packet differ?
+               if (diff) {
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) { // grow the raw packet; prev trails pixel k by two pixels
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else { // match found: drop the last pixel from this packet and stop
+                        --len;
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) { // grow the run while pixels equal the first one (max 128)
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1); // raw packet header: count-1, high bit clear since len <= 128
+               stbiw__write1(s, header);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129); // run packet header: after masking to 8 bits this equals (len-1) + 128
+               stbiw__write1(s, header);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); // the repeated pixel is stored once
+            }
+         }
+      }
+      stbiw__write_flush(s); // push out whatever is still buffered
+   }
+   return 1;
+}
+
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // TGA output through a user-supplied callback
+{
+   stbi__write_context s = { 0 }; // zero-initialized so buf_used starts at 0
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) // TGA output to a named file
+{
+   stbi__write_context s = { 0 }; // zero-initialized so buf_used starts at 0
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
+      stbi__end_write_file(&s); // file is closed whether or not the core writer succeeded
+      return r;
+   } else
+      return 0; // the file could not be opened
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR writer
+// by Baldur Karlsson
+
+// Larger of a and b. This is a function-like macro: the winning argument is
+// evaluated twice, so do not pass expressions with side effects.
+#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
+
+#ifndef STBI_WRITE_NO_STDIO
+
+// Convert a linear RGB float triple (linear[0..2]) into four bytes in rgbe:
+// three scaled channel bytes followed by a shared exponent byte biased by 128.
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
+{
+   int exponent;
+   float scale;
+   // the largest of the three channels determines the shared exponent
+   float brightest = linear[1] > linear[2] ? linear[1] : linear[2];
+   if (linear[0] > brightest)
+      brightest = linear[0];
+
+   // below this threshold the pixel is written as all zeros
+   if (brightest < 1e-32f) {
+      rgbe[0] = 0;
+      rgbe[1] = 0;
+      rgbe[2] = 0;
+      rgbe[3] = 0;
+      return;
+   }
+
+   // frexp() splits the maximum into mantissa and power-of-two exponent;
+   // mantissa*256/max is the factor applied to every channel
+   scale = (float) frexp(brightest, &exponent) * 256.0f/brightest;
+
+   rgbe[0] = (unsigned char)(linear[0] * scale);
+   rgbe[1] = (unsigned char)(linear[1] * scale);
+   rgbe[2] = (unsigned char)(linear[2] * scale);
+   rgbe[3] = (unsigned char)(exponent + 128);
+}
+
+// Emit one RLE run through the context callback: a header byte holding
+// length+128, followed by the single repeated byte value.
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
+{
+   unsigned char run_header;
+
+   STBIW_ASSERT(length+128 <= 255);
+   run_header = STBIW_UCHAR(length+128);
+   s->func(s->context, &run_header, 1);
+   s->func(s->context, &databyte, 1);
+}
+
+// Emit a literal (non-run) packet through the context callback: a header byte
+// holding the count, followed by that many bytes copied from data.
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
+{
+   unsigned char count_header;
+
+   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
+   count_header = STBIW_UCHAR(length);
+   s->func(s->context, &count_header, 1);
+   s->func(s->context, data, length);
+}
+
+// Copy pixel x of scanline into an RGB triple. 3- and 4-channel input supplies
+// R,G,B directly (a fourth channel is not read); any other channel count
+// replicates channel 0 into all three.
+static void stbiw__hdr_fetch_linear(float *linear, float *scanline, int x, int ncomp)
+{
+   float *px = scanline + x*ncomp;
+   if (ncomp == 3 || ncomp == 4) {
+      linear[2] = px[2];
+      linear[1] = px[1];
+      linear[0] = px[0];
+   } else {
+      linear[0] = linear[1] = linear[2] = px[0];
+   }
+}
+
+// Write one scanline of width float pixels (ncomp channels each) as RGBE.
+// Widths below 8 or at/above 32768 are written as flat 4-byte pixels. All other
+// widths are converted into scratch (which must hold width*4 bytes) as four
+// separate byte planes, then each plane is run-length encoded after a 4-byte
+// scanline header.
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
+{
+   unsigned char scanlineheader[4];
+   unsigned char rgbe[4];
+   float linear[3];
+   int x, plane_idx, run_start;
+
+   /* skip RLE for images too small or large */
+   if (width < 8 || width >= 32768) {
+      for (x = 0; x < width; ++x) {
+         stbiw__hdr_fetch_linear(linear, scanline, x, ncomp);
+         stbiw__linear_to_rgbe(rgbe, linear);
+         s->func(s->context, rgbe, 4);
+      }
+      return;
+   }
+
+   /* encode into scratch buffer, one plane per RGBE byte */
+   for (x = 0; x < width; ++x) {
+      stbiw__hdr_fetch_linear(linear, scanline, x, ncomp);
+      stbiw__linear_to_rgbe(rgbe, linear);
+      for (plane_idx = 0; plane_idx < 4; ++plane_idx)
+         scratch[x + width*plane_idx] = rgbe[plane_idx];
+   }
+
+   /* header: two marker bytes, then the width as high byte / low byte */
+   scanlineheader[0] = 2;
+   scanlineheader[1] = 2;
+   scanlineheader[2] = (width&0xff00)>>8;
+   scanlineheader[3] = (width&0x00ff);
+   s->func(s->context, scanlineheader, 4);
+
+   /* RLE each component separately */
+   for (plane_idx = 0; plane_idx < 4; ++plane_idx) {
+      unsigned char *plane = scratch + width*plane_idx;
+
+      x = 0;
+      while (x < width) {
+         int chunk;
+
+         // find the first position where three equal bytes begin
+         for (run_start = x; run_start+2 < width; ++run_start) {
+            if (plane[run_start] == plane[run_start+1] && plane[run_start] == plane[run_start+2])
+               break;
+         }
+         // no run found: everything up to the end of the row is literal data
+         if (run_start+2 >= width)
+            run_start = width;
+
+         // dump literal bytes up to the run, at most 128 per packet
+         while (x < run_start) {
+            chunk = run_start - x;
+            if (chunk > 128) chunk = 128;
+            stbiw__write_dump_data(s, chunk, &plane[x]);
+            x += chunk;
+         }
+
+         // only true when the search loop above stopped on a run
+         if (run_start+2 < width) {
+            // extend to the first byte that differs from the run value
+            while (run_start < width && plane[run_start] == plane[x])
+               ++run_start;
+            // output the run, at most 127 per packet
+            while (x < run_start) {
+               chunk = run_start - x;
+               if (chunk > 127) chunk = 127;
+               stbiw__write_run_data(s, chunk, plane[x]);
+               x += chunk;
+            }
+         }
+      }
+   }
+}
+
+// Write a complete Radiance HDR stream for an x-by-y float image with comp
+// channels per pixel through the context callback.
+//   s    - write context whose func/context receive the output bytes
+//   data - x*y*comp floats, rows stored top to bottom
+// Returns 1 on success, 0 when the dimensions are not positive, data is NULL,
+// or the scanline scratch buffer cannot be allocated. Nothing is written in
+// the failure cases.
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
+{
+   unsigned char *scratch;
+   int i, len;
+   char buffer[128];
+   char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+
+   // Each component is stored separately. Allocate scratch space for full output scanline.
+   // Allocate before emitting anything so a failure leaves no partial output;
+   // the RLE path in stbiw__write_hdr_scanline writes through this pointer.
+   scratch = (unsigned char *) STBIW_MALLOC(x*4);
+   if (scratch == NULL)
+      return 0;
+
+   s->func(s->context, header, sizeof(header)-1);
+
+#ifdef __STDC_LIB_EXT1__
+   len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
+   len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+   s->func(s->context, buffer, len);
+
+   // rows are emitted bottom-up when stbi__flip_vertically_on_write is set
+   for(i=0; i < y; i++)
+      stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
+   STBIW_FREE(scratch);
+   return 1;
+}
+
+// Encode an x-by-y float image with comp channels as Radiance HDR through a
+// user callback. The write context is initialised from func/context by
+// stbi__start_write_callbacks; the return value is that of stbi_write_hdr_core.
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
+{
+   stbi__write_context writer = { 0 };
+   int result;
+
+   stbi__start_write_callbacks(&writer, func, context);
+   // the core routine takes a non-const pointer, hence the cast
+   result = stbi_write_hdr_core(&writer, x, y, comp, (float *) data);
+   return result;
+}
+
+// Write an x-by-y float image with comp channels to the named file as
+// Radiance HDR. Returns 0 when stbi__start_write_file reports failure,
+// otherwise the result of stbi_write_hdr_core.
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+{
+   stbi__write_context writer = { 0 };
+   int result;
+
+   if (!stbi__start_write_file(&writer, filename))
+      return 0;
+
+   result = stbi_write_hdr_core(&writer, x, y, comp, (float *) data);
+   stbi__end_write_file(&writer);
+   return result;
+}
+#endif // STBI_WRITE_NO_STDIO
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+#ifndef STBIW_ZLIB_COMPRESS
+// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
+// Layout: two ints sit immediately before the element pointer `a`:
+//   stbiw__sbraw(a)[0] = capacity in elements (stbiw__sbm)
+//   stbiw__sbraw(a)[1] = number of elements in use (stbiw__sbn)
+// A NULL `a` is a valid empty buffer for sbpush/sbcount/sbfree; sbm/sbn/sbraw
+// themselves must only be applied to a non-NULL buffer.
+#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
+
+// true when `a` is NULL or adding n elements would reach the capacity
+#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
+#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
+// `a` must be an lvalue: its address is passed so the pointer can be updated
+#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
+
+// append v, growing first if needed; `a` is evaluated more than once
+#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
+#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
+// frees the raw allocation (header included); does not reset `a`
+#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
+
+// Grow (or create) the stretchy buffer referenced by *arr so that it has room
+// for at least `increment` more items of `itemsize` bytes. A new buffer gets
+// capacity increment+1 and a count of 0; an existing one gets twice its old
+// capacity plus increment. On success *arr is updated to the new element
+// pointer. If the reallocation fails, *arr is left unchanged.
+// Returns *arr.
+static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
+{
+   int had_buffer = (*arr != 0);
+   int new_cap;
+   void *grown;
+   int *header;
+
+   if (had_buffer)
+      new_cap = 2*stbiw__sbm(*arr)+increment;
+   else
+      new_cap = increment+1;
+
+   // the allocation carries a two-int header in front of the items
+   grown = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * new_cap + sizeof(int)*2);
+   STBIW_ASSERT(grown);
+   if (!grown)
+      return *arr;
+
+   header = (int *) grown;
+   if (!had_buffer)
+      header[1] = 0;          // fresh buffer starts empty
+   header[0] = new_cap;
+   *arr = (void *) (header + 2);
+   return *arr;
+}
+
+// Move every complete byte out of the bit accumulator (*bitbuffer holding
+// *bitcount valid bits, lowest bits first) onto the end of the stretchy buffer
+// data. Returns the possibly reallocated buffer pointer.
+static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
+{
+   for (; *bitcount >= 8; *bitcount -= 8) {
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
+      *bitbuffer >>= 8;
+   }
+   return data;
+}
+
+// Return the lowest `codebits` bits of `code` in reversed bit order.
+static int stbiw__zlib_bitrev(int code, int codebits)
+{
+   int reversed = 0;
+   for (; codebits != 0; --codebits) {
+      // shift the next low bit of code into the bottom of the result
+      reversed = (reversed << 1) | (code & 1);
+      code >>= 1;
+   }
+   return reversed;
+}
+
+// Count how many leading bytes of a and b are equal, examining at most
+// `limit` bytes and never more than 258.
+static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
+{
+   int cap = limit < 258 ? limit : 258;
+   int matched = 0;
+   while (matched < cap && a[matched] == b[matched])
+      ++matched;
+   return matched;
+}
+
+// Hash the three bytes at data[0..2] into an unsigned value; callers mask the
+// result down to the hash table size.
+static unsigned int stbiw__zhash(unsigned char *data)
+{
+   // pack the three bytes little-endian, then scramble with shift/xor/add rounds
+   stbiw_uint32 h = data[0] + (data[1] << 8) + (data[2] << 16);
+   h = h ^ (h << 3);
+   h = h + (h >> 5);
+   h = h ^ (h << 4);
+   h = h + (h >> 17);
+   h = h ^ (h << 25);
+   h = h + (h >> 6);
+   return h;
+}
+
+// The macros below operate on locals named `out` (stretchy output buffer),
+// `bitbuf` and `bitcount`, which must be in scope where they are used.
+// flush: move complete bytes from the bit accumulator into out.
+#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
+// add: OR `code` in above the bits already pending, account for `codebits`
+// more bits, then flush whole bytes.
+#define stbiw__zlib_add(code,codebits) \
+      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
+// huffa: add a c-bit code b after reversing its bit order.
+#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
+// default huffman tables
+// huff1: symbols 0..143 -> 8-bit codes starting at 0x30
+// huff2: symbols 144..255 -> 9-bit codes starting at 0x190
+// huff3: symbols 256..279 -> 7-bit codes starting at 0
+// huff4: symbols 280 and up -> 8-bit codes starting at 0xc0
+#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
+#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
+#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
+#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
+// huff: emit any literal/length symbol; huffb: emit a byte value (0..255) only
+#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
+
+// number of hash buckets; a power of two because hashes are reduced with & (stbiw__ZHASH-1)
+#define stbiw__ZHASH   16384
+
+#endif // STBIW_ZLIB_COMPRESS
+
+// Compress data_len bytes at data into a zlib stream (2-byte header, one
+// fixed-Huffman DEFLATE block or a series of stored blocks, Adler-32 trailer).
+//   out_len - receives the length in bytes of the returned stream
+//   quality - values below 5 are raised to 5; each hash bucket keeps at most
+//             2*quality candidate positions
+// Returns a pointer that can be passed to STBIW_FREE, or NULL if the hash
+// table cannot be allocated. When STBIW_ZLIB_COMPRESS is defined the call is
+// forwarded to that implementation instead.
+STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
+{
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
+   // lengthc/distc: base match length / distance for each code index;
+   // lengtheb/disteb: number of extra bits that follow that code.
+   // The final entry of lengthc and distc is a sentinel for the search loops below.
+   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
+   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
+   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
+   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+   unsigned int bitbuf=0;
+   int i,j, bitcount=0;
+   unsigned char *out = NULL;
+   // one stretchy list of earlier input positions per hash bucket
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
+   if (hash_table == NULL)
+      return NULL;
+   if (quality < 5) quality = 5;
+
+   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
+   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
+   stbiw__zlib_add(1,1);  // BFINAL = 1
+   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      hash_table[i] = NULL;
+
+   i=0;
+   // stop 3 bytes early so the 3-byte hash (and the lazy-match hash at i+1) stays in bounds
+   while (i < data_len-3) {
+      // hash next 3 bytes of data to be compressed
+      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
+      unsigned char *bestloc = 0;
+      unsigned char **hlist = hash_table[h];
+      int n = stbiw__sbcount(hlist);
+      // scan the bucket for the longest match; >= lets a later (closer) entry win ties
+      for (j=0; j < n; ++j) {
+         if (hlist[j]-data > i-32768) { // if entry lies within window
+            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
+            if (d >= best) { best=d; bestloc=hlist[j]; }
+         }
+      }
+      // when hash table entry is too long, delete half the entries
+      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
+         // keep the newer half by sliding it to the front
+         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
+         stbiw__sbn(hash_table[h]) = quality;
+      }
+      stbiw__sbpush(hash_table[h],data+i);
+
+      if (bestloc) {
+         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
+         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
+         hlist = hash_table[h];
+         n = stbiw__sbcount(hlist);
+         for (j=0; j < n; ++j) {
+            if (hlist[j]-data > i-32767) {
+               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
+               if (e > best) { // if next match is better, bail on current match
+                  bestloc = NULL;
+                  break;
+               }
+            }
+         }
+      }
+
+      if (bestloc) {
+         int d = (int) (data+i - bestloc); // distance back
+         STBIW_ASSERT(d <= 32767 && best <= 258);
+         // find the length code whose range contains best, emit it plus extra bits
+         for (j=0; best > lengthc[j+1]-1; ++j);
+         stbiw__zlib_huff(j+257);
+         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
+         // same for the distance: 5-bit code (bit-reversed) plus extra bits
+         for (j=0; d > distc[j+1]-1; ++j);
+         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
+         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
+         i += best;
+      } else {
+         // no usable match: emit the byte as a literal
+         stbiw__zlib_huffb(data[i]);
+         ++i;
+      }
+   }
+   // write out final bytes
+   for (;i < data_len; ++i)
+      stbiw__zlib_huffb(data[i]);
+   stbiw__zlib_huff(256); // end of block
+   // pad with 0 bits to byte boundary
+   while (bitcount)
+      stbiw__zlib_add(0,1);
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
+
+   // store uncompressed instead if compression was worse
+   // (right-hand side: 2 header bytes plus 5 bytes of overhead per stored block)
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         // raw copy past the current count; no sbpush/grow is performed for these bytes
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
+   {
+      // compute adler32 on input
+      // sums are reduced mod 65521 once per chunk; the first chunk is the
+      // remainder (data_len % 5552), all later chunks are 5552 bytes
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
+      while (j < data_len) {
+         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
+         s1 %= 65521; s2 %= 65521;
+         j += blocklen;
+         blocklen = 5552;
+      }
+      // trailer is written most significant byte first: s2 then s1
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
+   }
+   *out_len = stbiw__sbn(out);
+   // make returned pointer freeable
+   // (slide the payload down over the stretchy-buffer header so the result is the raw allocation)
+   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
+   return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
+}
+
+// Compute the CRC-32 of len bytes at buffer using a 256-entry lookup table,
+// starting from all ones and inverting the final value. When STBIW_CRC32 is
+// defined the call is forwarded to that implementation instead.
+static unsigned int stbiw__crc32(unsigned char *buffer, int len)
+{
+#ifdef STBIW_CRC32
+    return STBIW_CRC32(buffer, len);
+#else
+   static unsigned int crc_table[256] =
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
+   unsigned int running = ~0u;
+   unsigned char *cursor = buffer;
+   int remaining;
+
+   // fold one input byte per step: the table index is the byte xor'ed with
+   // the low 8 bits of the running value
+   for (remaining = len; remaining > 0; --remaining) {
+      unsigned int index = *cursor++ ^ (running & 0xff);
+      running = (running >> 8) ^ crc_table[index];
+   }
+   return ~running;
+#endif
+}
+
+// wpng4: store four bytes at o and advance o by 4 (o must be a modifiable pointer).
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
+// wp32: store v as 4 bytes, most significant first. NOTE: the expansion already
+// ends in ';', so `stbiw__wp32(p, v);` yields an extra empty statement.
+#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
+// wptag: store the first four characters of s (a chunk type tag).
+#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
+
+// Append the CRC of the chunk just written. *data points one past a chunk's
+// len payload bytes; the CRC covers those bytes plus the 4 tag bytes before
+// them. The 4 CRC bytes are stored at *data and *data is advanced past them.
+static void stbiw__wpcrc(unsigned char **data, int len)
+{
+   unsigned char *tag_start = *data - len - 4;
+   unsigned int crc = stbiw__crc32(tag_start, len+4);
+   stbiw__wp32(*data, crc);
+}
+
+// Paeth predictor: of a, b and c, return the one closest to a + b - c.
+// Ties are resolved in the order a, then b, then c.
+static unsigned char stbiw__paeth(int a, int b, int c)
+{
+   int estimate = a + b - c;
+   int dist_a = abs(estimate-a);
+   int dist_b = abs(estimate-b);
+   int dist_c = abs(estimate-c);
+
+   if (dist_a <= dist_b && dist_a <= dist_c)
+      return STBIW_UCHAR(a);
+   if (dist_b <= dist_c)
+      return STBIW_UCHAR(b);
+   return STBIW_UCHAR(c);
+}
+
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+// Apply one PNG row filter to output row y and store the filtered bytes in
+// line_buffer (width*n bytes).
+//   pixels       - start of the image; rows are stride_bytes apart
+//   width,height - image size in pixels; n is bytes per pixel
+//   filter_type  - 0..4, used to index the mapping tables below
+// When stbi__flip_vertically_on_write is set, output row y is read from source
+// row height-1-y and the "previous row" is the one after it in memory.
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
+{
+   // On the first output row there is no previous row, so the filters that
+   // need one are remapped: 2 -> 0 (plain copy), 3 -> 5 and 4 -> 6, where 5
+   // and 6 are the same formulas with the previous row taken as zero.
+   static int mapping[] = { 0,1,2,3,4 };
+   static int firstmap[] = { 0,1,0,5,6 };
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
+   // z[i-signed_stride] is the byte at the same column in the previously emitted row
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
+
+   if (type==0) {
+      memcpy(line_buffer, z, width*n);
+      return;
+   }
+
+   // first loop isn't optimized since it's just one pixel
+   // (the first pixel has no left neighbour, so z[i-n] terms are taken as zero)
+   for (i = 0; i < n; ++i) {
+      switch (type) {
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   // remaining bytes: z[i-n] is the byte one pixel to the left
+   switch (type) {
+      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
+      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
+      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
+      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+   }
+}
+
+// Encode an x-by-y image with n 8-bit channels per pixel as a complete PNG
+// file held in memory.
+//   pixels       - first byte of the image; rows are stride_bytes apart
+//   stride_bytes - distance in bytes between rows; 0 means tightly packed (x*n)
+//   n            - channels per pixel, 1..4 (grey, grey+alpha, RGB, RGBA)
+//   out_len      - receives the size in bytes of the returned buffer
+// Each row is filtered with stbi_write_force_png_filter when that is 0..4,
+// otherwise with whichever of the five filters gives the smallest sum of
+// absolute filtered values. The filtered rows are compressed with
+// stbi_zlib_compress at stbi_write_png_compression_level.
+// Returns a buffer obtained from STBIW_MALLOC (release it with STBIW_FREE),
+// or 0 when n is outside 1..4 or an allocation fails.
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+{
+   int force_filter = stbi_write_force_png_filter;
+   // PNG colour type for each channel count; index 0 is not a valid count
+   int ctype[5] = { -1, 0, 4, 2, 6 };
+   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
+   unsigned char *out,*o, *filt, *zlib;
+   signed char *line_buffer;
+   int j,zlen;
+
+   // ctype[] only has entries for 1..4 channels; anything else would read
+   // outside the table or produce an invalid colour type
+   if (n < 1 || n > 4)
+      return 0;
+
+   if (stride_bytes == 0)
+      stride_bytes = x * n;
+
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+
+   // filt holds every row as 1 filter-type byte followed by x*n filtered bytes
+   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
+   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
+   for (j=0; j < y; ++j) {
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
+               est += abs((signed char) line_buffer[i]);
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
+         }
+      }
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
+      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
+   }
+   STBIW_FREE(line_buffer);
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
+   STBIW_FREE(filt);
+   if (!zlib) return 0;
+
+   // each tag requires 12 bytes of overhead
+   // total = 8 signature + IHDR (12+13) + IDAT (12+zlen) + IEND (12)
+   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
+   if (!out) {
+      // the compressed stream is owned here; release it before bailing out
+      STBIW_FREE(zlib);
+      return 0;
+   }
+   *out_len = 8 + 12+13 + 12+zlen + 12;
+
+   o=out;
+   STBIW_MEMMOVE(o,sig,8); o+= 8;
+   stbiw__wp32(o, 13); // header length
+   stbiw__wptag(o, "IHDR");
+   stbiw__wp32(o, x);
+   stbiw__wp32(o, y);
+   *o++ = 8;                       // bit depth
+   *o++ = STBIW_UCHAR(ctype[n]);   // colour type
+   *o++ = 0;                       // compression method
+   *o++ = 0;                       // filter method
+   *o++ = 0;                       // interlace method
+   stbiw__wpcrc(&o,13);
+
+   stbiw__wp32(o, zlen);
+   stbiw__wptag(o, "IDAT");
+   STBIW_MEMMOVE(o, zlib, zlen);
+   o += zlen;
+   STBIW_FREE(zlib);
+   stbiw__wpcrc(&o, zlen);
+
+   stbiw__wp32(o,0);
+   stbiw__wptag(o, "IEND");
+   stbiw__wpcrc(&o,0);
+
+   STBIW_ASSERT(o == out + *out_len);
+
+   return out;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+// Encode the image as PNG with stbi_write_png_to_mem and write it to filename.
+//   x, y         - image size in pixels
+//   comp         - channels per pixel
+//   data         - pixel data, rows stride_bytes apart (0 = tightly packed)
+// Returns 1 on success; 0 if encoding fails, the file cannot be opened, or
+// the data cannot be completely written and the file closed cleanly.
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   FILE *f;
+   int len;
+   int ok;
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;
+
+   f = stbiw__fopen(filename, "wb");
+   if (!f) { STBIW_FREE(png); return 0; }
+   // a short write (disk full, I/O error) must not be reported as success
+   ok = (fwrite(png, 1, len, f) == (size_t) len);
+   // buffered data is flushed on close, so a close failure is a write failure too
+   if (fclose(f) != 0)
+      ok = 0;
+   STBIW_FREE(png);
+   return ok;
+}
+#endif
+
+// Encode the image as PNG with stbi_write_png_to_mem and pass the finished
+// buffer to func (with its context) in a single call.
+// Returns 1 on success, 0 if encoding produced no buffer.
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   int encoded_len;
+   unsigned char *encoded;
+
+   encoded = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &encoded_len);
+   if (!encoded)
+      return 0;
+
+   func(context, encoded, encoded_len);
+   STBIW_FREE(encoded);
+   return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+// Indexed by a coefficient's row-major position (y*8+x) within an 8x8 block;
+// the value is the slot that coefficient occupies in the reordered DU[] array
+// filled by stbiw__jpg_processDU.
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
+
+// Append a code to the JPEG bit stream. bs[0] is the code value and bs[1] its
+// length in bits. *bitBufP / *bitCntP hold the pending bits and their count,
+// and are updated on return. Complete bytes are written with stbiw__putc, and
+// every 0xFF byte is followed by a 0 byte.
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   int pending = *bitCntP + bs[1];
+   // pending bits are kept left-aligned within the low 24 bits of the accumulator
+   int accum = *bitBufP | (bs[0] << (24 - pending));
+
+   for (; pending >= 8; pending -= 8, accum <<= 8) {
+      unsigned char out_byte = (accum >> 16) & 255;
+      stbiw__putc(s, out_byte);
+      if (out_byte == 255) {
+         stbiw__putc(s, 0);
+      }
+   }
+
+   *bitBufP = accum;
+   *bitCntP = pending;
+}
+
+// In-place 8-point forward DCT butterfly on the eight values addressed by
+// d0p..d7p. All inputs are read before any output is stored.
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+   float in0 = *d0p, in1 = *d1p, in2 = *d2p, in3 = *d3p;
+   float in4 = *d4p, in5 = *d5p, in6 = *d6p, in7 = *d7p;
+   float out0, out2, out4, out6;
+   float even0, even1, even2, even3;
+   float odd0, odd1, odd2;
+   float z1, z2, z3, z4, z5, z11, z13;
+
+   // sums and differences of mirrored input pairs
+   float sum07 = in0 + in7;
+   float diff07 = in0 - in7;
+   float sum16 = in1 + in6;
+   float diff16 = in1 - in6;
+   float sum25 = in2 + in5;
+   float diff25 = in2 - in5;
+   float sum34 = in3 + in4;
+   float diff34 = in3 - in4;
+
+   // Even part
+   even0 = sum07 + sum34;   // phase 2
+   even3 = sum07 - sum34;
+   even1 = sum16 + sum25;
+   even2 = sum16 - sum25;
+
+   out0 = even0 + even1;       // phase 3
+   out4 = even0 - even1;
+
+   z1 = (even2 + even3) * 0.707106781f; // c4
+   out2 = even3 + z1;       // phase 5
+   out6 = even3 - z1;
+
+   // Odd part
+   odd0 = diff34 + diff25;       // phase 2
+   odd1 = diff25 + diff16;
+   odd2 = diff16 + diff07;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (odd0 - odd2) * 0.382683433f; // c6
+   z2 = odd0 * 0.541196100f + z5; // c2-c6
+   z4 = odd2 * 1.306562965f + z5; // c2+c6
+   z3 = odd1 * 0.707106781f; // c4
+
+   z11 = diff07 + z3;      // phase 5
+   z13 = diff07 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = out0;
+   *d2p = out2;
+   *d4p = out4;
+   *d6p = out6;
+}
+
+// Compute the bit pattern used to encode coefficient val.
+//   bits[1] receives the number of bits needed for |val| (at least 1)
+//   bits[0] receives the low bits[1] bits of val, or of val-1 when val < 0
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   int magnitude = val < 0 ? -val : val;
+   int nbits = 1;
+
+   // negative values are stored as (val - 1) masked to nbits
+   if (val < 0)
+      val = val-1;
+
+   // one extra bit for every halving that leaves the magnitude non-zero
+   for (magnitude >>= 1; magnitude != 0; magnitude >>= 1)
+      ++nbits;
+
+   bits[1] = (unsigned short) nbits;
+   bits[0] = (unsigned short) (val & ((1<<nbits)-1));
+}
+
+// Transform, quantize and entropy-code one 8x8 data unit.
+//   s, bitBuf, bitCnt - output context and pending-bit state for stbiw__jpg_writeBits
+//   CDU       - the 8x8 samples, rows du_stride floats apart; overwritten by the DCT
+//   fdtbl     - 64 per-coefficient scale factors in row-major order
+//   DC        - DC value of the previous data unit (the difference is coded)
+//   HTDC/HTAC - code tables for the DC and AC symbols, as {code, length} pairs
+// Returns this unit's quantized DC value, to be passed as DC for the next unit.
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   // HTAC[0x00] is sent to end a block early, HTAC[0xF0] for each run of 16 zeros
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
+   int dataOff, i, j, n, diff, end0pos, x, y;
+   int DU[64];
+
+   // DCT rows
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
+                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;
+         v = CDU[i]*fdtbl[j];
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         // (the int cast truncates toward zero, so adding +/-0.5 first rounds to nearest)
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+      }
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      // size category code first, then the value bits themselves
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) {
+      // every AC coefficient is zero
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      // skip the zero run; DU[end0pos] is non-zero, so i stops at or before end0pos
+      for (; DU[i]==0 && i<=end0pos; ++i) {
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         // one M16zeroes marker per full group of 16 zeros, remainder goes in the symbol
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      // symbol = (zero run << 4) + bit size, followed by the value bits
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // no end-of-block marker is needed when the last coefficient is non-zero
+   if(end0pos != 63) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k, subsample;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+
+   if(!data || !width || !height || comp > 4 || comp < 1) {
+      return 0;
+   }
+
+   quality = quality ? quality : 90;
+   subsample = quality <= 90 ? 1 : 0;
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
+
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      int DCY=0, DCU=0, DCV=0;
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      const unsigned char *dataR = (const unsigned char *)data;
+      const unsigned char *dataG = dataR + ofsG;
+      const unsigned char *dataB = dataR + ofsB;
+      int x, y, pos;
+      if(subsample) {
+         for(y = 0; y < height; y += 16) {
+            for(x = 0; x < width; x += 16) {
+               float Y[256], U[256], V[256];
+               for(row = y, pos = 0; row < y+16; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+16; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+
+               // subsample U,V
+               {
+                  float subU[64], subV[64];
+                  int yy, xx;
+                  for(yy = 0, pos = 0; yy < 8; ++yy) {
+                     for(xx = 0; xx < 8; ++xx, ++pos) {
+                        int j = yy*32+xx*2;
+                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
+                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
+                     }
+                  }
+                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+               }
+            }
+         }
+      } else {
+         for(y = 0; y < height; y += 8) {
+            for(x = 0; x < width; x += 8) {
+               float Y[64], U[64], V[64];
+               for(row = y, pos = 0; row < y+8; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+8; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
+               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+            }
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{ // Callback-based JPEG writer: encodes the x-by-y image in `data` (comp channels per pixel); `func`/`context` are registered as the output sink.
+   stbi__write_context s = { 0 };
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_jpg_core(&s, x, y, comp, data, quality); // data is passed uncast so its const qualifier is kept, as stbi_write_jpg does; result is the core encoder's return value
+}
+
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{ // File-based JPEG writer: sets up a file write context for `filename`, runs the core encoder on `data`, then ends the file write.
+   stbi__write_context ctx = { 0 };
+   int status;
+   if (!stbi__start_write_file(&ctx,filename))
+      return 0; // the file writer could not be started, so nothing is encoded
+   status = stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+   stbi__end_write_file(&ctx);
+   return status; // whatever the core encoder reported
+}
+#endif
+
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
+
+/* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
+      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+      1.13
+      1.12
+      1.11  (2019-08-11)
+
+      1.10  (2019-02-07)
+             support utf8 filenames in Windows; fix warnings and platform ifdefs
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
+      0.98 (2015-04-08)
+             added STBIW_MALLOC, STBIW_ASSERT etc
+      0.97 (2015-01-18)
+             fixed HDR asserts, rewrote HDR rle logic
+      0.96 (2015-01-17)
+             add HDR output
+             fix monochrome BMP
+      0.95 (2014-08-17)
+             add monochrome TGA output
+      0.94 (2014-05-31)
+             rename private functions to avoid conflicts with stb_image.h
+      0.93 (2014-05-27)
+             warning fixes
+      0.92 (2010-08-01)
+             casts to unsigned char to fix warnings
+      0.91 (2010-07-17)
+             first public release
+      0.90   first internal release
+*/
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/darknet-master/CMakeLists.txt b/darknet-master/CMakeLists.txt
new file mode 100644
index 0000000..4710e39
--- /dev/null
+++ b/darknet-master/CMakeLists.txt
@@ -0,0 +1,721 @@
+cmake_minimum_required(VERSION 3.19)  # string(JSON ...), used further down to read vcpkg.json, is available from CMake 3.19
+include(CMakeDependentOption)  # provides cmake_dependent_option()
+# User-configurable switches; every default below can be overridden with -D<NAME>=<value> at configure time.
+option(CMAKE_VERBOSE_MAKEFILE "Create verbose makefile" OFF)
+option(CUDA_VERBOSE_BUILD "Create verbose CUDA build" OFF)
+option(BUILD_SHARED_LIBS "Create dark as a shared library" ON)
+option(BUILD_AS_CPP "Build Darknet using C++ compiler also for C files" OFF)
+option(BUILD_USELIB_TRACK "Build uselib_track" ON)  # may be forced OFF later when OpenCV lacks CUDA support
+option(MANUALLY_EXPORT_TRACK_OPTFLOW "Manually export the TRACK_OPTFLOW=1 define" OFF)
+option(ENABLE_OPENCV "Enable OpenCV integration" ON)
+option(ENABLE_CUDA "Enable CUDA support" ON)
+cmake_dependent_option(ENABLE_CUDA_OPENGL_INTEGRATION "Build darknet with support for running networks straight from OpenGL textures" ON "ENABLE_CUDA" OFF)  # offered (default ON) only while ENABLE_CUDA is true, otherwise forced OFF
+option(ENABLE_CUDNN "Enable CUDNN" ON)
+option(ENABLE_CUDNN_HALF "Enable CUDNN Half precision" ON)  # overwritten (FORCE) by the CUDA architecture check when CUDA is enabled
+option(ENABLE_ZED_CAMERA "Enable ZED Camera support" ON)
+option(ENABLE_VCPKG_INTEGRATION "Enable VCPKG integration" OFF)
+option(ENABLE_DEPLOY_CUSTOM_CMAKE_MODULES "Copy custom CMake modules for downstream integration" OFF)
+option(ENABLE_CSHARP_WRAPPER "Enable building a csharp wrapper" OFF)
+option(ENABLE_INSTALLER "Enable building an installer" OFF)  # also switches the install directories between the source tree and bin/lib
+option(VCPKG_BUILD_OPENCV_WITH_CUDA "Build OpenCV with CUDA extension integration" ON)
+option(VCPKG_USE_OPENCV2 "Use legacy OpenCV 2" OFF)  # the three VCPKG_USE_OPENCVn switches are reconciled right after this option list
+option(VCPKG_USE_OPENCV3 "Use legacy OpenCV 3" OFF)
+option(VCPKG_USE_OPENCV4 "Use OpenCV 4" ON)
+option(USE_NSIS "Use NSIS as a CPack backend on Windows" ON)
+
+if(DEFINED ENV{VCPKG_DEFAULT_TRIPLET})  # let the environment choose the vcpkg target triplet
+  message(STATUS "Setting default vcpkg target triplet to $ENV{VCPKG_DEFAULT_TRIPLET}")
+  set(VCPKG_TARGET_TRIPLET $ENV{VCPKG_DEFAULT_TRIPLET})
+endif()
+# Keep at most one VCPKG_USE_OPENCVn switch enabled: a legacy request (2 or 3) turns OpenCV 4 off, and 3 wins over 2.
+if(VCPKG_USE_OPENCV4 AND VCPKG_USE_OPENCV2)
+  message(STATUS "You required vcpkg feature related to OpenCV 2 but forgot to turn off those for OpenCV 4, doing that for you")
+  set(VCPKG_USE_OPENCV4 OFF CACHE BOOL "Use OpenCV 4" FORCE)  # FORCE rewrites the cache entry created by option()
+endif()
+if(VCPKG_USE_OPENCV4 AND VCPKG_USE_OPENCV3)
+  message(STATUS "You required vcpkg feature related to OpenCV 3 but forgot to turn off those for OpenCV 4, doing that for you")
+  set(VCPKG_USE_OPENCV4 OFF CACHE BOOL "Use OpenCV 4" FORCE)
+endif()
+if(VCPKG_USE_OPENCV2 AND VCPKG_USE_OPENCV3)
+  message(STATUS "You required vcpkg features related to both OpenCV 2 and OpenCV 3. Impossible to satisfy, keeping only OpenCV 3")
+  set(VCPKG_USE_OPENCV2 OFF CACHE BOOL "Use legacy OpenCV 2" FORCE)
+endif()
+
+if(ENABLE_CUDA AND NOT APPLE)  # build up VCPKG_MANIFEST_FEATURES from the enabled options; CUDA features are never requested on Apple hosts
+  list(APPEND VCPKG_MANIFEST_FEATURES "cuda")
+endif()
+if(ENABLE_CUDNN AND ENABLE_CUDA AND NOT APPLE)  # cuDNN is only requested together with CUDA
+  list(APPEND VCPKG_MANIFEST_FEATURES "cudnn")
+endif()
+if(ENABLE_OPENCV)
+  if(VCPKG_BUILD_OPENCV_WITH_CUDA AND NOT APPLE)  # CUDA-enabled OpenCV feature matching the selected major version
+    if(VCPKG_USE_OPENCV4)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv-cuda")
+    elseif(VCPKG_USE_OPENCV3)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv3-cuda")
+    elseif(VCPKG_USE_OPENCV2)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv2-cuda")
+    endif()
+  else()  # plain (non-CUDA) OpenCV feature matching the selected major version
+    if(VCPKG_USE_OPENCV4)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv-base")
+    elseif(VCPKG_USE_OPENCV3)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv3-base")
+    elseif(VCPKG_USE_OPENCV2)
+      list(APPEND VCPKG_MANIFEST_FEATURES "opencv2-base")
+    endif()
+  endif()
+endif()
+
+if(NOT CMAKE_HOST_SYSTEM_PROCESSOR AND NOT WIN32)  # variable still empty at this point: ask `uname -m` instead (skipped on Windows)
+  execute_process(COMMAND "uname" "-m" OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86")  # substring match: true for e.g. "x86_64"; names such as "AMD64" or "i686" do not match
+  set(IS_X86 TRUE)
+else()
+  set(IS_X86 FALSE)  # also the result when the processor name is still empty
+endif()
+
+if(ENABLE_VCPKG_INTEGRATION AND DEFINED ENV{VCPKG_ROOT} AND NOT DEFINED CMAKE_TOOLCHAIN_FILE)  # case 1: vcpkg requested, VCPKG_ROOT set, no toolchain given
+  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)  # option() honours normal variables in projects that do not set CMP0077 themselves
+  set(X_VCPKG_APPLOCAL_DEPS_INSTALL ON)
+  set(CMAKE_TOOLCHAIN_FILE "$ENV{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" CACHE STRING "")
+  #set(_VCPKG_INSTALLED_DIR ${CMAKE_CURRENT_LIST_DIR}/vcpkg CACHE STRING "")  #folder for manifest-installed dependencies
+  message(STATUS "VCPKG found: $ENV{VCPKG_ROOT}")
+  message(STATUS "Using VCPKG integration")
+  set(USE_INTEGRATED_LIBS "FALSE" CACHE BOOL "Use libs distributed with this repo")  # no FORCE: an existing cache value is kept
+  if(VCPKG_MANIFEST_FEATURES)
+    message(STATUS "VCPKG_MANIFEST_FEATURES: ${VCPKG_MANIFEST_FEATURES}")
+  endif()
+elseif(DEFINED CMAKE_TOOLCHAIN_FILE)  # case 2: the user supplied a toolchain file (possibly vcpkg's own)
+  message(STATUS "Using toolchain: ${CMAKE_TOOLCHAIN_FILE}")
+  if(CMAKE_TOOLCHAIN_FILE MATCHES "vcpkg.cmake")
+    message(STATUS "Toolchain uses VCPKG integration")
+    if(VCPKG_MANIFEST_FEATURES)
+      message(STATUS "VCPKG_MANIFEST_FEATURES: ${VCPKG_MANIFEST_FEATURES}")
+    endif()
+  endif()
+  set(USE_INTEGRATED_LIBS "FALSE" CACHE BOOL "Use libs distributed with this repo")
+elseif(WIN32)  # case 3: no vcpkg and no toolchain on Windows, so default to the libraries bundled in the repository
+  message(STATUS "vcpkg not found, toolchain not defined, using integrated libs on win32")
+  set(USE_INTEGRATED_LIBS "TRUE" CACHE BOOL "Use libs distributed with this repo")
+else()  # case 4: same default elsewhere, but with a warning that the build may fail
+  message(WARNING "vcpkg not found, toolchain not defined, system not win32 so build might fail")
+  set(USE_INTEGRATED_LIBS "TRUE" CACHE BOOL "Use libs distributed with this repo")
+endif()
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json")  # prefer the version recorded in the vcpkg manifest
+  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json" VCPKG_JSON_STRING)
+  string(JSON VERSION_STRING GET "${VCPKG_JSON_STRING}" version)  # quoted: the manifest text must reach string(JSON) as a single argument
+else()
+  set(VERSION_STRING "0.2.5.4")  # fallback version when no manifest is present
+endif()
+
+string(REPLACE "." ";" VERSION_LIST "${VERSION_STRING}")  # "a.b.c.d" -> list a;b;c;d; quoted so an empty value reaches the length check below
+list(LENGTH VERSION_LIST VERSION_LIST_LENGTH)
+if(VERSION_LIST_LENGTH LESS 3)
+  message(FATAL_ERROR "Darknet needs at least major.minor.patch version numbers to properly configure")
+endif()
+list(GET VERSION_LIST 0 Darknet_MAJOR_VERSION)
+list(GET VERSION_LIST 1 Darknet_MINOR_VERSION)
+list(GET VERSION_LIST 2 Darknet_PATCH_VERSION)
+if(VERSION_LIST_LENGTH GREATER 3)
+  list(GET VERSION_LIST 3 Darknet_TWEAK_VERSION)
+else()
+  set(Darknet_TWEAK_VERSION 0)  # a three-component version gets a zero tweak component
+endif()
+
+set(Darknet_VERSION ${Darknet_MAJOR_VERSION}.${Darknet_MINOR_VERSION}.${Darknet_PATCH_VERSION}.${Darknet_TWEAK_VERSION})
+message("Darknet_VERSION: ${Darknet_VERSION}")
+
+project(Darknet VERSION ${Darknet_VERSION})  # always receives a four-component version
+
+enable_language(C)
+enable_language(CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+if(USE_INTEGRATED_LIBS)  # put the repository's own find modules first on the module search path
+  set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH})
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_C_COMPILER_ID}" MATCHES "Clang" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")  # GCC, or Clang as the C or C++ compiler
+  set(CMAKE_COMPILER_IS_GNUCC_OR_CLANG TRUE)
+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "clang")  # only the C++ compiler id decides CMAKE_COMPILER_IS_CLANG
+    set(CMAKE_COMPILER_IS_CLANG TRUE)
+  else()
+    set(CMAKE_COMPILER_IS_CLANG FALSE)
+  endif()
+else()
+  set(CMAKE_COMPILER_IS_GNUCC_OR_CLANG FALSE)
+  set(CMAKE_COMPILER_IS_CLANG FALSE)
+endif()
+
+cmake_dependent_option(ENABLE_SSE_AND_AVX_FLAGS "Enable AVX and SSE optimizations (x86-only)" ON "CMAKE_COMPILER_IS_GNUCC_OR_CLANG;IS_X86" OFF)  # offered only for GCC/Clang on a host detected as x86, otherwise forced OFF
+
+set(default_build_type "Release")
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)  # no build type given and no multi-configuration list: fall back to the default
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+if(NOT ENABLE_INSTALLER)  # without an installer, binaries and libraries are installed into the source directory itself
+  if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)  # only replace the prefix when the user did not choose one
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}" CACHE PATH "Install prefix" FORCE)
+  endif()
+  set(INSTALL_BIN_DIR      "${CMAKE_CURRENT_LIST_DIR}" CACHE PATH "Path where exe and dll will be installed")
+  set(INSTALL_LIB_DIR      "${CMAKE_CURRENT_LIST_DIR}" CACHE PATH "Path where lib will be installed")
+else()  # installer builds use the relative bin/ and lib/ directories instead
+  set(INSTALL_BIN_DIR      "bin"                       CACHE PATH "Path where exe and dll will be installed")
+  set(INSTALL_LIB_DIR      "lib"                       CACHE PATH "Path where lib will be installed")
+endif()
+
+set(INSTALL_INCLUDE_DIR  "include/darknet"           CACHE PATH "Path where headers will be installed")
+set(INSTALL_CMAKE_DIR    "share/darknet"             CACHE PATH "Path where cmake configs will be installed")
+
+find_library(MATH_LIBRARY m)  # not REQUIRED: MATH_LIBRARY is MATH_LIBRARY-NOTFOUND where no separate libm exists
+
+if(ENABLE_CUDA)  # CUDA is opt-out: a missing or too old toolkit stops the configuration instead of silently disabling it
+  include(CheckLanguage)
+  check_language(CUDA)  # sets CMAKE_CUDA_COMPILER when a CUDA compiler can be used
+  if(NOT CMAKE_CUDA_COMPILER)
+    message(STATUS "CUDA_PATH: $ENV{CUDA_PATH}")
+    message(STATUS "CUDACXX: $ENV{CUDACXX}")
+    message(FATAL_ERROR "CUDA not found, please build explicitly with -DENABLE_CUDA=OFF if you do not want CUDA.")
+  else()
+    enable_language(CUDA)
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "9.0")
+      message(STATUS "CUDA_PATH: $ENV{CUDA_PATH}")
+      message(STATUS "CUDACXX: $ENV{CUDACXX}")
+      message(FATAL_ERROR "Unsupported CUDA version, please upgrade to CUDA 9+ or disable CUDA with explicitly with -DENABLE_CUDA=OFF")
+    else()
+      if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.23.0" AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES all-major)
+      endif()
+      message(STATUS "Selected CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+      # Half precision is enabled when the architecture list contains "all", "all-major" or any
+      # entry of 70 or above. Entries may carry a letter suffix (e.g. 90a) and/or a -real/-virtual
+      # qualifier. Matching numerically instead of against a fixed set keeps architectures newer
+      # than 86 (87, 89, 90, ...) from being reported as unsupported. The list is wrapped in ';'
+      # so that every entry, including the first and the last, is delimited on both sides.
+      if(";${CMAKE_CUDA_ARCHITECTURES};" MATCHES
+         ";(all-major|all|([7-9][0-9]|[1-9][0-9][0-9])[a-z]?(-real|-virtual)?);")
+        set(ENABLE_CUDNN_HALF "TRUE" CACHE BOOL "Enable CUDNN Half precision" FORCE)
+        message(STATUS "Your setup supports half precision (CUDA_ARCHITECTURES >= 70)")
+      else()
+        set(ENABLE_CUDNN_HALF "FALSE" CACHE BOOL "Enable CUDNN Half precision" FORCE)
+        message(STATUS "Your setup does not support half precision (it requires CUDA_ARCHITECTURES >= 70)")
+      endif()
+    endif()
+    if(BUILD_SHARED_LIBS)  # choose the CUDA runtime flavour to match the kind of library being built
+      set(CMAKE_CUDA_RUNTIME_LIBRARY "Shared")
+    else()
+      set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
+    endif()
+  endif()
+endif()
+
+if(WIN32 AND ENABLE_CUDA AND CMAKE_MAKE_PROGRAM MATCHES "ninja")  # same option in both branches; only the default differs
+  option(SELECT_OPENCV_MODULES "Use only few selected OpenCV modules to circumvent 8192 char limit when using Ninja on Windows" ON)
+else()
+  option(SELECT_OPENCV_MODULES "Use only few selected OpenCV modules to circumvent 8192 char limit when using Ninja on Windows" OFF)
+endif()
+
+if(USE_INTEGRATED_LIBS)  # point the find modules at the copies shipped under 3rdparty/
+  set(PThreads4W_ROOT ${CMAKE_CURRENT_LIST_DIR}/3rdparty/pthreads CACHE PATH "Path where pthreads for windows can be located")
+  set(Stb_DIR ${CMAKE_CURRENT_LIST_DIR}/3rdparty/stb CACHE PATH "Path where Stb image library can be located")
+endif()
+
+set(CMAKE_DEBUG_POSTFIX d)  # debug artifacts get a "d" suffix
+set(CMAKE_THREAD_PREFER_PTHREAD ON)  # hint for find_package(Threads)
+find_package(Threads REQUIRED)
+if(MSVC)  # MSVC builds additionally require PThreads4W
+  find_package(PThreads4W REQUIRED)
+endif()
+if(ENABLE_OPENCV)
+  find_package(OpenCV REQUIRED)
+  if(OpenCV_FOUND)
+    if(SELECT_OPENCV_MODULES)  # link a short explicit module list instead of everything in OpenCV_LIBS
+      if(TARGET opencv_world)  # a monolithic opencv_world target replaces the individual modules
+        list(APPEND OpenCV_LINKED_COMPONENTS "opencv_world")
+      else()  # otherwise add each of the modules below only if its target exists
+        if(TARGET opencv_core)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_core")
+        endif()
+        if(TARGET opencv_highgui)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_highgui")
+        endif()
+        if(TARGET opencv_imgproc)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_imgproc")
+        endif()
+        if(TARGET opencv_video)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_video")
+        endif()
+        if(TARGET opencv_videoio)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_videoio")
+        endif()
+        if(TARGET opencv_imgcodecs)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_imgcodecs")
+        endif()
+        if(TARGET opencv_text)
+          list(APPEND OpenCV_LINKED_COMPONENTS "opencv_text")
+        endif()
+      endif()
+    else()  # default: link everything listed in OpenCV_LIBS
+      list(APPEND OpenCV_LINKED_COMPONENTS ${OpenCV_LIBS})
+    endif()
+  endif()
+endif()
+find_package(Stb REQUIRED)
+find_package(OpenMP)  # optional: no REQUIRED, so configuration continues without OpenMP
+
+if(APPLE AND NOT OPENMP_FOUND)  # informational hint only
+  message(STATUS "  ->  To enable OpenMP on macOS, please install libomp from Homebrew")
+endif()
+
+set(ADDITIONAL_CXX_FLAGS "-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -Wno-deprecated-declarations -Wno-write-strings")  # GCC/Clang-style defaults, replaced below for MSVC
+set(ADDITIONAL_C_FLAGS "-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -Wno-deprecated-declarations -Wno-write-strings")
+if(UNIX AND BUILD_SHARED_LIBS AND NOT CMAKE_COMPILER_IS_CLANG)  # extra linker option for shared-library builds on UNIX, skipped for Clang
+  set(SHAREDLIB_CXX_FLAGS "-Wl,-Bsymbolic")
+  set(SHAREDLIB_C_FLAGS "-Wl,-Bsymbolic")
+endif()
+
+if(MSVC)  # MSVC: replace the warning set entirely and turn /O2 into /Ox for Release
+  set(ADDITIONAL_CXX_FLAGS " /nologo /wd4013 /wd4018 /wd4028 /wd4047 /wd4068 /wd4090 /wd4101 /wd4113 /wd4133 /wd4190 /wd4244 /wd4267 /wd4305 /wd4477 /wd4996 /wd4819 /fp:fast")
+  set(ADDITIONAL_C_FLAGS " /nologo /wd4013 /wd4018 /wd4028 /wd4047 /wd4068 /wd4090 /wd4101 /wd4113 /wd4133 /wd4190 /wd4244 /wd4267 /wd4305 /wd4477 /wd4996 /wd4819 /fp:fast")
+  string(REGEX REPLACE "/O2" "/Ox" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")  # input quoted: an empty flag variable is still one (empty) input instead of a missing argument
+  string(REGEX REPLACE "/O2" "/Ox" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCC_OR_CLANG)
+  if(CMAKE_COMPILER_IS_CLANG)
+    if(UNIX AND NOT APPLE)
+      set(CMAKE_CXX_FLAGS "-pthread ${CMAKE_CXX_FLAGS}")  # force -pthread to avoid bugs in some cmake setups
+      set(CMAKE_C_FLAGS "-pthread ${CMAKE_C_FLAGS}")
+    endif()
+  endif()
+  string(REGEX REPLACE "-O0" "-Og" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")  # Debug: -O0 becomes -Og; inputs quoted so an empty variable does not break string()
+  string(REGEX REPLACE "-O3" "-Ofast" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")  # Release: -O3 becomes -Ofast
+  string(REGEX REPLACE "-O0" "-Og" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+  string(REGEX REPLACE "-O3" "-Ofast" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+  if(ENABLE_SSE_AND_AVX_FLAGS)  # SIMD flags are added to Release builds only
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a")
+  endif()
+endif()
+
+set(CMAKE_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} ${SHAREDLIB_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")  # project flags go in front of any pre-existing flags
+set(CMAKE_C_FLAGS "${ADDITIONAL_C_FLAGS} ${SHAREDLIB_C_FLAGS} ${CMAKE_C_FLAGS}")
+
+if(OpenCV_FOUND)
+  if(ENABLE_CUDA AND OpenCV_CUDA_VERSION)
+    if(TARGET opencv_cudaoptflow)
+      list(APPEND OpenCV_LINKED_COMPONENTS "opencv_cudaoptflow")
+    endif()
+    if(TARGET opencv_cudaimgproc)
+      list(APPEND OpenCV_LINKED_COMPONENTS "opencv_cudaimgproc")
+    endif()
+  elseif(ENABLE_CUDA AND NOT OpenCV_CUDA_VERSION)
+    set(BUILD_USELIB_TRACK "FALSE" CACHE BOOL "Build uselib_track" FORCE)
+    message(STATUS "  ->  darknet is fine for now, but uselib_track has been disabled!")
+    message(STATUS "  ->  Please rebuild OpenCV from sources with CUDA support to enable it")
+  else()
+    set(BUILD_USELIB_TRACK "FALSE" CACHE BOOL "Build uselib_track" FORCE)
+  endif()
+endif()
+
+if(ENABLE_CUDA AND ENABLE_CUDNN)
+  find_package(CUDNN REQUIRED)
+endif()
+
+if(ENABLE_CUDA)
+  if(MSVC)
+    set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DGPU")
+
+    if(ENABLE_CUDA_OPENGL_INTEGRATION)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDA_OPENGL_INTEGRATION")
+    endif()
+
+    if(CUDNN_FOUND)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDNN")
+    endif()
+    if(OpenCV_FOUND)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DOPENCV")
+    endif()
+    string(REPLACE " " "," ADDITIONAL_CXX_FLAGS_COMMA_SEPARATED "${ADDITIONAL_CXX_FLAGS}")
+    set(CUDA_HOST_COMPILER_FLAGS "-Wno-deprecated-declarations -Xcompiler=\"${ADDITIONAL_CXX_FLAGS_COMMA_SEPARATED}\"")
+  else()
+    set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DGPU")
+
+    if(ENABLE_CUDA_OPENGL_INTEGRATION)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDA_OPENGL_INTEGRATION")
+    endif()
+
+    if(CUDNN_FOUND)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDNN")
+    endif()
+    if(OpenCV_FOUND)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DOPENCV")
+    endif()
+    if(APPLE)
+        set(CUDA_HOST_COMPILER_FLAGS "--compiler-options \" ${ADDITIONAL_CXX_FLAGS} -fPIC -Xpreprocessor -fopenmp -Ofast \"")
+    else()
+        set(CUDA_HOST_COMPILER_FLAGS "--compiler-options \" ${ADDITIONAL_CXX_FLAGS} -fPIC -fopenmp -Ofast \"")
+    endif()
+  endif()
+
+  string (REPLACE ";" " " CUDA_ARCH_FLAGS_SPACE_SEPARATED "${CUDA_ARCH_FLAGS}")
+  set(CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACE_SEPARATED} ${CUDA_HOST_COMPILER_FLAGS} ${CMAKE_CUDA_FLAGS}")
+  message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
+endif()
+
+if((NOT ENABLE_CUDA) OR (NOT ENABLE_ZED_CAMERA))  # the ZED SDK is only looked up when CUDA and the ZED option are both on
+  if(ENABLE_ZED_CAMERA)
+    message(STATUS "ZED SDK not enabled, since it requires CUDA")
+  endif()
+  set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
+else()
+  find_package(ZED 2 QUIET)
+  if(NOT ZED_FOUND)  # requested but not found: report it and switch the cached option off
+    message(STATUS "ZED SDK not found")
+    set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
+  else()
+    include_directories(${ZED_INCLUDE_DIRS})
+    link_directories(${ZED_LIBRARY_DIR})
+    message(STATUS "ZED SDK enabled")
+  endif()
+endif()
+
+# Derive an absolute FULLPATH_INSTALL_<p>_DIR from every INSTALL_<p>_DIR.
+foreach(p LIB BIN INCLUDE CMAKE)
+  set(var INSTALL_${p}_DIR)
+  # Relative paths are anchored at CMAKE_INSTALL_PREFIX; absolute paths are kept, so FULLPATH_* is defined in every case.
+  get_filename_component(FULLPATH_${var} "${${var}}" ABSOLUTE BASE_DIR "${CMAKE_INSTALL_PREFIX}")
+endforeach()
+
+configure_file(
+  "${CMAKE_CURRENT_LIST_DIR}/src/version.h.in"
+  "${CMAKE_CURRENT_LIST_DIR}/src/version.h"
+)
+
+#look for all *.h files in src folder
+file(GLOB headers "${CMAKE_CURRENT_LIST_DIR}/src/*.h")
+#add also files in the include folder
+list(APPEND headers
+  ${CMAKE_CURRENT_LIST_DIR}/include/darknet.h
+)
+#remove windows only files
+if(NOT MSVC)
+  list(REMOVE_ITEM headers
+    ${CMAKE_CURRENT_LIST_DIR}/src/gettimeofday.h
+  )
+endif()
+#set(exported_headers ${headers})
+
+#look for all *.c files in src folder
+file(GLOB sources "${CMAKE_CURRENT_LIST_DIR}/src/*.c")
+#add also .cpp files
+list(APPEND sources
+  ${CMAKE_CURRENT_LIST_DIR}/src/http_stream.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/src/image_opencv.cpp
+)
+#remove darknet.c file which is necessary only for the executable, not for the lib
+list(REMOVE_ITEM sources
+  ${CMAKE_CURRENT_LIST_DIR}/src/darknet.c
+)
+#remove windows only files
+if(NOT MSVC)
+  list(REMOVE_ITEM sources
+    ${CMAKE_CURRENT_LIST_DIR}/src/gettimeofday.c
+  )
+endif()
+
+if(USE_INTEGRATED_LIBS AND MSVC)
+  list(APPEND sources
+    ${CMAKE_CURRENT_LIST_DIR}/3rdparty/getopt/getopt.c
+    ${CMAKE_CURRENT_LIST_DIR}/3rdparty/getopt/getopt.h
+  )
+endif()
+if((NOT USE_INTEGRATED_LIBS) AND MSVC)
+  find_package(unofficial-getopt-win32 REQUIRED)
+endif()
+
+if(ENABLE_CUDA)
+  file(GLOB cuda_sources "${CMAKE_CURRENT_LIST_DIR}/src/*.cu")
+endif()
+
+if(BUILD_AS_CPP)
+  set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
+endif()
+
+add_library(dark ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
+set_target_properties(dark PROPERTIES POSITION_INDEPENDENT_CODE ON)
+if(ENABLE_CUDA)
+  set_target_properties(dark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
+if(BUILD_SHARED_LIBS)
+  target_compile_definitions(dark PRIVATE LIB_EXPORTS=1)
+endif()
+if(BUILD_AS_CPP)
+  set_target_properties(dark PROPERTIES LINKER_LANGUAGE CXX)
+endif()
+set_target_properties(dark PROPERTIES OUTPUT_NAME "darknet")
+
+if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER "3.0" AND BUILD_USELIB_TRACK)
+  add_executable(uselib_track ${CMAKE_CURRENT_LIST_DIR}/src/yolo_console_dll.cpp)
+endif()
+
+add_executable(uselib ${CMAKE_CURRENT_LIST_DIR}/src/yolo_console_dll.cpp)
+if(BUILD_AS_CPP)
+  set_target_properties(uselib PROPERTIES LINKER_LANGUAGE CXX)
+endif()
+
+add_executable(darknet ${CMAKE_CURRENT_LIST_DIR}/src/darknet.c ${sources} ${headers} ${cuda_sources})
+if(BUILD_AS_CPP)
+  set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/src/darknet.c PROPERTIES LANGUAGE CXX)
+  set_target_properties(darknet PROPERTIES LINKER_LANGUAGE CXX)
+endif()
+if(MSVC)
+  target_sources(darknet PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src/darknet.rc)
+endif()
+
+add_executable(kmeansiou ${CMAKE_CURRENT_LIST_DIR}/scripts/kmeansiou.c)
+if(MATH_LIBRARY)
+  target_link_libraries(kmeansiou PRIVATE ${MATH_LIBRARY})
+endif()
+
+target_include_directories(darknet PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include> $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/src> $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}> $<BUILD_INTERFACE:${Stb_INCLUDE_DIR}>)
+target_include_directories(dark PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include> $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/src> $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}> $<BUILD_INTERFACE:${Stb_INCLUDE_DIR}>)
+target_include_directories(uselib PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include> $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/src> $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}> $<BUILD_INTERFACE:${Stb_INCLUDE_DIR}>)
+
+target_compile_definitions(darknet PRIVATE -DUSE_CMAKE_LIBS)
+target_compile_definitions(dark PRIVATE -DUSE_CMAKE_LIBS)
+target_compile_definitions(uselib PRIVATE -DUSE_CMAKE_LIBS)
+
+if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER "3.0" AND BUILD_USELIB_TRACK AND NOT MANUALLY_EXPORT_TRACK_OPTFLOW)
+  target_compile_definitions(dark PUBLIC TRACK_OPTFLOW=1)
+endif()
+
+if(CUDNN_FOUND)
+  target_link_libraries(darknet PRIVATE CuDNN::CuDNN)
+  target_link_libraries(dark PRIVATE CuDNN::CuDNN)
+  target_compile_definitions(darknet PRIVATE -DCUDNN)
+  target_compile_definitions(dark PUBLIC -DCUDNN)
+  if(ENABLE_CUDNN_HALF)
+    target_compile_definitions(darknet PRIVATE -DCUDNN_HALF)
+    target_compile_definitions(dark PUBLIC -DCUDNN_HALF)
+  endif()
+endif()
+
+if(OpenCV_FOUND)
+  target_link_libraries(darknet PRIVATE ${OpenCV_LINKED_COMPONENTS})
+  target_link_libraries(uselib PRIVATE ${OpenCV_LINKED_COMPONENTS})
+  target_link_libraries(dark PUBLIC ${OpenCV_LINKED_COMPONENTS})
+  target_include_directories(dark PRIVATE ${OpenCV_INCLUDE_DIRS})
+  target_compile_definitions(darknet PRIVATE -DOPENCV)
+  target_compile_definitions(dark PUBLIC -DOPENCV)
+endif()
+
+if(OPENMP_FOUND)
+  target_link_libraries(darknet PRIVATE OpenMP::OpenMP_CXX)
+  target_link_libraries(darknet PRIVATE OpenMP::OpenMP_C)
+  target_link_libraries(dark PUBLIC OpenMP::OpenMP_CXX)
+  target_link_libraries(dark PUBLIC OpenMP::OpenMP_C)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCC AND MATH_LIBRARY)
+  target_link_libraries(darknet PRIVATE ${MATH_LIBRARY})
+  target_link_libraries(dark PUBLIC ${MATH_LIBRARY})
+endif()
+
+if(MSVC)
+  target_link_libraries(darknet PRIVATE PThreads4W::PThreads4W)
+  target_link_libraries(darknet PRIVATE wsock32)
+  target_link_libraries(dark PUBLIC PThreads4W::PThreads4W)
+  target_link_libraries(dark PUBLIC wsock32)
+  target_link_libraries(uselib PRIVATE PThreads4W::PThreads4W)
+  if(USE_INTEGRATED_LIBS)
+    target_include_directories(dark PRIVATE ${CMAKE_CURRENT_LIST_DIR}/3rdparty/getopt)
+    target_include_directories(darknet PRIVATE ${CMAKE_CURRENT_LIST_DIR}/3rdparty/getopt)
+  else()
+    target_link_libraries(dark PRIVATE unofficial::getopt-win32::getopt)
+    target_link_libraries(darknet PRIVATE unofficial::getopt-win32::getopt)
+  endif()
+  target_compile_definitions(darknet PRIVATE -D_CRT_RAND_S -DNOMINMAX -D_USE_MATH_DEFINES)
+  target_compile_definitions(dark PRIVATE -D_CRT_RAND_S -DNOMINMAX -D_USE_MATH_DEFINES)
+  target_compile_definitions(dark PUBLIC -D_CRT_SECURE_NO_WARNINGS)
+  target_compile_definitions(uselib PRIVATE -D_CRT_RAND_S -DNOMINMAX -D_USE_MATH_DEFINES)
+endif()
+
+if(MSVC OR MINGW)
+  target_link_libraries(darknet PRIVATE ws2_32)
+  target_link_libraries(dark PUBLIC ws2_32)
+endif()
+
+target_link_libraries(darknet PRIVATE Threads::Threads)
+target_link_libraries(dark PUBLIC Threads::Threads)
+target_link_libraries(uselib PRIVATE Threads::Threads)
+
+if(ENABLE_ZED_CAMERA)
+  target_link_libraries(darknet PRIVATE ${ZED_LIBRARIES})
+  target_link_libraries(dark PUBLIC ${ZED_LIBRARIES})
+  target_link_libraries(uselib PRIVATE ${ZED_LIBRARIES})
+  target_compile_definitions(darknet PRIVATE -DZED_STEREO)
+  target_compile_definitions(uselib PRIVATE -DZED_STEREO)
+  target_compile_definitions(dark PUBLIC -DZED_STEREO)
+endif()
+
+if(ENABLE_CUDA)
+  target_include_directories(darknet PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+  target_include_directories(dark PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+  target_link_libraries(darknet PRIVATE curand cublas cuda)
+  target_link_libraries(dark PRIVATE curand cublas cuda)
+  set_target_properties(dark PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+  target_compile_definitions(darknet PRIVATE -DGPU)
+  target_compile_definitions(dark PUBLIC -DGPU)
+endif()
+
+if(ENABLE_CUDA_OPENGL_INTEGRATION)
+  target_compile_definitions(darknet PRIVATE -DCUDA_OPENGL_INTEGRATION)
+  target_compile_definitions(dark PUBLIC -DCUDA_OPENGL_INTEGRATION)
+endif()
+
+if(USE_INTEGRATED_LIBS AND WIN32)
+  target_compile_definitions(darknet PRIVATE -D_TIMESPEC_DEFINED)
+  target_compile_definitions(dark PRIVATE -D_TIMESPEC_DEFINED)
+endif()
+
+target_link_libraries(uselib PRIVATE dark)
+if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER "3.0" AND BUILD_USELIB_TRACK)
+  target_link_libraries(uselib_track PRIVATE dark)
+  target_compile_definitions(uselib_track PRIVATE TRACK_OPTFLOW=1)
+  target_compile_definitions(uselib_track PRIVATE -DUSE_CMAKE_LIBS)
+  if(BUILD_AS_CPP)
+    set_target_properties(uselib_track PROPERTIES LINKER_LANGUAGE CXX)
+  endif()
+  target_include_directories(uselib_track PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include)
+  target_link_libraries(uselib_track PRIVATE ${OpenCV_LINKED_COMPONENTS})
+  if(ENABLE_ZED_CAMERA)
+    target_link_libraries(uselib_track PRIVATE ${ZED_LIBRARIES})
+    target_compile_definitions(uselib_track PRIVATE -DZED_STEREO)
+  endif()
+  if(MSVC)
+    target_link_libraries(uselib_track PRIVATE PThreads4W::PThreads4W)
+    target_compile_definitions(uselib_track PRIVATE -D_CRT_RAND_S -DNOMINMAX -D_USE_MATH_DEFINES)
+  endif()
+  target_link_libraries(uselib_track PRIVATE Threads::Threads)
+endif()
+
+#set_target_properties(dark PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")
+set_target_properties(dark PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_LIST_DIR}/include/darknet.h;${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")
+
+set_target_properties(dark PROPERTIES CXX_VISIBILITY_PRESET hidden)
+
+install(TARGETS dark EXPORT DarknetTargets
+  RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
+  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
+  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
+  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
+  COMPONENT dev
+)
+install(TARGETS uselib darknet kmeansiou
+  DESTINATION "${INSTALL_BIN_DIR}"
+)
+if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER "3.0" AND BUILD_USELIB_TRACK)
+  install(TARGETS uselib_track
+    DESTINATION "${INSTALL_BIN_DIR}"
+  )
+endif()
+
+install(EXPORT DarknetTargets
+  FILE DarknetTargets.cmake
+  NAMESPACE Darknet::
+  DESTINATION "${INSTALL_CMAKE_DIR}"
+)
+
+# Export the package for use from the build-tree (this registers the build-tree with a global CMake-registry)
+export(PACKAGE Darknet)
+
+# Create the DarknetConfig.cmake
+# First of all we compute the relative path between the cmake config file and the include path
+file(RELATIVE_PATH REL_INCLUDE_DIR "${FULLPATH_INSTALL_CMAKE_DIR}" "${FULLPATH_INSTALL_INCLUDE_DIR}")
+set(CONF_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}" "${PROJECT_BINARY_DIR}")
+configure_file(DarknetConfig.cmake.in "${PROJECT_BINARY_DIR}/DarknetConfig.cmake" @ONLY)
+set(CONF_INCLUDE_DIRS "\${Darknet_CMAKE_DIR}/${REL_INCLUDE_DIR}")
+configure_file(DarknetConfig.cmake.in "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/DarknetConfig.cmake" @ONLY)
+
+# Create the DarknetConfigVersion.cmake
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file("${PROJECT_BINARY_DIR}/DarknetConfigVersion.cmake"
+  COMPATIBILITY SameMajorVersion
+)
+
+install(FILES
+  "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/DarknetConfig.cmake"
+  "${PROJECT_BINARY_DIR}/DarknetConfigVersion.cmake"
+  DESTINATION "${INSTALL_CMAKE_DIR}"
+)
+
+if (ENABLE_DEPLOY_CUSTOM_CMAKE_MODULES)
+  install(FILES
+    "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules/FindCUDNN.cmake"
+    "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules/FindPThreads4W.cmake"
+    DESTINATION "${INSTALL_CMAKE_DIR}"
+  )
+endif()
+
+if(ENABLE_CSHARP_WRAPPER)
+  add_subdirectory(src/csharp)
+endif()
+
+set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP TRUE)
+include(InstallRequiredSystemLibraries)
+
+install(
+    PROGRAMS ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS}
+    DESTINATION ${INSTALL_BIN_DIR}
+)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json")  # packaging is only configured when vcpkg.json is present
+  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json" VCPKG_JSON_STRING)
+  string(JSON CPACK_PACKAGE_NAME GET "${VCPKG_JSON_STRING}" name)  # quoted: a ';' inside the JSON text must not split it into several arguments
+  string(JSON CPACK_PACKAGE_HOMEPAGE_URL GET "${VCPKG_JSON_STRING}" homepage)
+  string(JSON CPACK_PACKAGE_DESCRIPTION GET "${VCPKG_JSON_STRING}" description)
+  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+  # Choose the CPack generator for the host platform.
+  if(UNIX AND NOT APPLE)
+    find_program(LSB_RELEASE_EXEC lsb_release)
+    execute_process(COMMAND ${LSB_RELEASE_EXEC} -is
+        OUTPUT_VARIABLE LSB_RELEASE_ID_SHORT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(LSB_RELEASE_ID_SHORT STREQUAL "Ubuntu")  # any other (or undetected) distribution falls back to RPM
+      set(CPACK_GENERATOR "DEB")
+      set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Darknet")
+    else()
+      set(CPACK_GENERATOR "RPM")
+    endif()
+  elseif(APPLE)
+    set(CPACK_GENERATOR "DragNDrop")
+  elseif(WIN32)
+    set(CPACK_PACKAGE_INSTALL_DIRECTORY ${CPACK_PACKAGE_NAME})
+    if(USE_NSIS)
+      set(CPACK_GENERATOR "NSIS")
+      string(JSON CPACK_NSIS_PACKAGE_NAME GET "${VCPKG_JSON_STRING}" name)
+      string(JSON CPACK_NSIS_DISPLAY_NAME GET "${VCPKG_JSON_STRING}" name)
+      set(CPACK_NSIS_ENABLE_UNINSTALL_BEFORE_INSTALL "ON")
+      set(CPACK_NSIS_MODIFY_PATH OFF) #disable extra page for adding to PATH, because it's broken on Win10+ due to NSIS not supporting MAX_PATH
+      set(CPACK_NSIS_MUI_ICON "${CMAKE_CURRENT_SOURCE_DIR}/src/darknet.ico")
+      set(CPACK_NSIS_MUI_UNIICON "${CMAKE_CURRENT_SOURCE_DIR}/src/darknet.ico")
+    else()
+      set(CPACK_GENERATOR "WIX")
+      #set(CPACK_WIX_UPGRADE_GUID "") # IMPORTANT! It has to be unique for every project!!
+    endif()
+  endif()
+
+  include(CPack)
+endif()
diff --git a/darknet-master/DarknetConfig.cmake.in b/darknet-master/DarknetConfig.cmake.in
new file mode 100644
index 0000000..ab63885
--- /dev/null
+++ b/darknet-master/DarknetConfig.cmake.in
@@ -0,0 +1,50 @@
+# Config file for the Darknet package
+
+get_filename_component(Darknet_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+list(APPEND CMAKE_MODULE_PATH "${Darknet_CMAKE_DIR}")
+
+include(CMakeFindDependencyMacro)
+
+if(@OpenCV_FOUND@)
+  find_dependency(OpenCV)
+endif()
+
+if(@ENABLE_CUDA@)
+  include(CheckLanguage)
+  check_language(CUDA)
+  if(NOT CMAKE_CUDA_COMPILER)
+    message(STATUS " --> WARNING: Unable to find native CUDA integration!")
+  endif()
+  if(@CUDNN_FOUND@)
+    find_dependency(CUDNN)
+  endif()
+endif()
+
+set(CMAKE_THREAD_PREFER_PTHREAD ON)
+find_dependency(Threads)
+
+if(MSVC)
+  find_dependency(PThreads4W)
+  set(CMAKE_CXX_FLAGS "/wd4018 /wd4244 /wd4267 /wd4305 ${CMAKE_CXX_FLAGS}")
+  if(@unofficial-getopt-win32_FOUND@)
+    find_dependency(unofficial-getopt-win32)
+  endif()
+endif()
+
+if(@OPENMP_FOUND@)
+  find_dependency(OpenMP)
+endif()
+
+# Our library dependencies (contains definitions for IMPORTED targets)
+include("${Darknet_CMAKE_DIR}/DarknetTargets.cmake")
+include("${Darknet_CMAKE_DIR}/DarknetConfigVersion.cmake")
+
+if(@OpenCV_FOUND@)
+  set_property(TARGET Darknet::dark APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${OpenCV_INCLUDE_DIRS}")
+endif()
+
+get_target_property(FULL_DARKNET_INCLUDE_DIRS Darknet::dark INTERFACE_INCLUDE_DIRECTORIES)
+list(GET FULL_DARKNET_INCLUDE_DIRS 0 Darknet_INCLUDE_DIR)  # the first interface include directory is reported as Darknet_INCLUDE_DIR
+get_filename_component(Darknet_INCLUDE_DIR "${Darknet_INCLUDE_DIR}" REALPATH)
+include(FindPackageHandleStandardArgs)  # provides find_package_handle_standard_args(); do not rely on a dependency's find module having loaded it
+find_package_handle_standard_args(Darknet REQUIRED_VARS Darknet_INCLUDE_DIR VERSION_VAR PACKAGE_VERSION)
diff --git a/darknet-master/Dockerfile.cpu b/darknet-master/Dockerfile.cpu
new file mode 100644
index 0000000..7a15cfb
--- /dev/null
+++ b/darknet-master/Dockerfile.cpu
@@ -0,0 +1,49 @@
+# Stage 1 (builder): compile darknet from the build context with make.
+FROM ubuntu:latest AS builder
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Refresh the package index and install in one layer so a cached, stale index is never reused.
+RUN apt-get update -y \
+    && apt-get install -y g++ make pkg-config libopencv-dev
+
+COPY  . /darknet
+
+WORKDIR /darknet
+
+# Remove the container build files so they are not carried into the final image.
+RUN rm Dockerfile.cpu
+RUN rm Dockerfile.gpu
+RUN rm Docker-compose.yml
+
+RUN make
+
+# Stage 2 (runtime): copy the built tree from the builder and run as user "yolo".
+FROM ubuntu:latest
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y \
+    && apt-get install -y sudo libgomp1
+
+RUN useradd -U -m yolo
+
+RUN usermod -aG sudo yolo
+
+RUN usermod --shell /bin/bash yolo
+
+# NOTE(review): fixed, well-known password on a sudo-capable account; change it before exposing the container.
+RUN echo "yolo:yolo" | chpasswd
+
+COPY --from=builder /darknet /home/yolo/darknet
+
+# libdarknet.so is only produced when the Makefile is built with LIBSO=1, hence the tolerant fallback.
+RUN cp /home/yolo/darknet/libdarknet.so /usr/local/lib/libdarknet.so || echo "libso not used"
+
+RUN cp /home/yolo/darknet/include/darknet.h /usr/local/include/darknet.h
+
+RUN ldconfig
+
+WORKDIR /home/yolo/darknet
+
+USER yolo
diff --git a/darknet-master/Dockerfile.gpu b/darknet-master/Dockerfile.gpu
new file mode 100644
index 0000000..c4a9eff
--- /dev/null
+++ b/darknet-master/Dockerfile.gpu
@@ -0,0 +1,47 @@
+# Stage 1 (builder): compile darknet from the build context inside the CUDA/cuDNN development image.
+FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04 AS builder
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Refresh the package index and install in one layer so a cached, stale index is never reused.
+RUN apt-get update -y \
+    && apt-get install -y g++ make pkg-config libopencv-dev
+
+COPY  . /darknet
+
+WORKDIR /darknet
+
+# Remove the container build files so they are not carried into the final image.
+RUN rm Dockerfile.cpu
+RUN rm Dockerfile.gpu
+RUN rm Docker-compose.yml
+# NOTE(review): the Makefile defaults to GPU=0 and CUDNN=0, so a plain make only builds GPU code if the copied Makefile enables them.
+RUN make
+
+# Stage 2 (runtime): copy the built tree from the builder and run as user "yolo".
+FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y \
+    && apt-get install -y sudo libgomp1
+
+RUN useradd -U -m yolo
+RUN usermod -aG sudo yolo
+RUN usermod --shell /bin/bash yolo
+
+# NOTE(review): fixed, well-known password on a sudo-capable account; change it before exposing the container.
+RUN echo "yolo:yolo" | chpasswd
+
+COPY --from=builder /darknet /home/yolo/darknet
+
+# libdarknet.so is only produced when the Makefile is built with LIBSO=1, hence the tolerant fallback.
+RUN cp /home/yolo/darknet/libdarknet.so /usr/local/lib/libdarknet.so || echo "libso not used"
+
+RUN cp /home/yolo/darknet/include/darknet.h /usr/local/include/darknet.h
+
+RUN ldconfig
+
+WORKDIR /home/yolo/darknet
+
+USER yolo
\ No newline at end of file
diff --git a/darknet-master/LICENSE b/darknet-master/LICENSE
new file mode 100644
index 0000000..a50f7d7
--- /dev/null
+++ b/darknet-master/LICENSE
@@ -0,0 +1,12 @@
+                                  YOLO LICENSE
+                             Version 2, July 29 2016
+
+THIS SOFTWARE LICENSE IS PROVIDED "ALL CAPS" SO THAT YOU KNOW IT IS SUPER
+SERIOUS AND YOU DON'T MESS AROUND WITH COPYRIGHT LAW BECAUSE YOU WILL GET IN
+TROUBLE HERE ARE SOME OTHER BUZZWORDS COMMONLY IN THESE THINGS WARRANTIES
+LIABILITY CONTRACT TORT LIABLE CLAIMS RESTRICTION MERCHANTABILITY. NOW HERE'S
+THE REAL LICENSE:
+
+0. Darknet is public domain.
+1. Do whatever you want with it.
+2. Stop emailing me about it!
diff --git a/darknet-master/Makefile b/darknet-master/Makefile
new file mode 100644
index 0000000..167d071
--- /dev/null
+++ b/darknet-master/Makefile
@@ -0,0 +1,215 @@
+GPU=0
+CUDNN=0
+CUDNN_HALF=0
+OPENCV=0
+AVX=0
+OPENMP=0
+LIBSO=0
+ZED_CAMERA=0
+ZED_CAMERA_v2_8=0
+
+# Set GPU=1 and CUDNN=1 to speed up on GPU.
+# Set CUDNN_HALF=1 for a further ~3x speedup (mixed precision on Tensor Cores); GPU: Volta, Xavier, Turing, Ampere, Ada and higher.
+# Set AVX=1 and OPENMP=1 to speed up on CPU (if an error occurs, set AVX=0).
+# Set ZED_CAMERA=1 to enable ZED SDK 3.0 and above.
+# Set ZED_CAMERA_v2_8=1 (together with ZED_CAMERA=1) to enable ZED SDK 2.X.
+
+USE_CPP=0
+DEBUG=0
+
+ARCH= -gencode arch=compute_50,code=[sm_50,compute_50] \
+      -gencode arch=compute_52,code=[sm_52,compute_52] \
+	    -gencode arch=compute_61,code=[sm_61,compute_61]
+
+OS := $(shell uname)
+
+# Naming confusion with recent RTX cards.
+# "NVIDIA Quadro RTX x000" and T1000/Tx00 are Turing Architecture Family with Compute Capability of 7.5
+# "NVIDIA RTX Ax000" are Ampere Architecture Family with Compute Capability of 8.6
+# NVIDIA "RTX x000 Ada" are Ada Lovelace Architecture Family with Compute Capability of 8.9
+# Source https://developer.nvidia.com/cuda-gpus
+
+# KEPLER, GeForce GTX 770, GTX 760, GT 740
+# ARCH= -gencode arch=compute_30,code=sm_30
+
+# MAXWELL, GeForce GTX 950, 960, 970, 980, 980 Ti, "GTX" Titan X
+# ARCH= -gencode arch=compute_52,code=sm_52
+
+# Jetson TX1, Tegra X1, DRIVE CX, DRIVE PX, Jetson Nano (2GB, 4GB)
+# ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]
+
+# GP100/Tesla P100 - DGX-1
+# ARCH= -gencode arch=compute_60,code=sm_60
+
+# PASCAL, GTX 10x0, GTX 10x0 Ti, Titan Xp, Tesla P40, Tesla P4
+# ARCH= -gencode arch=compute_61,code=[sm_61,compute_61]
+
+# For Jetson TX2, Jetson Nano TX2 or Drive-PX2 uncomment:
+# ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]
+
+# Tesla V100
+# ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]
+
+# Jetson XAVIER, XAVIER NX
+# ARCH= -gencode arch=compute_72,code=[sm_72,compute_72]
+
+# GeForce Titan RTX, RTX 20x0, RTX 20x0 Ti, Quadro RTX x000, Tesla T4, XNOR Tensor Cores
+# ARCH= -gencode arch=compute_75,code=[sm_75,compute_75]
+
+# Tesla A100 (GA100), DGX-A100, A30, A100, RTX 3080
+# ARCH= -gencode arch=compute_80,code=[sm_80,compute_80]
+
+# GeForce RTX 30x0, 30x0 Ti, Tesla GA10x, RTX Axxxx, A2, A10, A16, A40
+# ARCH= -gencode arch=compute_86,code=[sm_86,compute_86]
+
+# NOT TESTED, THEORETICAL
+# Jetson ORIN, ORIN NX, ORIN NANO
+# ARCH= -gencode arch=compute_87,code=[sm_87,compute_87]
+
+# NOT TESTED, THEORETICAL
+# GeForce RTX 4070 Ti, 4080, 4090, L4, L40
+# ARCH= -gencode arch=compute_89,code=[sm_89,compute_89]
+
+# NOT TESTED, THEORETICAL
+# Nvidia H100
+# ARCH= -gencode arch=compute_90,code=[sm_90,compute_90]
+
+VPATH=./src/
+EXEC=darknet
+OBJDIR=./obj/
+
+ifeq ($(LIBSO), 1)
+LIBNAMESO=libdarknet.so
+APPNAMESO=uselib
+endif
+
+ifeq ($(USE_CPP), 1)
+CC=g++
+else
+CC=gcc
+endif
+
+CPP=g++ -std=c++11
+NVCC=nvcc
+OPTS=-Ofast
+LDFLAGS= -lm -pthread
+COMMON= -Iinclude/ -I3rdparty/stb/include
+CFLAGS=-Wall -Wfatal-errors -Wno-unused-result -Wno-unknown-pragmas -fPIC -rdynamic
+
+ifeq ($(DEBUG), 1)
+#OPTS= -O0 -g
+#OPTS= -Og -g
+COMMON+= -DDEBUG
+CFLAGS+= -DDEBUG
+else
+ifeq ($(AVX), 1)
+CFLAGS+= -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a
+endif
+endif
+
+CFLAGS+=$(OPTS)
+
+ifneq (,$(findstring MSYS_NT,$(OS)))
+LDFLAGS+=-lws2_32
+endif
+
+ifeq ($(OPENCV), 1)
+COMMON+= -DOPENCV
+CFLAGS+= -DOPENCV
+LDFLAGS+= `pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv`
+COMMON+= `pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv`
+endif
+
+ifeq ($(OPENMP), 1)
+    ifeq ($(OS),Darwin) #MAC
+	    CFLAGS+= -Xpreprocessor -fopenmp
+	else
+		CFLAGS+= -fopenmp
+	endif
+LDFLAGS+= -lgomp
+endif
+
+ifeq ($(GPU), 1)
+COMMON+= -DGPU -I/usr/local/cuda/include/
+CFLAGS+= -DGPU
+ifeq ($(OS),Darwin) #MAC
+LDFLAGS+= -L/usr/local/cuda/lib -lcuda -lcudart -lcublas -lcurand
+else
+LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
+endif
+endif
+
+ifeq ($(CUDNN), 1)
+COMMON+= -DCUDNN
+ifeq ($(OS),Darwin) #MAC
+CFLAGS+= -DCUDNN -I/usr/local/cuda/include
+LDFLAGS+= -L/usr/local/cuda/lib -lcudnn
+else
+CFLAGS+= -DCUDNN -I/usr/local/cudnn/include
+LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn
+endif
+endif
+
+ifeq ($(CUDNN_HALF), 1)
+COMMON+= -DCUDNN_HALF
+CFLAGS+= -DCUDNN_HALF
+ARCH+= -gencode arch=compute_70,code=[sm_70,compute_70]
+endif
+
+ifeq ($(ZED_CAMERA), 1)
+CFLAGS+= -DZED_STEREO -I/usr/local/zed/include
+ifeq ($(ZED_CAMERA_v2_8), 1)
+LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed
+#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0
+else
+LDFLAGS+= -L/usr/local/zed/lib -lsl_zed
+#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0
+endif
+endif
+
+OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o representation_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o gaussian_yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o sam_layer.o
+ifeq ($(GPU), 1)
+LDFLAGS+= -lstdc++
+OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
+endif
+
+OBJS = $(addprefix $(OBJDIR), $(OBJ))
+DEPS = $(wildcard src/*.h) Makefile include/darknet.h
+# Default goal: create the output directories, mark the scripts executable, then build the binaries.
+all: $(OBJDIR) backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO)
+
+ifeq ($(LIBSO), 1)
+CFLAGS+= -fPIC
+# $(OBJDIR) is order-only here: the directory timestamp changes with every new object and must not force a relink.
+$(LIBNAMESO): $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp | $(OBJDIR)
+	$(CPP) -shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS) $(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS)
+
+$(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp src/yolo_console_dll.cpp
+	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) -o $@ src/yolo_console_dll.cpp $(LDFLAGS) -L ./ -l:$(LIBNAMESO)
+endif
+
+$(EXEC): $(OBJS)
+	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+# Every object rule needs $(OBJDIR) (order-only) so that parallel builds (make -j) create it before compiling.
+$(OBJDIR)%.o: %.c $(DEPS) | $(OBJDIR)
+	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
+
+$(OBJDIR)%.o: %.cpp $(DEPS) | $(OBJDIR)
+	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) -c $< -o $@
+
+$(OBJDIR)%.o: %.cu $(DEPS) | $(OBJDIR)
+	$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
+
+$(OBJDIR):
+	mkdir -p $(OBJDIR)
+backup:
+	mkdir -p backup
+results:
+	mkdir -p results
+setchmod:
+	chmod +x *.sh
+# all and setchmod never create a file of that name, so they are declared phony together with clean.
+.PHONY: all clean setchmod
+
+clean:
+	rm -rf $(OBJS) $(EXEC) $(LIBNAMESO) $(APPNAMESO)
diff --git a/darknet-master/README.md b/darknet-master/README.md
new file mode 100644
index 0000000..0a1ae52
--- /dev/null
+++ b/darknet-master/README.md
@@ -0,0 +1,850 @@
+# Yolo v4, v3 and v2 for Windows and Linux
+
+* Read the FAQ:  https://www.ccoderun.ca/programming/darknet_faq/
+* Join the Darknet/YOLO Discord:  https://discord.gg/zSq8rtW
+* Recommended GitHub repo for Darknet/YOLO:  https://github.com/hank-ai/darknetcv/
+* Hank.ai and Darknet/YOLO:  https://hank.ai/darknet-welcomes-hank-ai-as-official-sponsor-and-commercial-entity/
+
+## (neural networks for object detection)
+
+* Paper **YOLOv7**: https://arxiv.org/abs/2207.02696
+
+* source code YOLOv7 - Pytorch (use to reproduce results): https://github.com/WongKinYiu/yolov7
+
+----
+
+* Paper **YOLOv4**: https://arxiv.org/abs/2004.10934
+
+* source code YOLOv4 - Darknet (use to reproduce results): https://github.com/AlexeyAB/darknet
+
+----
+
+* Paper **Scaled-YOLOv4 (CVPR 2021)**: https://openaccess.thecvf.com/content/CVPR2021/html/Wang_Scaled-YOLOv4_Scaling_Cross_Stage_Partial_Network_CVPR_2021_paper.html
+
+* source code Scaled-YOLOv4 - Pytorch (use to reproduce results): https://github.com/WongKinYiu/ScaledYOLOv4
+
+----
+
+### YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors
+
+* **Paper**: https://arxiv.org/abs/2207.02696
+
+* **source code - Pytorch (use to reproduce results):** https://github.com/WongKinYiu/yolov7
+
+
+YOLOv7 is more accurate than, and faster than, YOLOv5 (by **120%** FPS), YOLOX (by **180%** FPS), Dual-Swin-T (by **1200%** FPS), ConvNeXt (by **550%** FPS), SWIN-L (by **500%** FPS) and PPYOLOE-X (by **150%** FPS).
+
+YOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8% AP among all known real-time object detectors with 30 FPS or higher on GPU V100, batch=1. 
+
+* YOLOv7-e6 (55.9% AP, 56 FPS V100 b=1) is `+500%` FPS faster than SWIN-L C-M-RCNN (53.9% AP, 9.2 FPS A100 b=1)
+* YOLOv7-e6 (55.9% AP, 56 FPS V100 b=1) is `+550%` FPS faster than ConvNeXt-XL C-M-RCNN (55.2% AP, 8.6 FPS A100 b=1)
+* YOLOv7-w6 (54.6% AP, 84 FPS V100 b=1) is `+120%` FPS faster than YOLOv5-X6-r6.1 (55.0% AP, 38 FPS V100 b=1)
+* YOLOv7-w6 (54.6% AP, 84 FPS V100 b=1) is `+1200%` FPS faster than Dual-Swin-T C-M-RCNN (53.6% AP, 6.5 FPS V100 b=1)
+* YOLOv7x (52.9% AP, 114 FPS V100 b=1) is `+150%` FPS faster than PPYOLOE-X (51.9% AP, 45 FPS V100 b=1)
+* YOLOv7 (51.2% AP, 161 FPS V100 b=1) is `+180%` FPS faster than YOLOX-X (51.1% AP, 58 FPS V100 b=1)
+
+
+----
+
+![more5](https://user-images.githubusercontent.com/4096485/179425274-f55a36d4-8450-4471-816b-8c105841effd.jpg)
+
+----
+
+![image](https://user-images.githubusercontent.com/4096485/177675030-a929ee00-0eba-4d93-95c2-225231d0fd61.png)
+
+
+----
+
+More details in these articles on Medium:
+
+- [Scaled_YOLOv4](https://alexeyab84.medium.com/scaled-yolo-v4-is-the-best-neural-network-for-object-detection-on-ms-coco-dataset-39dfa22fa982?source=friends_link&sk=c8553bfed861b1a7932f739d26f487c8)
+- [YOLOv4](https://medium.com/@alexeyab84/yolov4-the-most-accurate-real-time-neural-network-on-ms-coco-dataset-73adfd3602fe?source=friends_link&sk=6039748846bbcf1d960c3061542591d7)
+
+Manual: https://github.com/AlexeyAB/darknet/wiki
+
+Discussion:
+
+- [Discord](https://discord.gg/zSq8rtW)
+
+About Darknet framework: http://pjreddie.com/darknet/
+
+[![Darknet Continuous Integration](https://github.com/AlexeyAB/darknet/workflows/Darknet%20Continuous%20Integration/badge.svg)](https://github.com/AlexeyAB/darknet/actions?query=workflow%3A%22Darknet+Continuous+Integration%22)
+[![CircleCI](https://circleci.com/gh/AlexeyAB/darknet.svg?style=svg)](https://circleci.com/gh/AlexeyAB/darknet)
+[![Contributors](https://img.shields.io/github/contributors/AlexeyAB/Darknet.svg)](https://github.com/AlexeyAB/darknet/graphs/contributors)
+[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/AlexeyAB/darknet/blob/master/LICENSE)
+[![DOI](https://zenodo.org/badge/75388965.svg)](https://zenodo.org/badge/latestdoi/75388965)
+[![arxiv.org](http://img.shields.io/badge/cs.CV-arXiv%3A2004.10934-B31B1B.svg)](https://arxiv.org/abs/2004.10934)
+[![arxiv.org](http://img.shields.io/badge/cs.CV-arXiv%3A2011.08036-B31B1B.svg)](https://arxiv.org/abs/2011.08036)
+[![colab](https://user-images.githubusercontent.com/4096485/86174089-b2709f80-bb29-11ea-9faf-3d8dc668a1a5.png)](https://colab.research.google.com/drive/12QusaaRj_lUwCGDvQNfICpa7kA7_a2dE)
+[![colab](https://user-images.githubusercontent.com/4096485/86174097-b56b9000-bb29-11ea-9240-c17f6bacfc34.png)](https://colab.research.google.com/drive/1_GdoqCJWXsChrOiY8sZMr_zbr_fH-0Fg)
+
+- [YOLOv4 model zoo](https://github.com/AlexeyAB/darknet/wiki/YOLOv4-model-zoo)
+- [Requirements (and how to install dependencies)](#requirements-for-windows-linux-and-macos)
+- [Pre-trained models](#pre-trained-models)
+- [FAQ - frequently asked questions](https://github.com/AlexeyAB/darknet/wiki/FAQ---frequently-asked-questions)
+- [Explanations in issues](https://github.com/AlexeyAB/darknet/issues?q=is%3Aopen+is%3Aissue+label%3AExplanations)
+- [Yolo v4 in other frameworks (TensorRT, TensorFlow, PyTorch, OpenVINO, OpenCV-dnn, TVM,...)](#yolo-v4-in-other-frameworks)
+- [Datasets](#datasets)
+
+- [Yolo v4, v3 and v2 for Windows and Linux](#yolo-v4-v3-and-v2-for-windows-and-linux)
+  - [(neural networks for object detection)](#neural-networks-for-object-detection)
+    - [GeForce RTX 2080 Ti](#geforce-rtx-2080-ti)
+      - [Youtube video of results](#youtube-video-of-results)
+      - [How to evaluate AP of YOLOv4 on the MS COCO evaluation server](#how-to-evaluate-ap-of-yolov4-on-the-ms-coco-evaluation-server)
+      - [How to evaluate FPS of YOLOv4 on GPU](#how-to-evaluate-fps-of-yolov4-on-gpu)
+      - [Pre-trained models](#pre-trained-models)
+    - [Requirements for Windows, Linux and macOS](#requirements-for-windows-linux-and-macos)
+    - [Yolo v4 in other frameworks](#yolo-v4-in-other-frameworks)
+      - [Datasets](#datasets)
+    - [Improvements in this repository](#improvements-in-this-repository)
+      - [How to use on the command line](#how-to-use-on-the-command-line)
+        - [For using network video-camera mjpeg-stream with any Android smartphone](#for-using-network-video-camera-mjpeg-stream-with-any-android-smartphone)
+    - [How to compile on Linux/macOS (using `CMake`)](#how-to-compile-on-linuxmacos-using-cmake)
+    - [Using also PowerShell](#using-also-powershell)
+    - [How to compile on Linux (using `make`)](#how-to-compile-on-linux-using-make)
+    - [How to compile on Windows (using `CMake`)](#how-to-compile-on-windows-using-cmake)
+    - [How to compile on Windows (using `vcpkg`)](#how-to-compile-on-windows-using-vcpkg)
+  - [How to train with multi-GPU](#how-to-train-with-multi-gpu)
+  - [How to train (to detect your custom objects)](#how-to-train-to-detect-your-custom-objects)
+    - [How to train tiny-yolo (to detect your custom objects)](#how-to-train-tiny-yolo-to-detect-your-custom-objects)
+  - [When should I stop training](#when-should-i-stop-training)
+    - [Custom object detection](#custom-object-detection)
+  - [How to improve object detection](#how-to-improve-object-detection)
+  - [How to mark bounded boxes of objects and create annotation files](#how-to-mark-bounded-boxes-of-objects-and-create-annotation-files)
+  - [How to use Yolo as DLL and SO libraries](#how-to-use-yolo-as-dll-and-so-libraries)
+  - [Citation](#citation)
+
+![Darknet Logo](http://pjreddie.com/media/files/darknet-black-small.png)
+
+![scaled_yolov4](https://user-images.githubusercontent.com/4096485/112776361-281d8380-9048-11eb-8083-8728b12dcd55.png) AP50:95 - FPS (Tesla V100) Paper: https://arxiv.org/abs/2011.08036
+
+----
+
+![modern_gpus](https://user-images.githubusercontent.com/4096485/82835867-f1c62380-9ecd-11ea-9134-1598ed2abc4b.png) AP50:95 / AP50 - FPS (Tesla V100) Paper: https://arxiv.org/abs/2004.10934
+
+tkDNN-TensorRT accelerates YOLOv4 **~2x** times for batch=1 and **3x-4x** times for batch=4.
+
+- tkDNN: https://github.com/ceccocats/tkDNN
+- OpenCV: https://gist.github.com/YashasSamaga/48bdb167303e10f4d07b754888ddbdcf
+
+### GeForce RTX 2080 Ti
+
+| Network Size               | Darknet, FPS (avg) | tkDNN TensorRT FP32, FPS | tkDNN TensorRT FP16, FPS | OpenCV FP16, FPS | tkDNN TensorRT FP16 batch=4, FPS | OpenCV FP16 batch=4, FPS | tkDNN Speedup |
+|:--------------------------:|:------------------:|-------------------------:|-------------------------:|-----------------:|---------------------------------:|-------------------------:|--------------:|
+|320                         | 100                | 116                      | **202**                  | 183              | 423                              | **430**                  | **4.3x**      |
+|416                         | 82                 | 103                      | **162**                  | 159              | 284                              | **294**                  | **3.6x**      |
+|512                         | 69                 | 91                       | 134                      | **138**          | 206                              | **216**                  | **3.1x**      |
+|608                         | 53                 | 62                       | 103                      | **115**          | 150                              | **150**                  | **2.8x**      |
+|Tiny 416                    | 443                | 609                      | **790**                  | 773              | **1774**                         | 1353                     | **3.5x**      |
+|Tiny 416 CPU Core i7 7700HQ | 3.4                | -                        | -                        | 42               | -                                | 39                       | **12x**       |
+
+- Yolo v4 Full comparison: [map_fps](https://user-images.githubusercontent.com/4096485/80283279-0e303e00-871f-11ea-814c-870967d77fd1.png)
+- Yolo v4 tiny comparison: [tiny_fps](https://user-images.githubusercontent.com/4096485/85734112-6e366700-b705-11ea-95d1-fcba0de76d72.png)
+- CSPNet: [paper](https://arxiv.org/abs/1911.11929) and [map_fps](https://user-images.githubusercontent.com/4096485/71702416-6645dc00-2de0-11ea-8d65-de7d4b604021.png) comparison: https://github.com/WongKinYiu/CrossStagePartialNetworks
+- Yolo v3 on MS COCO: [Speed / Accuracy (mAP@0.5) chart](https://user-images.githubusercontent.com/4096485/52151356-e5d4a380-2683-11e9-9d7d-ac7bc192c477.jpg)
+- Yolo v3 on MS COCO (Yolo v3 vs RetinaNet) - Figure 3: https://arxiv.org/pdf/1804.02767v1.pdf
+- Yolo v2 on Pascal VOC 2007: https://hsto.org/files/a24/21e/068/a2421e0689fb43f08584de9d44c2215f.jpg
+- Yolo v2 on Pascal VOC 2012 (comp4): https://hsto.org/files/3a6/fdf/b53/3a6fdfb533f34cee9b52bdd9bb0b19d9.jpg
+
+#### Youtube video of results
+
+| [![Yolo v4](https://user-images.githubusercontent.com/4096485/101360000-1a33cf00-38ae-11eb-9e5e-b29c5fb0afbe.png)](https://youtu.be/1_SiUOYUoOI "Yolo v4") |  [![Scaled Yolo v4](https://user-images.githubusercontent.com/4096485/101359389-43a02b00-38ad-11eb-866c-f813e96bf61a.png)](https://youtu.be/YDFf-TqJOFE "Scaled Yolo v4") |
+|---|---|
+
+Others: https://www.youtube.com/user/pjreddie/videos
+
+#### How to evaluate AP of YOLOv4 on the MS COCO evaluation server
+
+1. Download and unzip test-dev2017 dataset from MS COCO server: http://images.cocodataset.org/zips/test2017.zip
+2. Download list of images for Detection tasks and replace the paths with yours: https://raw.githubusercontent.com/AlexeyAB/darknet/master/scripts/testdev2017.txt
+3. Download `yolov4.weights` file 245 MB: [yolov4.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights) (Google-drive mirror [yolov4.weights](https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) )
+4. Content of the file `cfg/coco.data` should be
+
+```ini
+classes= 80
+train  = <replace with your path>/trainvalno5k.txt
+valid = <replace with your path>/testdev2017.txt
+names = data/coco.names
+backup = backup
+eval=coco
+```
+
+5. Create the `/results/` folder next to the `./darknet` executable file
+6. Run validation: `./darknet detector valid cfg/coco.data cfg/yolov4.cfg yolov4.weights`
+7. Rename the file  `/results/coco_results.json` to `detections_test-dev2017_yolov4_results.json` and compress it to `detections_test-dev2017_yolov4_results.zip`
+8. Submit file `detections_test-dev2017_yolov4_results.zip` to the MS COCO evaluation server for the `test-dev2019 (bbox)`
+
+#### How to evaluate FPS of YOLOv4 on GPU
+
+1. Compile Darknet with `GPU=1 CUDNN=1 CUDNN_HALF=1 OPENCV=1` in the `Makefile`
+2. Download `yolov4.weights` file 245 MB: [yolov4.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights) (Google-drive mirror [yolov4.weights](https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) )
+3. Get any .avi/.mp4 video file (preferably not more than 1920x1080 to avoid bottlenecks in CPU performance)
+4. Run one of two commands and look at the AVG FPS:
+
+- include video_capturing + NMS + drawing_bboxes:
+    `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights test.mp4 -dont_show -ext_output`
+- exclude video_capturing + NMS + drawing_bboxes:
+    `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights test.mp4 -benchmark`
+
+#### Pre-trained models
+
+There are weights files for different cfg files (trained on the MS COCO dataset):
+
+FPS on RTX 2070 (R) and Tesla V100 (V):
+
+- [yolov4-p6.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-p6.cfg) - 1280x1280 - **72.1% mAP@0.5 (54.0% AP@0.5:0.95) - 32(V) FPS** - xxx BFlops (xxx FMA) - 487 MB: [yolov4-p6.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-p6.weights)
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-p6.conv.289
+
+- [yolov4-p5.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-p5.cfg) - 896x896 - **70.0% mAP@0.5 (51.6% AP@0.5:0.95) - 43(V) FPS** - xxx BFlops (xxx FMA) - 271 MB: [yolov4-p5.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-p5.weights)
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-p5.conv.232
+
+- [yolov4-csp-x-swish.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp-x-swish.cfg) - 640x640 - **69.9% mAP@0.5 (51.5% AP@0.5:0.95) - 23(R) FPS / 50(V) FPS** - 221 BFlops (110 FMA) - 381 MB: [yolov4-csp-x-swish.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp-x-swish.weights)
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp-x-swish.conv.192
+
+- [yolov4-csp-swish.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp-swish.cfg) - 640x640 - **68.7% mAP@0.5 (50.0% AP@0.5:0.95) - 70(V) FPS** - 120 BFlops (60 FMA) - 202 MB: [yolov4-csp-swish.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp-swish.weights)
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp-swish.conv.164
+
+- [yolov4x-mish.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4x-mish.cfg) - 640x640 - **68.5% mAP@0.5 (50.1% AP@0.5:0.95) - 23(R) FPS / 50(V) FPS** - 221 BFlops (110 FMA) - 381 MB: [yolov4x-mish.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4x-mish.weights)
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4x-mish.conv.166
+
+- [yolov4-csp.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp.cfg) - 202 MB: [yolov4-csp.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp.weights) paper [Scaled Yolo v4](https://arxiv.org/abs/2011.08036)
+
+    just change `width=` and `height=` parameters in `yolov4-csp.cfg` file and use the same `yolov4-csp.weights` file for all cases:
+  - `width=640 height=640` in cfg: **67.4% mAP@0.5 (48.7% AP@0.5:0.95) - 70(V) FPS** - 120 (60 FMA) BFlops
+  - `width=512 height=512` in cfg: **64.8% mAP@0.5 (46.2% AP@0.5:0.95) - 93(V) FPS** - 77 (39 FMA) BFlops
+  - pre-trained weights for training: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp.conv.142
+
+- [yolov4.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg) - 245 MB: [yolov4.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights) (Google-drive mirror [yolov4.weights](https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) ) paper [Yolo v4](https://arxiv.org/abs/2004.10934)
+    just change `width=` and `height=` parameters in `yolov4.cfg` file and use the same `yolov4.weights` file for all cases:
+  - `width=608 height=608` in cfg: **65.7% mAP@0.5 (43.5% AP@0.5:0.95) - 34(R) FPS / 62(V) FPS** - 128.5 BFlops
+  - `width=512 height=512` in cfg: **64.9% mAP@0.5 (43.0% AP@0.5:0.95) - 45(R) FPS / 83(V) FPS** - 91.1 BFlops
+  - `width=416 height=416` in cfg: **62.8% mAP@0.5 (41.2% AP@0.5:0.95) - 55(R) FPS / 96(V) FPS** - 60.1 BFlops
+  - `width=320 height=320` in cfg:   **60% mAP@0.5 (  38% AP@0.5:0.95) - 63(R) FPS / 123(V) FPS** - 35.5 BFlops
+
+- [yolov4-tiny.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg) - **40.2% mAP@0.5 - 371(1080Ti) FPS / 330(RTX2070) FPS** - 6.9 BFlops - 23.1 MB: [yolov4-tiny.weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights)
+
+- [enet-coco.cfg (EfficientNetB0-Yolov3)](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/enet-coco.cfg) - **45.5% mAP@0.5 - 55(R) FPS** - 3.7 BFlops - 18.3 MB: [enetb0-coco_final.weights](https://drive.google.com/file/d/1FlHeQjWEQVJt0ay1PVsiuuMzmtNyv36m/view)
+
+- [yolov3-openimages.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-openimages.cfg) - 247 MB - 18(R) FPS - OpenImages dataset: [yolov3-openimages.weights](https://pjreddie.com/media/files/yolov3-openimages.weights)
+
+<details><summary><b>CLICK ME</b> - Yolo v3 models</summary>
+
+- [csresnext50-panet-spp-original-optimal.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/csresnext50-panet-spp-original-optimal.cfg) - **65.4% mAP@0.5 (43.2% AP@0.5:0.95) - 32(R) FPS** - 100.5 BFlops - 217 MB: [csresnext50-panet-spp-original-optimal_final.weights](https://drive.google.com/open?id=1_NnfVgj0EDtb_WLNoXV8Mo7WKgwdYZCc)
+
+- [yolov3-spp.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-spp.cfg) - **60.6% mAP@0.5 - 38(R) FPS** - 141.5 BFlops - 240 MB: [yolov3-spp.weights](https://pjreddie.com/media/files/yolov3-spp.weights)
+
+- [csresnext50-panet-spp.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/csresnext50-panet-spp.cfg) - **60.0% mAP@0.5 - 44 FPS** - 71.3 BFlops - 217 MB: [csresnext50-panet-spp_final.weights](https://drive.google.com/file/d/1aNXdM8qVy11nqTcd2oaVB3mf7ckr258-/view?usp=sharing)
+
+- [yolov3.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3.cfg) - **55.3% mAP@0.5 - 66(R) FPS** - 65.9 BFlops - 236 MB: [yolov3.weights](https://pjreddie.com/media/files/yolov3.weights)
+
+- [yolov3-tiny.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-tiny.cfg) - **33.1% mAP@0.5 - 345(R) FPS** - 5.6 BFlops - 33.7 MB: [yolov3-tiny.weights](https://pjreddie.com/media/files/yolov3-tiny.weights)
+
+- [yolov3-tiny-prn.cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-tiny-prn.cfg) - **33.1% mAP@0.5 - 370(R) FPS** - 3.5 BFlops - 18.8 MB: [yolov3-tiny-prn.weights](https://drive.google.com/file/d/18yYZWyKbo4XSDVyztmsEcF9B_6bxrhUY/view?usp=sharing)
+
+</details>
+
+<details><summary><b>CLICK ME</b> - Yolo v2 models</summary>
+
+- `yolov2.cfg` (194 MB COCO Yolo v2) - requires 4 GB GPU-RAM: https://pjreddie.com/media/files/yolov2.weights
+- `yolo-voc.cfg` (194 MB VOC Yolo v2) - requires 4 GB GPU-RAM: http://pjreddie.com/media/files/yolo-voc.weights
+- `yolov2-tiny.cfg` (43 MB COCO Yolo v2) - requires 1 GB GPU-RAM: https://pjreddie.com/media/files/yolov2-tiny.weights
+- `yolov2-tiny-voc.cfg` (60 MB VOC Yolo v2) - requires 1 GB GPU-RAM: http://pjreddie.com/media/files/yolov2-tiny-voc.weights
+- `yolo9000.cfg` (186 MB Yolo9000-model) - requires 4 GB GPU-RAM: http://pjreddie.com/media/files/yolo9000.weights
+
+</details>
+
+Put the downloaded weights file next to the compiled executable: darknet.exe
+
+You can find the cfg files in: `darknet/cfg/`
+
+### Requirements for Windows, Linux and macOS
+
+- **CMake >= 3.18**: https://cmake.org/download/
+- **Powershell** (already installed on windows): https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell
+- **CUDA >= 10.2**: https://developer.nvidia.com/cuda-toolkit-archive (on Linux do [Post-installation Actions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions))
+- **OpenCV >= 2.4**: use your preferred package manager (brew, apt), build from source using [vcpkg](https://github.com/Microsoft/vcpkg) or download from the [OpenCV official site](https://opencv.org/releases.html) (on Windows set the system variable `OpenCV_DIR` = `C:\opencv\build` - the folder that contains the `include` and `x64` folders [image](https://user-images.githubusercontent.com/4096485/53249516-5130f480-36c9-11e9-8238-a6e82e48c6f2.png))
+- **cuDNN >= 8.0.2** https://developer.nvidia.com/rdp/cudnn-archive (on **Linux** follow steps described here https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installlinux-tar , on **Windows** follow steps described here https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installwindows)
+- **GPU with CC >= 3.0**: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+
+### Yolo v4 in other frameworks
+
+- **Pytorch - Scaled-YOLOv4:** https://github.com/WongKinYiu/ScaledYOLOv4
+- **TensorFlow:** `pip install yolov4` YOLOv4 on TensorFlow 2.0 / TFlite / Android: https://github.com/hunglc007/tensorflow-yolov4-tflite
+    Official TF models: https://github.com/tensorflow/models/tree/master/official/vision/beta/projects/yolo
+    For YOLOv4 - convert `yolov4.weights`/`cfg` files to `yolov4.pb` by using [TNTWEN](https://github.com/TNTWEN/OpenVINO-YOLOV4) project, and to `yolov4.tflite` [TensorFlow-lite](https://www.tensorflow.org/lite/guide/get_started#2_convert_the_model_format)
+- **OpenCV** the fastest implementation of YOLOv4 for CPU (x86/ARM-Android), OpenCV can be compiled with [OpenVINO-backend](https://github.com/opencv/opencv/wiki/Intel's-Deep-Learning-Inference-Engine-backend) for running on (Myriad X / USB Neural Compute Stick / Arria FPGA), use `yolov4.weights`/`cfg` with: [C++ example](https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.cpp#L192-L221) or [Python example](https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.py#L129-L150)
+- **Intel OpenVINO 2021.2:** supports YOLOv4 (NPU Myriad X / USB Neural Compute Stick / Arria FPGA): https://devmesh.intel.com/projects/openvino-yolov4-49c756 read this [manual](https://github.com/TNTWEN/OpenVINO-YOLOV4) (old [manual](https://software.intel.com/en-us/articles/OpenVINO-Using-TensorFlow#converting-a-darknet-yolo-model) ) (for [Scaled-YOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-large) models use https://github.com/Chen-MingChang/pytorch_YOLO_OpenVINO_demo )
+- **PyTorch > ONNX**:
+  - [WongKinYiu/PyTorch_YOLOv4](https://github.com/WongKinYiu/PyTorch_YOLOv4)
+  - [maudzung/3D-YOLOv4](https://github.com/maudzung/Complex-YOLOv4-Pytorch)
+  - [Tianxiaomo/pytorch-YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4)
+  - [YOLOv5](https://github.com/ultralytics/yolov5)
+- **ONNX** on Jetson for YOLOv4: https://developer.nvidia.com/blog/announcing-onnx-runtime-for-jetson/ and https://github.com/ttanzhiqiang/onnx_tensorrt_project
+- **nVidia Transfer Learning Toolkit (TLT>=3.0)** Training and Detection https://docs.nvidia.com/metropolis/TLT/tlt-user-guide/text/object_detection/yolo_v4.html
+- **TensorRT+tkDNN**: https://github.com/ceccocats/tkDNN#fps-results
+- **Deepstream 5.0 / TensorRT for YOLOv4** https://github.com/NVIDIA-AI-IOT/yolov4_deepstream or https://github.com/marcoslucianops/DeepStream-Yolo read [Yolo is natively supported in DeepStream 4.0](https://news.developer.nvidia.com/deepstream-sdk-4-now-available/) and [PDF](https://docs.nvidia.com/metropolis/deepstream/Custom_YOLO_Model_in_the_DeepStream_YOLO_App.pdf). Additionally [jkjung-avt/tensorrt_demos](https://github.com/jkjung-avt/tensorrt_demos) or [wang-xinyu/tensorrtx](https://github.com/wang-xinyu/tensorrtx)
+- **Triton Inference Server / TensorRT** https://github.com/isarsoft/yolov4-triton-tensorrt
+- **DirectML** https://github.com/microsoft/DirectML/tree/master/Samples/yolov4
+- **OpenCL** (Intel, AMD, Mali GPUs for macOS & GNU/Linux) https://github.com/sowson/darknet
+- **HIP** for Training and Detection on AMD GPU https://github.com/os-hackathon/darknet
+- **ROS** (Robot Operating System) https://github.com/engcang/ros-yolo-sort
+- **Xilinx Zynq Ultrascale+ Deep Learning Processor (DPU) ZCU102/ZCU104:** https://github.com/Xilinx/Vitis-In-Depth-Tutorial/tree/master/Machine_Learning/Design_Tutorials/07-yolov4-tutorial
+- **Amazon Neurochip / Amazon EC2 Inf1 instances** 1.85 times higher throughput and 37% lower cost per image for TensorFlow based YOLOv4 model, using Keras [URL](https://aws.amazon.com/ru/blogs/machine-learning/improving-performance-for-deep-learning-based-object-detection-with-an-aws-neuron-compiled-yolov4-model-on-aws-inferentia/)
+- **TVM** - compilation of deep learning models (Keras, MXNet, PyTorch, Tensorflow, CoreML, DarkNet) into minimum deployable modules on diverse hardware backend (CPUs, GPUs, FPGA, and specialized accelerators): https://tvm.ai/about
+- **Tencent/ncnn:** the fastest inference of YOLOv4 on mobile phone CPU: https://github.com/Tencent/ncnn
+- **OpenDataCam** - It detects, tracks and counts moving objects by using YOLOv4: https://github.com/opendatacam/opendatacam#-hardware-pre-requisite
+- **Netron** - Visualizer for neural networks: https://github.com/lutzroeder/netron
+
+#### Datasets
+
+- MS COCO: use `./scripts/get_coco_dataset.sh` to get labeled MS COCO detection dataset
+- OpenImages: use `python ./scripts/get_openimages_dataset.py` for labeling train detection dataset
+- Pascal VOC: use `python ./scripts/voc_label.py` for labeling Train/Test/Val detection datasets
+- ILSVRC2012 (ImageNet classification): use `./scripts/get_imagenet_train.sh` (also `imagenet_label.sh` for labeling valid set)
+- German/Belgium/Russian/LISA/MASTIF Traffic Sign Datasets for Detection - use this parser: https://github.com/angeligareta/Datasets2Darknet#detection-task
+- List of other datasets: https://github.com/AlexeyAB/darknet/tree/master/scripts#datasets
+
+### Improvements in this repository
+
+- developed State-of-the-Art object detector YOLOv4
+- added State-of-the-Art models: CSP, PRN, EfficientNet
+- added layers: [conv_lstm], [scale_channels] SE/ASFF/BiFPN, [local_avgpool], [sam], [Gaussian_yolo], [reorg3d] (fixed [reorg]), fixed [batchnorm]
+- added the ability for training recurrent models (with layers conv-lstm`[conv_lstm]`/conv-rnn`[crnn]`) for accurate detection on video
+- added data augmentation: `[net] mixup=1 cutmix=1 mosaic=1 blur=1`. Added activations: SWISH, MISH, NORM_CHAN, NORM_CHAN_SOFTMAX
+- added the ability for training with GPU-processing using CPU-RAM to increase the mini_batch_size and increase accuracy (instead of batch-norm sync)
+- improved binary neural network performance **2x-4x times** for Detection on CPU and GPU if you trained your own weights by using this XNOR-net model (bit-1 inference) : https://github.com/AlexeyAB/darknet/blob/master/cfg/yolov3-tiny_xnor.cfg
+- improved neural network performance **~7%** by fusing 2 layers into 1: Convolutional + Batch-norm
+- improved performance: Detection **2x times**, on GPU Volta/Turing (Tesla V100, GeForce RTX, ...) using Tensor Cores if `CUDNN_HALF` defined in the `Makefile` or `darknet.sln`
+- improved performance **~1.2x** times on FullHD, **~2x** times on 4K, for detection on the video (file/stream) using `darknet detector demo`...
+- improved performance **3.5 X times** of data augmentation for training (using OpenCV SSE/AVX functions instead of hand-written functions) - removes bottleneck for training on multi-GPU or GPU Volta
+- improved performance of detection and training on Intel CPU with AVX (Yolo v3 **~85%**)
+- optimized memory allocation during network resizing when `random=1`
+- optimized GPU initialization for detection - we use batch=1 initially instead of re-init with batch=1
+- added correct calculation of **mAP, F1, IoU, Precision-Recall** using command `darknet detector map`...
+- added drawing of chart of average-Loss and accuracy-mAP (`-map` flag) during training
+- run `./darknet detector demo ... -json_port 8070 -mjpeg_port 8090` as a JSON and MJPEG server to get results online over the network using your own software or a web browser
+- added calculation of anchors for training
+- added example of Detection and Tracking objects: https://github.com/AlexeyAB/darknet/blob/master/src/yolo_console_dll.cpp
+- run-time tips and warnings if you use incorrect cfg-file or dataset
+- added support for Windows
+- many other fixes of code...
+
+And added manual - [How to train Yolo v4-v2 (to detect your custom objects)](#how-to-train-to-detect-your-custom-objects)
+
+Also, you might be interested in using a simplified repository that implements INT8 quantization (+30% speedup, -1% mAP): https://github.com/AlexeyAB/yolo2_light
+
+#### How to use on the command line
+
+If you use `build.ps1` script or the makefile (Linux only) you will find `darknet` in the root directory.
+
+If you use the deprecated Visual Studio solutions, you will find `darknet` in the directory `\build\darknet\x64`.
+
+If you customize build with CMake GUI, darknet executable will be installed in your preferred folder.
+
+- Yolo v4 COCO - **image**: `./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights -thresh 0.25`
+- **Output coordinates** of objects: `./darknet detector test cfg/coco.data yolov4.cfg yolov4.weights -ext_output dog.jpg`
+- Yolo v4 COCO - **video**: `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights -ext_output test.mp4`
+- Yolo v4 COCO - **WebCam 0**: `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights -c 0`
+- Yolo v4 COCO for **net-videocam** - Smart WebCam: `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights http://192.168.0.80:8080/video?dummy=param.mjpg`
+- Yolo v4 - **save result videofile res.avi**: `./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights test.mp4 -out_filename res.avi`
+- Yolo v3 **Tiny** COCO - video: `./darknet detector demo cfg/coco.data cfg/yolov3-tiny.cfg yolov3-tiny.weights test.mp4`
+- **JSON and MJPEG server** that allows multiple connections from your own software or a web browser at `ip-address:8070` and `ip-address:8090`: `./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights test50.mp4 -json_port 8070 -mjpeg_port 8090 -ext_output`
+- Yolo v3 Tiny **on GPU #1**: `./darknet detector demo cfg/coco.data cfg/yolov3-tiny.cfg yolov3-tiny.weights -i 1 test.mp4`
+- Alternative method Yolo v4 COCO - image: `./darknet detect cfg/yolov4.cfg yolov4.weights -i 0 -thresh 0.25`
+- Train on **Amazon EC2**, to see mAP & Loss-chart using URL like: `http://ec2-35-160-228-91.us-west-2.compute.amazonaws.com:8090` in the Chrome/Firefox (**Darknet should be compiled with OpenCV**):
+    `./darknet detector train cfg/coco.data yolov4.cfg yolov4.conv.137 -dont_show -mjpeg_port 8090 -map`
+- 186 MB Yolo9000 - image: `./darknet detector test cfg/combine9k.data cfg/yolo9000.cfg yolo9000.weights`
+- Remember to put data/9k.tree and data/coco9k.map under the same folder of your app if you use the cpp api to build an app
+- To process a list of images `data/train.txt` and save results of detection to `result.json` file use:
+    `./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights -ext_output -dont_show -out result.json < data/train.txt`
+- To process a list of images `data/train.txt` and save results of detection to `result.txt` use:
+    `./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights -dont_show -ext_output < data/train.txt > result.txt`
+- To process a video and output results to a json file use: `darknet.exe detector demo cfg/coco.data cfg/yolov3.cfg yolov3.weights file.mp4 -dont_show -json_file_output results.json`
+- Pseudo-labelling - to process a list of images `data/new_train.txt` and save results of detection in Yolo training format for each image as label `<image_name>.txt` (in this way you can increase the amount of training data) use:
+    `./darknet detector test cfg/coco.data cfg/yolov4.cfg yolov4.weights -thresh 0.25 -dont_show -save_labels < data/new_train.txt`
+- To calculate anchors: `./darknet detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416`
+- To check accuracy mAP@IoU=50: `./darknet detector map data/obj.data yolo-obj.cfg backup\yolo-obj_7000.weights`
+- To check accuracy mAP@IoU=75: `./darknet detector map data/obj.data yolo-obj.cfg backup\yolo-obj_7000.weights -iou_thresh 0.75`
+
+##### For using network video-camera mjpeg-stream with any Android smartphone
+
+1. Download for Android phone mjpeg-stream soft: IP Webcam / Smart WebCam
+
+    - Smart WebCam - preferably: https://play.google.com/store/apps/details?id=com.acontech.android.SmartWebCam2
+    - IP Webcam: https://play.google.com/store/apps/details?id=com.pas.webcam
+
+2. Connect your Android phone to the computer by WiFi (through a WiFi-router) or USB
+3. Start Smart WebCam on your phone
+4. Replace the address below, shown in the phone application (Smart WebCam) and launch:
+
+- Yolo v4 COCO-model: `./darknet detector demo data/coco.data yolov4.cfg yolov4.weights http://192.168.0.80:8080/video?dummy=param.mjpg -i 0`
+
+### How to compile on Linux/macOS (using `CMake`)
+
+The `CMakeLists.txt` will attempt to find installed optional dependencies like CUDA, cudnn, ZED and build against those. It will also create a shared object library file to use `darknet` for code development.
+
+To update CMake on Ubuntu, it's better to follow guide here: https://apt.kitware.com/ or https://cmake.org/download/
+
+```bash
+git clone https://github.com/AlexeyAB/darknet
+cd darknet
+mkdir build_release
+cd build_release
+cmake ..
+cmake --build . --target install --parallel 8
+```
+
+### Using also PowerShell
+
+Install: `Cmake`, `CUDA`, `cuDNN` [How to install dependencies](#requirements)
+
+Install powershell for your OS (Linux or MacOS) ([guide here](https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell)).
+
+Open PowerShell and type these commands:
+
+```PowerShell
+git clone https://github.com/AlexeyAB/darknet
+cd darknet
+./build.ps1 -UseVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN
+```
+
+- remove options like `-EnableCUDA` or `-EnableCUDNN` if you are not interested in them
+- remove option `-UseVCPKG` if you plan to manually provide OpenCV library to darknet or if you do not want to enable OpenCV integration
+- add option `-EnableOPENCV_CUDA` if you want to build OpenCV with CUDA support - very slow to build! (requires `-UseVCPKG`)
+
+If you open the `build.ps1` script at the beginning you will find all available switches.
+
+### How to compile on Linux (using `make`)
+
+Just do `make` in the darknet directory. (You can try to compile and run it on Google Colab in cloud [link](https://colab.research.google.com/drive/12QusaaRj_lUwCGDvQNfICpa7kA7_a2dE) (press «Open in Playground» button at the top-left corner) and watch the video [link](https://www.youtube.com/watch?v=mKAEGSxwOAY) )
+Before make, you can set such options in the `Makefile`: [link](https://github.com/AlexeyAB/darknet/blob/9c1b9a2cf6363546c152251be578a21f3c3caec6/Makefile#L1)
+
+- `GPU=1` to build with CUDA to accelerate by using GPU (CUDA should be in `/usr/local/cuda`)
+- `CUDNN=1` to build with cuDNN v5-v7 to accelerate training by using GPU (cuDNN should be in `/usr/local/cudnn`)
+- `CUDNN_HALF=1` to build for Tensor Cores (on Titan V / Tesla V100 / DGX-2 and later) speedup Detection 3x, Training 2x
+- `OPENCV=1` to build with OpenCV 4.x/3.x/2.4.x - allows to detect on video files and video streams from network cameras or web-cams
+- `DEBUG=1` to build debug version of Yolo
+- `OPENMP=1` to build with OpenMP support to accelerate Yolo by using multi-core CPU
+- `LIBSO=1` to build a library `darknet.so` and binary runnable file `uselib` that uses this library. Or you can try to run so `LD_LIBRARY_PATH=./:$LD_LIBRARY_PATH ./uselib test.mp4` How to use this SO-library from your own code - you can look at C++ example: https://github.com/AlexeyAB/darknet/blob/master/src/yolo_console_dll.cpp
+    or use in such a way: `LD_LIBRARY_PATH=./:$LD_LIBRARY_PATH ./uselib data/coco.names cfg/yolov4.cfg yolov4.weights test.mp4`
+- `ZED_CAMERA=1` to build a library with ZED-3D-camera support (should be ZED SDK installed), then run
+    `LD_LIBRARY_PATH=./:$LD_LIBRARY_PATH ./uselib data/coco.names cfg/yolov4.cfg yolov4.weights zed_camera`
+- You also need to specify for which graphics card the code is generated. This is done by setting `ARCH=`. If you use a newer version than CUDA 11 you further need to edit line 20 from Makefile and remove `-gencode arch=compute_30,code=sm_30 \` as Kepler GPU support was dropped in CUDA 11. You can also drop the general `ARCH=` and just uncomment `ARCH=` for your graphics card.
+
+### How to compile on Windows (using `CMake`)
+
+Requires:
+
+- MSVC: https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community
+- CMake GUI: `Windows win64-x64 Installer` from https://cmake.org/download/
+- Download Darknet zip-archive with the latest commit and uncompress it: [master.zip](https://github.com/AlexeyAB/darknet/archive/master.zip)
+
+In Windows:
+
+- Start (button) -> All programs -> CMake -> CMake (gui) ->
+
+- [look at image](https://habrastorage.org/webt/pz/s1/uu/pzs1uu4heb7vflfcjqn-lxy-aqu.jpeg) In CMake: Enter input path to the darknet Source, and output path to the Binaries -> Configure (button) -> Optional platform for generator: `x64`  -> Finish -> Generate -> Open Project ->
+
+- in MS Visual Studio: Select: x64 and Release -> Build -> Build solution
+
+- find the executable file `darknet.exe` in the output path to the binaries you specified
+
+![x64 and Release](https://habrastorage.org/webt/ay/ty/f-/aytyf-8bufe7q-16yoecommlwys.jpeg)
+
+### How to compile on Windows (using `vcpkg`)
+
+This is the recommended approach to build Darknet on Windows.
+
+1. Install Visual Studio 2017 or 2019. In case you need to download it, please go here: [Visual Studio Community](http://visualstudio.com). Remember to install English language pack, this is mandatory for vcpkg!
+
+2. Install CUDA enabling VS Integration during installation.
+
+3. Open Powershell (Start -> All programs -> Windows Powershell) and type these commands:
+
+```PowerShell
+Set-ExecutionPolicy unrestricted -Scope CurrentUser -Force
+git clone https://github.com/AlexeyAB/darknet
+cd darknet
+.\build.ps1 -UseVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN
+```
+
+(add option `-EnableOPENCV_CUDA` if you want to build OpenCV with CUDA support - very slow to build! - or remove options like `-EnableCUDA` or `-EnableCUDNN` if you are not interested in them). If you open the `build.ps1` script at the beginning you will find all available switches.
+
+## How to train with multi-GPU
+
+1. Train it first on 1 GPU for like 1000 iterations: `darknet.exe detector train cfg/coco.data cfg/yolov4.cfg yolov4.conv.137`
+
+2. Then stop and by using partially-trained model `/backup/yolov4_1000.weights` run training with multigpu (up to 4 GPUs): `darknet.exe detector train cfg/coco.data cfg/yolov4.cfg /backup/yolov4_1000.weights -gpus 0,1,2,3`
+
+If you get a NaN, then for some datasets it is better to decrease the learning rate: for 4 GPUs set `learning_rate = 0.00065` (i.e. learning_rate = 0.00261 / GPUs). In this case also increase `burn_in =` in your cfg-file by 4x, i.e. use `burn_in = 4000` instead of `1000`.
+
+https://groups.google.com/d/msg/darknet/NbJqonJBTSY/Te5PfIpuCAAJ
+
+## How to train (to detect your custom objects)
+
+(to train old Yolo v2 `yolov2-voc.cfg`, `yolov2-tiny-voc.cfg`, `yolo-voc.cfg`, `yolo-voc.2.0.cfg`, ... [click by the link](https://github.com/AlexeyAB/darknet/tree/47c7af1cea5bbdedf1184963355e6418cb8b1b4f#how-to-train-pascal-voc-data))
+
+Training Yolo v4 (and v3):
+
+0. For training `cfg/yolov4-custom.cfg` download the pre-trained weights-file (162 MB): [yolov4.conv.137](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137) (Google drive mirror [yolov4.conv.137](https://drive.google.com/open?id=1JKF-bdIklxOOVy-2Cr5qdvjgGpmGfcbp) )
+1. Create file `yolo-obj.cfg` with the same content as in `yolov4-custom.cfg` (or copy `yolov4-custom.cfg` to `yolo-obj.cfg`) and:
+
+- change line batch to [`batch=64`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L3)
+- change line subdivisions to [`subdivisions=16`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L4)
+- change line max_batches to (`classes*2000`, but not less than number of training images and not less than `6000`), f.e. [`max_batches=6000`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L20) if you train for 3 classes
+- change line steps to 80% and 90% of max_batches, f.e. [`steps=4800,5400`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L22)
+- set network size `width=416 height=416` or any value multiple of 32: https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L8-L9
+- change line `classes=80` to your number of objects in each of 3 `[yolo]`-layers:
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L610
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L696
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L783
+- change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer, keep in mind that it only has to be the last `[convolutional]` before each of the `[yolo]` layers.
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L603
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L689
+  - https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L776
+- when using [`[Gaussian_yolo]`](https://github.com/AlexeyAB/darknet/blob/6e5bdf1282ad6b06ed0e962c3f5be67cf63d96dc/cfg/Gaussian_yolov3_BDD.cfg#L608)  layers, change [`filters=57`] to filters=(classes + 9)x3 in the 3 `[convolutional]` before each `[Gaussian_yolo]` layer
+  - https://github.com/AlexeyAB/darknet/blob/6e5bdf1282ad6b06ed0e962c3f5be67cf63d96dc/cfg/Gaussian_yolov3_BDD.cfg#L604
+  - https://github.com/AlexeyAB/darknet/blob/6e5bdf1282ad6b06ed0e962c3f5be67cf63d96dc/cfg/Gaussian_yolov3_BDD.cfg#L696
+  - https://github.com/AlexeyAB/darknet/blob/6e5bdf1282ad6b06ed0e962c3f5be67cf63d96dc/cfg/Gaussian_yolov3_BDD.cfg#L789
+
+So if `classes=1` then should be `filters=18`. If `classes=2` then write `filters=21`.
+**(Do not write in the cfg-file: filters=(classes + 5)x3)**
+
+(Generally `filters` depends on the `classes`, `coords` and number of `mask`s, i.e. filters=`(classes + coords + 1)*<number of mask>`, where `mask` is indices of anchors. If `mask` is absent, then filters=`(classes + coords + 1)*num`)
+
+So for example, for 2 objects, your file `yolo-obj.cfg` should differ from `yolov4-custom.cfg` in such lines in each of **3** [yolo]-layers:
+
+```ini
+[convolutional]
+filters=21
+
+[yolo]
+classes=2
+```
+
+2. Create file `obj.names` in the directory `build\darknet\x64\data\`, with objects names - each in new line
+3. Create file `obj.data` in the directory `build\darknet\x64\data\`, containing (where **classes = number of objects**):
+
+  ```ini
+  classes = 2
+  train  = data/train.txt
+  valid  = data/test.txt
+  names = data/obj.names
+  backup = backup/
+  ```
+
+4. Put image-files (.jpg) of your objects in the directory `build\darknet\x64\data\obj\`
+5. You should label each object on images from your dataset. Use this visual GUI-software for marking bounded boxes of objects and generating annotation files for Yolo v2 & v3: https://github.com/AlexeyAB/Yolo_mark
+
+It will create `.txt`-file for each `.jpg`-image-file - in the same directory and with the same name, but with `.txt`-extension, and put to file: object number and object coordinates on this image, for each object in new line:
+
+`<object-class> <x_center> <y_center> <width> <height>`
+
+  Where:
+
+- `<object-class>` - integer object number from `0` to `(classes-1)`
+- `<x_center> <y_center> <width> <height>` - float values **relative** to width and height of image, each in the range `(0.0, 1.0]`
+- for example: `<x> = <absolute_x> / <image_width>` or `<height> = <absolute_height> / <image_height>`
+- attention: `<x_center> <y_center>` - are center of rectangle (are not top-left corner)
+
+  For example, for `img1.jpg` a file `img1.txt` will be created containing:
+
+  ```csv
+  1 0.716797 0.395833 0.216406 0.147222
+  0 0.687109 0.379167 0.255469 0.158333
+  1 0.420312 0.395833 0.140625 0.166667
+  ```
+
+6. Create file `train.txt` in directory `build\darknet\x64\data\`, with filenames of your images, each filename in new line, with path relative to `darknet.exe`, for example containing:
+
+  ```csv
+  data/obj/img1.jpg
+  data/obj/img2.jpg
+  data/obj/img3.jpg
+  ```
+
+7. Download pre-trained weights for the convolutional layers and put to the directory `build\darknet\x64`
+    - for `yolov4.cfg`, `yolov4-custom.cfg` (162 MB): [yolov4.conv.137](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137) (Google drive mirror [yolov4.conv.137](https://drive.google.com/open?id=1JKF-bdIklxOOVy-2Cr5qdvjgGpmGfcbp) )
+    - for `yolov4-tiny.cfg`, `yolov4-tiny-3l.cfg`, `yolov4-tiny-custom.cfg` (19 MB): [yolov4-tiny.conv.29](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.conv.29)  
+    - for `csresnext50-panet-spp.cfg` (133 MB): [csresnext50-panet-spp.conv.112](https://drive.google.com/file/d/16yMYCLQTY_oDlCIZPfn_sab6KD3zgzGq/view?usp=sharing)
+    - for `yolov3.cfg, yolov3-spp.cfg` (154 MB): [darknet53.conv.74](https://pjreddie.com/media/files/darknet53.conv.74)
+    - for `yolov3-tiny-prn.cfg , yolov3-tiny.cfg` (6 MB): [yolov3-tiny.conv.11](https://drive.google.com/file/d/18v36esoXCh-PsOKwyP2GWrpYDptDY8Zf/view?usp=sharing)
+    - for `enet-coco.cfg (EfficientNetB0-Yolov3)` (14 MB): [enetb0-coco.conv.132](https://drive.google.com/file/d/1uhh3D6RSn0ekgmsaTcl-ZW53WBaUDo6j/view?usp=sharing)
+
+8. Start training by using the command line: `darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137`
+
+   To train on Linux use command: `./darknet detector train data/obj.data yolo-obj.cfg yolov4.conv.137` (just use `./darknet` instead of `darknet.exe`)
+
+   - (file `yolo-obj_last.weights` will be saved to the `build\darknet\x64\backup\` for each 100 iterations)
+   - (file `yolo-obj_xxxx.weights` will be saved to the `build\darknet\x64\backup\` for each 1000 iterations)
+   - (to disable Loss-Window use `darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137 -dont_show`, if you train on computer without monitor like a cloud Amazon EC2)
+   - (to see the mAP & Loss-chart during training on remote server without GUI, use command `darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137 -dont_show -mjpeg_port 8090 -map` then open URL `http://ip-address:8090` in Chrome/Firefox browser)
+
+8.1. For training with mAP (mean average precisions) calculation for each 4 Epochs (set `valid=valid.txt` or `train.txt` in `obj.data` file) and run: `darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137 -map`
+
+8.2. One can also set the `-mAP_epochs` in the training command if less or more frequent mAP calculation is needed. For example in order to calculate mAP for each 2 Epochs run `darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137 -map -mAP_epochs 2`
+
+9. After training is complete - get result `yolo-obj_final.weights` from path `build\darknet\x64\backup\`
+
+   - After each 100 iterations you can stop and later start training from this point. For example, after 2000 iterations you can stop training, and later just start training using: `darknet.exe detector train data/obj.data yolo-obj.cfg backup\yolo-obj_2000.weights`
+
+    (in the original repository https://github.com/pjreddie/darknet the weights-file is saved only once every 10 000 iterations `if(iterations > 1000)`)
+
+   - Also you can get result earlier than all 45000 iterations.
+
+ **Note:** If during training you see `nan` values for `avg` (loss) field - then training goes wrong, but if `nan` is in some other lines - then training goes well.
+
+ **Note:** If you changed width= or height= in your cfg-file, then new width and height must be divisible by 32.
+
+ **Note:** After training use such command for detection: `darknet.exe detector test data/obj.data yolo-obj.cfg yolo-obj_8000.weights`
+
+  **Note:** if error `Out of memory` occurs then in `.cfg`-file you should increase `subdivisions=16`, 32 or 64: [link](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L4)
+
+### How to train tiny-yolo (to detect your custom objects)
+
+Do all the same steps as for the full yolo model as described above. With the exception of:
+
+- Download file with the first 29-convolutional layers of yolov4-tiny: https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.conv.29
+ (Or get this file from yolov4-tiny.weights file by using command: `darknet.exe partial cfg/yolov4-tiny-custom.cfg yolov4-tiny.weights yolov4-tiny.conv.29 29`)
+- Make your custom model `yolov4-tiny-obj.cfg` based on `cfg/yolov4-tiny-custom.cfg` instead of `yolov4.cfg`
+- Start training: `darknet.exe detector train data/obj.data yolov4-tiny-obj.cfg yolov4-tiny.conv.29`
+
+For training Yolo based on other models ([DenseNet201-Yolo](https://github.com/AlexeyAB/darknet/blob/master/build/darknet/x64/densenet201_yolo.cfg) or [ResNet50-Yolo](https://github.com/AlexeyAB/darknet/blob/master/build/darknet/x64/resnet50_yolo.cfg)), you can download and get pre-trained weights as showed in this file: https://github.com/AlexeyAB/darknet/blob/master/build/darknet/x64/partial.cmd
+If you made your custom model that isn't based on other models, then you can train it without pre-trained weights; random initial weights will be used.
+
+## When should I stop training
+
+Usually 2000 iterations for each class (object) are sufficient, but not less than the number of training images and not less than 6000 iterations in total. But for a more precise definition of when you should stop training, use the following manual:
+
+1. During training, you will see varying indicators of error, and you should stop when **0.XXXXXXX avg** no longer decreases:
+
+  > Region Avg IOU: 0.798363, Class: 0.893232, Obj: 0.700808, No Obj: 0.004567, Avg Recall: 1.000000,  count: 8
+  > Region Avg IOU: 0.800677, Class: 0.892181, Obj: 0.701590, No Obj: 0.004574, Avg Recall: 1.000000,  count: 8
+  >
+  > **9002**: 0.211667, **0.60730 avg**, 0.001000 rate, 3.868000 seconds, 576128 images
+  > Loaded: 0.000000 seconds
+
+- **9002** - iteration number (number of batch)
+- **0.60730 avg** - average loss (error) - **the lower, the better**
+
+  When you see that average loss **0.xxxxxx avg** no longer decreases at many iterations then you should stop training. The final average loss can be from `0.05` (for a small model and easy dataset) to `3.0` (for a big model and a difficult dataset).
+  
+  Or if you train with flag `-map` then you will see mAP indicator `Last accuracy mAP@0.5 = 18.50%` in the console - this indicator is better than Loss, so train while mAP increases.
+
+2. Once training is stopped, you should take some of last `.weights`-files from `darknet\build\darknet\x64\backup` and choose the best of them:
+
+For example, you stopped training after 9000 iterations, but the best result may be given by one of the previous weights (7000, 8000, 9000). This can happen due to over-fitting. **Over-fitting** is the case when you can detect objects on images from the training dataset, but can't detect objects on any other images. You should get weights from the **Early Stopping Point**:
+
+![Over-fitting](https://hsto.org/files/5dc/7ae/7fa/5dc7ae7fad9d4e3eb3a484c58bfc1ff5.png)
+
+To get weights from Early Stopping Point:
+
+  2.1. At first, in your file `obj.data` you must specify the path to the validation dataset `valid = valid.txt` (format of `valid.txt` as in `train.txt`), and if you don't have validation images, just copy `data\train.txt` to `data\valid.txt`.
+
+  2.2 If training is stopped after 9000 iterations, to validate some of previous weights use these commands:
+
+(If you use another GitHub repository, then use `darknet.exe detector recall`... instead of `darknet.exe detector map`...)
+
+- `darknet.exe detector map data/obj.data yolo-obj.cfg backup\yolo-obj_7000.weights`
+- `darknet.exe detector map data/obj.data yolo-obj.cfg backup\yolo-obj_8000.weights`
+- `darknet.exe detector map data/obj.data yolo-obj.cfg backup\yolo-obj_9000.weights`
+
+And compare last output lines for each weights (7000, 8000, 9000):
+
+Choose weights-file **with the highest mAP (mean average precision)** or IoU (intersect over union)
+
+For example, if the weights `yolo-obj_8000.weights` give the **biggest mAP** - then **use these weights for detection**.
+
+Or just train with `-map` flag:
+
+`darknet.exe detector train data/obj.data yolo-obj.cfg yolov4.conv.137 -map`
+
+So you will see mAP-chart (red-line) in the Loss-chart Window. mAP will be calculated for each 4 Epochs using `valid=valid.txt` file that is specified in `obj.data` file (`1 Epoch = images_in_train_txt / batch` iterations)
+
+(to change the max x-axis value - change [`max_batches=`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L20) parameter to `2000*classes`, f.e. `max_batches=6000` for 3 classes)
+
+![loss_chart_map_chart](https://hsto.org/webt/yd/vl/ag/ydvlagutof2zcnjodstgroen8ac.jpeg)
+
+Example of custom object detection: `darknet.exe detector test data/obj.data yolo-obj.cfg yolo-obj_8000.weights`
+
+- **IoU** (intersect over union) - average intersect over union of objects and detections for a certain threshold = 0.24
+
+- **mAP** (mean average precision) - mean value of `average precisions` for each class, where `average precision` is average value of 11 points on PR-curve for each possible threshold (each probability of detection) for the same class (Precision-Recall in terms of PascalVOC, where Precision=TP/(TP+FP) and Recall=TP/(TP+FN) ), page-11: http://homepages.inf.ed.ac.uk/ckiw/postscript/ijcv_voc09.pdf
+
+**mAP** is default metric of precision in the PascalVOC competition, **this is the same as AP50** metric in the MS COCO competition.
+In terms of Wiki, indicators Precision and Recall have a slightly different meaning than in the PascalVOC competition, but **IoU always has the same meaning**.
+
+![precision_recall_iou](https://hsto.org/files/ca8/866/d76/ca8866d76fb840228940dbf442a7f06a.jpg)
+
+### Custom object detection
+
+Example of custom object detection: `darknet.exe detector test data/obj.data yolo-obj.cfg yolo-obj_8000.weights`
+
+| ![Yolo_v2_training](https://hsto.org/files/d12/1e7/515/d121e7515f6a4eb694913f10de5f2b61.jpg) | ![Yolo_v2_training](https://hsto.org/files/727/c7e/5e9/727c7e5e99bf4d4aa34027bb6a5e4bab.jpg) |
+|---|---|
+
+## How to improve object detection
+
+1. Before training:
+
+- set flag `random=1` in your `.cfg`-file - it will increase precision by training Yolo for different resolutions: [link](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L788)
+
+- increase network resolution in your `.cfg`-file (`height=608`, `width=608` or any value multiple of 32) - it will increase precision
+
+- check that each object that you want to detect is labeled in your dataset - no object in your dataset should be left without a label. In most training issues there are wrong labels in the dataset (labels obtained by using some conversion script, marked with a third-party tool, ...). Always check your dataset by using: https://github.com/AlexeyAB/Yolo_mark
+
+- my Loss is very high and mAP is very low, is training wrong? Run training with `-show_imgs` flag at the end of training command, do you see correct bounded boxes of objects (in windows or in files `aug_...jpg`)? If no - your training dataset is wrong.
+
+- for each object which you want to detect - there must be at least 1 similar object in the Training dataset with about the same: shape, side of object, relative size, angle of rotation, tilt, illumination. So desirable that your training dataset include images with objects at different: scales, rotations, lightings, from different sides, on different backgrounds - you should preferably have 2000 different images for each class or more, and you should train `2000*classes` iterations or more
+
+- desirable that your training dataset include images with non-labeled objects that you do not want to detect - negative samples without bounded box (empty `.txt` files) - use as many images of negative samples as there are images with objects
+
+- What is the best way to mark objects: label only the visible part of the object, or label the visible and overlapped part of the object, or label a little more than the entire object (with a little gap)? Mark as you like - how would you like it to be detected.
+
+- for training with a large number of objects in each image, add the parameter `max=200` or higher value in the last `[yolo]`-layer or `[region]`-layer in your cfg-file (the global maximum number of objects that can be detected by YoloV3 is `0.0615234375*(width*height)` where width and height are parameters from `[net]` section in cfg-file)
+  
+- for training for small objects (smaller than 16x16 after the image is resized to 416x416) - set `layers = 23` instead of https://github.com/AlexeyAB/darknet/blob/6f718c257815a984253346bba8fb7aa756c55090/cfg/yolov4.cfg#L895
+  - set `stride=4` instead of https://github.com/AlexeyAB/darknet/blob/6f718c257815a984253346bba8fb7aa756c55090/cfg/yolov4.cfg#L892
+  - set `stride=4` instead of https://github.com/AlexeyAB/darknet/blob/6f718c257815a984253346bba8fb7aa756c55090/cfg/yolov4.cfg#L989
+  
+- for training for both small and large objects use modified models:
+  - Full-model: 5 yolo layers: https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3_5l.cfg
+  - Tiny-model: 3 yolo layers: https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg
+  - YOLOv4: 3 yolo layers: https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-custom.cfg
+  
+- If you train the model to distinguish Left and Right objects as separate classes (left/right hand, left/right-turn on road signs, ...) then for disabling flip data augmentation - add `flip=0` here: https://github.com/AlexeyAB/darknet/blob/3d2d0a7c98dbc8923d9ff705b81ff4f7940ea6ff/cfg/yolov3.cfg#L17
+  
+- General rule - your training dataset should include such a set of relative sizes of objects that you want to detect:
+  - `train_network_width * train_obj_width / train_image_width ~= detection_network_width * detection_obj_width / detection_image_width`
+  - `train_network_height * train_obj_height / train_image_height ~= detection_network_height * detection_obj_height / detection_image_height`
+
+  I.e. for each object from Test dataset there must be at least 1 object in the Training dataset with the same class_id and about the same relative size:
+
+  `object width in percent from Training dataset` ~= `object width in percent from Test dataset`
+
+  That is, if only objects that occupied 80-90% of the image were present in the training set, then the trained network will not be able to detect objects that occupy 1-10% of the image.
+
+- to speedup training (with decreasing detection accuracy) set param `stopbackward=1` for layer-136 in cfg-file
+
+- each: `model of object, side, illumination, scale, each 30 grad` of the turn and inclination angles - these are *different objects* from an internal perspective of the neural network. So the more *different objects* you want to detect, the more complex network model should be used.
+
+- to make the detected bounded boxes more accurate, you can add 3 parameters `ignore_thresh = .9 iou_normalizer=0.5 iou_loss=giou` to each `[yolo]` layer and train, it will increase mAP@0.9, but decrease mAP@0.5.
+
+- Only if you are an **expert** in neural detection networks - recalculate anchors for your dataset for `width` and `height` from cfg-file:
+`darknet.exe detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416`
+then set the same 9 `anchors` in each of 3 `[yolo]`-layers in your cfg-file. But you should change indexes of anchors `masks=` for each [yolo]-layer, so for YOLOv4 the 1st-[yolo]-layer has anchors smaller than 30x30, 2nd smaller than 60x60, 3rd remaining, and vice versa for YOLOv3. Also you should change the `filters=(classes + 5)*<number of mask>` before each [yolo]-layer. If many of the calculated anchors do not fit under the appropriate layers - then just try using all the default anchors.
+
+2. After training - for detection:
+
+- Increase network-resolution by setting in your `.cfg`-file (`height=608` and `width=608`) or (`height=832` and `width=832`) or (any value multiple of 32) - this increases the precision and makes it possible to detect small objects: [link](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L8-L9)
+
+- it is not necessary to train the network again, just use `.weights`-file already trained for 416x416 resolution
+
+- to get even greater accuracy you should train with higher resolution 608x608 or 832x832, note: if error `Out of memory` occurs then in `.cfg`-file you should increase `subdivisions=16`, 32 or 64: [link](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L4)
+
+## How to mark bounded boxes of objects and create annotation files
+
+Here you can find repository with GUI-software for marking bounded boxes of objects and generating annotation files for Yolo v2 - v4: https://github.com/AlexeyAB/Yolo_mark
+
+With example of: `train.txt`, `obj.names`, `obj.data`, `yolo-obj.cfg`, `air`1-6`.txt`, `bird`1-4`.txt` for 2 classes of objects (air, bird) and `train_obj.cmd` with example how to train this image-set with Yolo v2 - v4
+
+Different tools for marking objects in images:
+
+1. in C++: https://github.com/AlexeyAB/Yolo_mark
+2. in Python: https://github.com/tzutalin/labelImg
+3. in Python: https://github.com/Cartucho/OpenLabeling
+4. in C++: https://www.ccoderun.ca/darkmark/
+5. in JavaScript: https://github.com/opencv/cvat
+6. in C++: https://github.com/jveitchmichaelis/deeplabel
+7. in C#: https://github.com/BMW-InnovationLab/BMW-Labeltool-Lite
+8. DL-Annotator for Windows ($30): [url](https://www.microsoft.com/en-us/p/dlannotator/9nsx79m7t8fn?activetab=pivot:overviewtab)
+9. v7labs - the greatest cloud labeling tool ($1.5 per hour): https://www.v7labs.com/
+
+## How to use Yolo as DLL and SO libraries
+
+- on Linux
+  - using `build.sh` or
+  - build `darknet` using `cmake` or
+  - set `LIBSO=1` in the `Makefile` and do `make`
+- on Windows
+  - using `build.ps1` or
+  - build `darknet` using `cmake` or
+  - compile `build\darknet\yolo_cpp_dll.sln` solution or `build\darknet\yolo_cpp_dll_no_gpu.sln` solution
+
+There are 2 APIs:
+
+- C API: https://github.com/AlexeyAB/darknet/blob/master/include/darknet.h
+  - Python examples using the C API:
+    - https://github.com/AlexeyAB/darknet/blob/master/darknet.py
+    - https://github.com/AlexeyAB/darknet/blob/master/darknet_video.py
+
+- C++ API: https://github.com/AlexeyAB/darknet/blob/master/include/yolo_v2_class.hpp
+  - C++ example that uses C++ API: https://github.com/AlexeyAB/darknet/blob/master/src/yolo_console_dll.cpp
+
+----
+
+1. To compile Yolo as a C++ DLL-file `yolo_cpp_dll.dll` - open the solution `build\darknet\yolo_cpp_dll.sln`, set **x64** and **Release**, and then run: Build -> Build yolo_cpp_dll
+    - You should have installed **CUDA 10.2**
+    - To use cuDNN do: (right click on project) -> properties -> C/C++ -> Preprocessor -> Preprocessor Definitions, and add at the beginning of line: `CUDNN;`
+
+2. To use Yolo as a DLL-file in your C++ console application - open the solution `build\darknet\yolo_console_dll.sln`, set **x64** and **Release**, and then run: Build -> Build yolo_console_dll
+
+    - you can run your console application from Windows Explorer `build\darknet\x64\yolo_console_dll.exe`
+    **use this command**: `yolo_console_dll.exe data/coco.names yolov4.cfg yolov4.weights test.mp4`
+
+    - after launching your console application and entering the image file name - you will see info for each object:
+    `<obj_id> <left_x> <top_y> <width> <height> <probability>`
+    - to use simple OpenCV-GUI you should uncomment line `//#define OPENCV` in `yolo_console_dll.cpp`-file: [link](https://github.com/AlexeyAB/darknet/blob/a6cbaeecde40f91ddc3ea09aa26a03ab5bbf8ba8/src/yolo_console_dll.cpp#L5)
+    - you can see source code of simple example for detection on the video file: [link](https://github.com/AlexeyAB/darknet/blob/ab1c5f9e57b4175f29a6ef39e7e68987d3e98704/src/yolo_console_dll.cpp#L75)
+
+`yolo_cpp_dll.dll`-API: [link](https://github.com/AlexeyAB/darknet/blob/master/src/yolo_v2_class.hpp#L42)
+
+```cpp
+struct bbox_t {
+    unsigned int x, y, w, h;    // (x,y) - top-left corner, (w, h) - width & height of bounded box
+    float prob;                    // confidence - probability that the object was found correctly
+    unsigned int obj_id;        // class of object - from range [0, classes-1]
+    unsigned int track_id;        // tracking id for video (0 - untracked, 1 - inf - tracked object)
+    unsigned int frames_counter;// counter of frames on which the object was detected
+};
+
+class Detector {
+public:
+        Detector(std::string cfg_filename, std::string weight_filename, int gpu_id = 0);
+        ~Detector();
+
+        std::vector<bbox_t> detect(std::string image_filename, float thresh = 0.2, bool use_mean = false);
+        std::vector<bbox_t> detect(image_t img, float thresh = 0.2, bool use_mean = false);
+        static image_t load_image(std::string image_filename);
+        static void free_image(image_t m);
+
+#ifdef OPENCV
+        std::vector<bbox_t> detect(cv::Mat mat, float thresh = 0.2, bool use_mean = false);
+        std::shared_ptr<image_t> mat_to_image_resize(cv::Mat mat) const;
+#endif
+};
+```
+
+## Citation
+
+```
+@misc{bochkovskiy2020yolov4,
+      title={YOLOv4: Optimal Speed and Accuracy of Object Detection}, 
+      author={Alexey Bochkovskiy and Chien-Yao Wang and Hong-Yuan Mark Liao},
+      year={2020},
+      eprint={2004.10934},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
+
+```
+@InProceedings{Wang_2021_CVPR,
+    author    = {Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark},
+    title     = {{Scaled-YOLOv4}: Scaling Cross Stage Partial Network},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month     = {June},
+    year      = {2021},
+    pages     = {13029-13038}
+}
+```
diff --git a/darknet-master/build.ps1 b/darknet-master/build.ps1
new file mode 100644
index 0000000..e378e4b
--- /dev/null
+++ b/darknet-master/build.ps1
@@ -0,0 +1,1145 @@
+#!/usr/bin/env pwsh
+
+<#
+
+.SYNOPSIS
+        build
+        Created By: Stefano Sinigardi
+        Created Date: February 18, 2019
+        Last Modified Date: September 25, 2023
+
+.DESCRIPTION
+Build tool using CMake, trying to properly setup the environment around compiler
+
+.PARAMETER DisableInteractive
+Disable script interactivity (useful for CI runs)
+
+.PARAMETER BuildDebug
+Build using the debug toolchain
+
+.PARAMETER DisableDLLcopy
+Disable automatic DLL deployment through vcpkg at the end
+
+.PARAMETER EnableCUDA
+Enable CUDA feature
+
+.PARAMETER EnableCUDNN
+Enable CUDNN feature
+
+.PARAMETER EnableOPENCV
+Enable OpenCV feature
+
+.PARAMETER ForceOpenCVVersion
+Force a specific OpenCV version
+
+.PARAMETER EnableOPENCV_CUDA
+Use a CUDA-enabled OpenCV build
+
+.PARAMETER UseVCPKG
+Use vcpkg to build tool dependencies. Clone it if not already found on system
+
+.PARAMETER ForceLocalVCPKG
+Use a copy of vcpkg in a subfolder of the tool folder, even if there might be another copy already provided by the system
+
+.PARAMETER InstallDARKNETthroughVCPKG
+Use VCPKG to install darknet thanks to the port integrated in it
+
+.PARAMETER InstallDARKNETdependenciesThroughVCPKGManifest
+Use VCPKG to install darknet dependencies using vcpkg manifest feature
+
+.PARAMETER ForceVCPKGDarknetHEAD
+Install darknet from vcpkg and force it to HEAD version, not latest port release
+
+.PARAMETER DoNotUpdateVCPKG
+Do not update vcpkg before running the build (valid only if vcpkg is cloned by this script or the version found on the system is git-enabled)
+
+.PARAMETER VCPKGSuffix
+Specify a suffix to the vcpkg local folder for searching, useful to point to a custom version
+
+.PARAMETER VCPKGFork
+Specify a fork username to point to a custom version of vcpkg (ex: -VCPKGFork "custom" to point to github.com/custom/vcpkg)
+
+.PARAMETER VCPKGBranch
+Specify a branch to checkout in the vcpkg folder, useful to point to a custom version especially for forked vcpkg versions
+
+.PARAMETER DoNotUpdateTOOL
+Do not update the tool before running the build (valid only if tool is git-enabled)
+
+.PARAMETER DoNotDeleteBuildFolder
+Do not delete temporary cmake build folder at the end of the script
+
+.PARAMETER DoNotSetupVS
+Do not setup VisualStudio environment using the vcvars script
+
+.PARAMETER DoNotUseNinja
+Do not use Ninja for build
+
+.PARAMETER ForceCPP
+Force building darknet using C++ compiler also for plain C code
+
+.PARAMETER ForceStaticLib
+Create library as static instead of the default linking mode of your system
+
+.PARAMETER ForceVCPKGCacheRemoval
+Force clean up of the local vcpkg binary cache before building
+
+.PARAMETER ForceVCPKGBuildtreesRemoval
+Force clean up of vcpkg buildtrees temp folder at the end of the script
+
+.PARAMETER ForceVCPKGBuildtreesPath
+Force using a different buildtrees dir for vcpkg
+
+.PARAMETER ForceVCPKGPackagesRemoval
+Force clean up of vcpkg packages folder at the end of the script
+
+.PARAMETER ForceSetupVS
+Forces Visual Studio setup, also on systems on which it would not have been enabled automatically
+
+.PARAMETER ForceCMakeFromVS
+Forces usage of CMake from Visual Studio instead of the system-wide/user installed one
+
+.PARAMETER ForceNinjaFromVS
+Forces usage of Ninja from Visual Studio instead of the system-wide/user installed one
+
+.PARAMETER EnableCSharpWrapper
+Enables building C# darknet wrapper
+
+.PARAMETER DownloadWeights
+Download pre-trained weight files
+
+.PARAMETER Use32bitTriplet
+Use 32 bit triplet for target build (windows-only)
+
+.PARAMETER BuildInstaller
+Build an installer using CPack
+
+.PARAMETER ForceGCCVersion
+Force a specific GCC version
+
+.PARAMETER NumberOfBuildWorkers
+Forces a specific number of threads for parallel building
+
+.PARAMETER AdditionalBuildSetup
+Additional setup parameters to manually pass to CMake
+
+.EXAMPLE
+./build -DisableInteractive -DoNotDeleteBuildFolder -UseVCPKG
+
+#>
+
+<#
+Copyright (c) Stefano Sinigardi
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+#>
+
+param (
+  # Command-line options of the build script; each one is described in the
+  # comment-based help (.PARAMETER entries) at the top of this file.
+  # All switches default to off, so a plain invocation builds a Release
+  # configuration and asks interactively about vcpkg, CUDA, CUDNN and OpenCV.
+  [switch]$DisableInteractive = $false,
+  [switch]$BuildDebug = $false,
+  [switch]$DisableDLLcopy = $false,
+  [switch]$EnableCUDA = $false,
+  [switch]$EnableCUDNN = $false,
+  [switch]$EnableOPENCV = $false,
+  [Int32]$ForceOpenCVVersion = 0,
+  [switch]$EnableOPENCV_CUDA = $false,
+  # vcpkg-related options
+  [switch]$UseVCPKG = $false,
+  [switch]$ForceLocalVCPKG = $false,
+  [switch]$InstallDARKNETthroughVCPKG = $false,
+  [switch]$InstallDARKNETdependenciesThroughVCPKGManifest = $false,
+  [switch]$ForceVCPKGDarknetHEAD = $false,
+  [switch]$DoNotUpdateVCPKG = $false,
+  [string]$VCPKGSuffix = "",
+  [string]$VCPKGFork = "",
+  [string]$VCPKGBranch = "",
+  # Toolchain / build-system options
+  [switch]$DoNotUpdateTOOL = $false,
+  [switch]$DoNotDeleteBuildFolder = $false,
+  [switch]$DoNotSetupVS = $false,
+  [switch]$DoNotUseNinja = $false,
+  [switch]$ForceCPP = $false,
+  [switch]$ForceStaticLib = $false,
+  [switch]$ForceVCPKGCacheRemoval = $false,
+  [switch]$ForceVCPKGBuildtreesRemoval = $false,
+  [string]$ForceVCPKGBuildtreesPath = "",
+  [switch]$ForceVCPKGPackagesRemoval = $false,
+  [switch]$ForceSetupVS = $false,
+  [switch]$ForceCMakeFromVS = $false,
+  [switch]$ForceNinjaFromVS = $false,
+  [switch]$EnableCSharpWrapper = $false,
+  [switch]$DownloadWeights = $false,
+  [switch]$Use32bitTriplet = $false,
+  [switch]$BuildInstaller = $false,
+  # 0 means "do not force a version" (the script only acts on values > 0)
+  [Int32]$ForceGCCVersion = 0,
+  [Int32]$NumberOfBuildWorkers = 8,
+  # Extra arguments appended verbatim to the CMake configure command line
+  [string]$AdditionalBuildSetup = ""  # "-DCMAKE_CUDA_ARCHITECTURES=30"
+)
+
+# Publish the interactivity flag in the global scope
+# (presumably so that helpers imported from utils.psm1 can read it - confirm).
+$global:DisableInteractive = $DisableInteractive
+
+$build_ps1_version = "3.6.1"
+$script_name = $MyInvocation.MyCommand.Name
+$utils_psm1_avail = $false
+
+# Load the helper module that ships next to this script, if it is present.
+if (Test-Path $PSScriptRoot/scripts/utils.psm1) {
+  Import-Module -Name $PSScriptRoot/scripts/utils.psm1 -Force
+  $utils_psm1_avail = $true
+}
+else {
+  # Fallback values for the variables that are otherwise expected to come
+  # from utils.psm1 (NOTE(review): assumed to be exported by the module - confirm).
+  $utils_psm1_version = "unavail"
+  $IsWindowsPowerShell = $false
+  $IsInGitSubmodule = $false
+}
+
+# Without the helper module the Visual Studio integration cannot be used,
+# so it is switched off regardless of what was requested on the command line.
+if (-Not $utils_psm1_avail) {
+  $DoNotSetupVS = $true
+  $ForceCMakeFromVS = $false
+}
+
+# Stop any transcript that may still be running; errors are silenced only
+# for this one call and the default preference is restored right after.
+$ErrorActionPreference = "SilentlyContinue"
+Stop-Transcript | out-null
+$ErrorActionPreference = "Continue"
+# When the script lives in a git submodule, the parent folder is used as root.
+if($IsInGitSubmodule) {
+  $PSCustomScriptRoot = Split-Path $PSScriptRoot -Parent
+}
+else {
+  $PSCustomScriptRoot = $PSScriptRoot
+}
+$BuildLogPath = "$PSCustomScriptRoot/build.log"
+$ReleaseInstallPrefix = "$PSCustomScriptRoot"
+$DebugInstallPrefix = "$PSCustomScriptRoot/debug"
+$DebugBuildSetup = " -DCMAKE_BUILD_TYPE=Debug "
+$ReleaseBuildSetup = " -DCMAKE_BUILD_TYPE=Release "
+# An explicit install prefix is passed to CMake only when no installer is built.
+if (-Not $BuildInstaller) {
+  $DebugBuildSetup = $DebugBuildSetup + " -DCMAKE_INSTALL_PREFIX=$DebugInstallPrefix "
+  $ReleaseBuildSetup = $ReleaseBuildSetup + " -DCMAKE_INSTALL_PREFIX=$ReleaseInstallPrefix "
+}
+# From here on, console output is also recorded in build.log.
+Start-Transcript -Path $BuildLogPath
+
+Write-Host "Build script version ${build_ps1_version}, utils module version ${utils_psm1_version}"
+if (-Not $utils_psm1_avail) {
+  Write-Host "utils.psm1 is not available, so VS integration is forcefully disabled" -ForegroundColor Yellow
+}
+Write-Host "Working directory: $PSCustomScriptRoot, log file: $BuildLogPath, $script_name is in submodule: $IsInGitSubmodule"
+
+# Interactive questions: each optional feature that was not already enabled on
+# the command line is offered to the user, unless interactivity is disabled.
+# Answers are compared case-insensitively against "yes" / "y".
+$affirmativeAnswers = @('yes', 'y')
+
+if (-Not ($global:DisableInteractive -or $UseVCPKG)) {
+  $Result = Read-Host "Enable vcpkg to install dependencies (yes/no)"
+  if ($affirmativeAnswers -contains $Result) {
+    $UseVCPKG = $true
+  }
+}
+
+# CUDA is never offered on macOS.
+if (-Not ($DisableInteractive -or $EnableCUDA -or $IsMacOS)) {
+  $Result = Read-Host "Enable CUDA integration (yes/no)"
+  if ($affirmativeAnswers -contains $Result) {
+    $EnableCUDA = $true
+  }
+}
+
+# CUDNN is only offered once CUDA itself has been enabled.
+if ($EnableCUDA -and (-Not ($DisableInteractive -or $EnableCUDNN))) {
+  $Result = Read-Host "Enable CUDNN optional dependency (yes/no)"
+  if ($affirmativeAnswers -contains $Result) {
+    $EnableCUDNN = $true
+  }
+}
+
+if (-Not ($DisableInteractive -or $EnableOPENCV)) {
+  $Result = Read-Host "Enable OpenCV optional dependency (yes/no)"
+  if ($affirmativeAnswers -contains $Result) {
+    $EnableOPENCV = $true
+  }
+}
+
+# Report the PowerShell version and refuse to run on anything older than 5.
+Write-Host -NoNewLine "PowerShell version:"
+$PSVersionTable.PSVersion
+
+if ($IsWindowsPowerShell) {
+  Write-Host "Running on Windows Powershell, please consider update and running on newer Powershell versions"
+}
+
+# NOTE(review): MyThrow is not defined in this part of the file; presumably it
+# is provided by utils.psm1 - confirm it is available when the module is missing.
+if ($PSVersionTable.PSVersion.Major -lt 5) {
+  MyThrow("Your PowerShell version is too old, please update it.")
+}
+
+# Platform-dependent file extensions: vcpkg bootstrap script and executables.
+if ($IsLinux -or $IsMacOS) {
+  $bootstrap_ext = ".sh"
+  $exe_ext = ""
+}
+elseif ($IsWindows -or $IsWindowsPowerShell) {
+  $bootstrap_ext = ".bat"
+  $exe_ext = ".exe"
+}
+# Resolve options that imply other options, informing the user of each change.
+if ($ForceLocalVCPKG -And -Not $UseVCPKG) {
+  $UseVCPKG = $true
+  Write-Host "ForceLocalVCPKG was true but UseVCPKG was false, setting UseVCPKG to true"
+}
+# Manifest mode implies installing darknet through vcpkg ...
+if ($InstallDARKNETdependenciesThroughVCPKGManifest -and -not $InstallDARKNETthroughVCPKG) {
+  Write-Host "You requested darknet dependencies to be installed by vcpkg in manifest mode but you didn't enable installation through vcpkg, doing that for you"
+  $InstallDARKNETthroughVCPKG = $true
+}
+
+# ... which in turn implies both vcpkg itself and OpenCV.
+if ($InstallDARKNETthroughVCPKG -and -not $UseVCPKG) {
+  Write-Host "You requested darknet to be installed by vcpkg but you didn't enable vcpkg, doing that for you"
+  $UseVCPKG = $true
+}
+
+if ($InstallDARKNETthroughVCPKG -and -not $EnableOPENCV) {
+  Write-Host "You requested darknet to be installed by vcpkg but you didn't enable OpenCV, doing that for you"
+  $EnableOPENCV = $true
+}
+
+if ($UseVCPKG) {
+  Write-Host "vcpkg bootstrap script: bootstrap-vcpkg${bootstrap_ext}"
+}
+
+# Outside Windows the Visual Studio setup is skipped unless explicitly forced.
+if ((-Not $IsWindows) -and (-Not $IsWindowsPowerShell) -and (-Not $ForceSetupVS)) {
+  $DoNotSetupVS = $true
+}
+
+if ($ForceStaticLib) {
+  Write-Host "Forced CMake to produce a static library"
+  $AdditionalBuildSetup = $AdditionalBuildSetup + " -DBUILD_SHARED_LIBS=OFF "
+}
+
+# On Linux/macOS a specific GCC version is selected through the CC/CXX
+# environment variables.
+if (($IsLinux -or $IsMacOS) -and ($ForceGCCVersion -gt 0)) {
+  Write-Host "Manually setting CC and CXX variables to gcc version $ForceGCCVersion"
+  $env:CC = "gcc-$ForceGCCVersion"
+  $env:CXX = "g++-$ForceGCCVersion"
+}
+
+# Default vcpkg triplets.
+# VCPKG_DEFAULT_TRIPLET and VCPKG_DEFAULT_HOST_TRIPLET are only filled in when
+# the user has not already defined them; the two flags below record whether
+# this script was the one that set them.
+$vcpkg_triplet_set_by_this_script = $false
+$vcpkg_host_triplet_set_by_this_script = $false
+
+$runningOnWindows = $IsWindows -or $IsWindowsPowerShell
+
+# 32 bit Windows target: requires a debug build and the msbuild backend.
+if ($runningOnWindows -and $Use32bitTriplet -and (-Not $env:VCPKG_DEFAULT_TRIPLET)) {
+  if (-Not $BuildDebug) {
+    $BuildDebug = $true
+    Write-Host "Warning: when building for 32bit windows target, only builds with also debug version are possible. Debug has been enabled on your behalf!" -ForegroundColor Yellow
+  }
+  if (-Not $DoNotUseNinja) {
+    $DoNotUseNinja = $true
+    Write-Host "Warning: when building for 32bit windows target, only msbuild can be used and ninja will be disabled. Doing that for you!" -ForegroundColor Yellow
+  }
+  $env:VCPKG_DEFAULT_TRIPLET = "x86-windows"
+  $vcpkg_triplet_set_by_this_script = $true
+}
+
+# 64 bit base triplet of the current platform (left empty on unknown platforms).
+$vcpkgPlatformTriplet = $null
+if ($runningOnWindows) {
+  $vcpkgPlatformTriplet = "x64-windows"
+}
+elseif ($IsMacOS) {
+  $vcpkgPlatformTriplet = "x64-osx"
+}
+elseif ($IsLinux) {
+  $vcpkgPlatformTriplet = "x64-linux"
+}
+
+if ($vcpkgPlatformTriplet) {
+  # Release-only triplets are used unless debug binaries are requested too.
+  # This is evaluated after the 32 bit handling above, which may have
+  # switched $BuildDebug on.
+  if (-Not $BuildDebug) {
+    $vcpkgPlatformTriplet = "$vcpkgPlatformTriplet-release"
+  }
+  if (-Not $env:VCPKG_DEFAULT_TRIPLET) {
+    $env:VCPKG_DEFAULT_TRIPLET = $vcpkgPlatformTriplet
+    $vcpkg_triplet_set_by_this_script = $true
+  }
+  if (-Not $env:VCPKG_DEFAULT_HOST_TRIPLET) {
+    $env:VCPKG_DEFAULT_HOST_TRIPLET = $vcpkgPlatformTriplet
+    $vcpkg_host_triplet_set_by_this_script = $true
+  }
+}
+
+# Any vcpkg-specific option (suffix, fork, branch) implies -UseVCPKG.
+# Only the first matching option is reported, because enabling vcpkg for it
+# makes the remaining checks moot.
+if (-not $UseVCPKG) {
+  $vcpkgImpliedMessage = $null
+  if ($VCPKGSuffix -ne "") {
+    $vcpkgImpliedMessage = "You specified a vcpkg folder suffix but didn't enable vcpkg integration, doing that for you"
+  }
+  elseif ($VCPKGFork -ne "") {
+    $vcpkgImpliedMessage = "You specified a vcpkg fork but didn't enable vcpkg integration, doing that for you"
+  }
+  elseif ($VCPKGBranch -ne "") {
+    $vcpkgImpliedMessage = "You specified a vcpkg branch but didn't enable vcpkg integration, doing that for you"
+  }
+
+  if ($vcpkgImpliedMessage) {
+    Write-Host $vcpkgImpliedMessage -ForegroundColor Yellow
+    $UseVCPKG = $true
+  }
+}
+
+# Report the CUDA status. CUDA cannot be used on macOS, so a request for it is
+# reverted there; in that case only the warning is printed (the previous code
+# also printed "CUDA is enabled" right after disabling it).
+if ($EnableCUDA -and $IsMacOS) {
+  Write-Host "Cannot enable CUDA on macOS" -ForegroundColor Yellow
+  $EnableCUDA = $false
+}
+elseif ($EnableCUDA) {
+  Write-Host "CUDA is enabled"
+}
+elseif (-Not $IsMacOS) {
+  Write-Host "CUDA is disabled, please pass -EnableCUDA to the script to enable"
+}
+
+# Same reporting for CUDNN, with the same macOS restriction.
+if ($EnableCUDNN -and $IsMacOS) {
+  Write-Host "Cannot enable CUDNN on macOS" -ForegroundColor Yellow
+  $EnableCUDNN = $false
+}
+elseif ($EnableCUDNN) {
+  Write-Host "CUDNN is enabled"
+}
+elseif (-Not $IsMacOS) {
+  Write-Host "CUDNN is disabled, please pass -EnableCUDNN to the script to enable"
+}
+
+# Report the OpenCV status.
+if ($EnableOPENCV) {
+  Write-Host "OPENCV is enabled"
+}
+else {
+  Write-Host "OPENCV is disabled, please pass -EnableOPENCV to the script to enable"
+}
+
+# The CUDA-enabled OpenCV build needs both CUDA and OpenCV: give a hint when it
+# could be enabled, and drop the request when one of its prerequisites is missing.
+if ($EnableCUDA -and $EnableOPENCV -and (-Not $EnableOPENCV_CUDA)) {
+  Write-Host "OPENCV with CUDA extension is not enabled, you can enable it passing -EnableOPENCV_CUDA"
+}
+elseif ($EnableOPENCV -and $EnableOPENCV_CUDA -and (-Not $EnableCUDA)) {
+  Write-Host "OPENCV with CUDA extension was requested, but CUDA is not enabled, you can enable it passing -EnableCUDA"
+  $EnableOPENCV_CUDA = $false
+}
+elseif ($EnableCUDA -and $EnableOPENCV_CUDA -and (-Not $EnableOPENCV)) {
+  Write-Host "OPENCV with CUDA extension was requested, but OPENCV is not enabled, you can enable it passing -EnableOPENCV"
+  $EnableOPENCV_CUDA = $false
+}
+elseif ($EnableOPENCV_CUDA -and (-Not $EnableCUDA) -and (-Not $EnableOPENCV)) {
+  Write-Host "OPENCV with CUDA extension was requested, but OPENCV and CUDA are not enabled, you can enable them passing -EnableOPENCV -EnableCUDA"
+  $EnableOPENCV_CUDA = $false
+}
+
+# Report the vcpkg status and validate the triplet against the build type.
+if ($UseVCPKG) {
+  Write-Host "VCPKG is enabled"
+  # A triplet whose name contains "release" is rejected when debug binaries
+  # are requested as well.
+  if ($BuildDebug -and ($env:VCPKG_DEFAULT_TRIPLET -match "release")) {
+    MyThrow("You asked to build also debug binaries but VCPKG_DEFAULT_TRIPLET is set to a release triplet")
+  }
+  if ($DoNotUpdateVCPKG) {
+    Write-Host "VCPKG will not be updated to latest version if found" -ForegroundColor Yellow
+  }
+  else {
+    Write-Host "VCPKG will be updated to latest version if found"
+  }
+}
+else {
+  Write-Host "VCPKG is disabled, please pass -UseVCPKG to the script to enable"
+}
+
+# Report the state of the remaining toolchain options.
+if ($DoNotSetupVS) {
+  Write-Host "VisualStudio integration is disabled"
+}
+else {
+  Write-Host "VisualStudio integration is enabled, please pass -DoNotSetupVS to the script to disable"
+}
+
+# The C# wrapper is only available on Windows and is built with the Visual
+# Studio generator, so Ninja is switched off when it is enabled.
+# On any other platform a request for the wrapper is silently reverted.
+if ($EnableCSharpWrapper -and ($IsWindowsPowerShell -or $IsWindows)) {
+  Write-Host "Yolo C# wrapper integration is enabled. Will be built with Visual Studio generator. Disabling Ninja"
+  $DoNotUseNinja = $true
+}
+else {
+  $EnableCSharpWrapper = $false
+  Write-Host "Yolo C# wrapper integration is disabled, please pass -EnableCSharpWrapper to the script to enable. You must be on Windows!"
+}
+
+if ($DoNotUseNinja) {
+  Write-Host "Ninja is disabled"
+}
+else {
+  Write-Host "Ninja is enabled, please pass -DoNotUseNinja to the script to disable"
+}
+
+if ($ForceCPP) {
+  Write-Host "ForceCPP build mode is enabled"
+}
+else {
+  Write-Host "ForceCPP build mode is disabled, please pass -ForceCPP to the script to enable"
+}
+
+# Work from the tool root folder from now on.
+Push-Location $PSCustomScriptRoot
+
+# git is a hard requirement: look it up on PATH and abort when it is missing.
+$GIT_EXE = Get-Command "git" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+if (-Not $GIT_EXE) {
+  MyThrow("Could not find git, please install it")
+}
+else {
+  Write-Host "Using git from ${GIT_EXE}"
+}
+
+# Self-update: when the tool folder is a git checkout, pull the latest sources
+# (and update submodules, if any) unless -DoNotUpdateTOOL was passed.
+# Resolve-Path yields nothing when the path does not exist, so both results are
+# checked for emptiness before being handed to Test-Path, which rejects an
+# empty string with a parameter-binding error.
+$GitRepoPath = Resolve-Path "$PSCustomScriptRoot/.git" -ErrorAction SilentlyContinue
+$GitModulesPath = Resolve-Path "$PSCustomScriptRoot/.gitmodules" -ErrorAction SilentlyContinue
+if ($GitRepoPath -and (Test-Path "$GitRepoPath")) {
+  Write-Host "This tool has been cloned with git and supports self-updating mechanism"
+  if ($DoNotUpdateTOOL) {
+    Write-Host "This tool will not self-update sources" -ForegroundColor Yellow
+  }
+  else {
+    Write-Host "This tool will self-update sources, please pass -DoNotUpdateTOOL to the script to disable"
+    Set-Location "$PSCustomScriptRoot"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "pull"
+    # Reading the handle before waiting is kept from the original code
+    # (presumably the usual workaround to make ExitCode available with -PassThru).
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Updating this tool sources failed! Exited with error code $exitCode.")
+    }
+    if ($GitModulesPath -and (Test-Path "$GitModulesPath")) {
+      $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "submodule update --init --recursive"
+      $handle = $proc.Handle
+      $proc.WaitForExit()
+      $exitCode = $proc.ExitCode
+      if (-Not ($exitCode -eq 0)) {
+        MyThrow("Updating this tool submodule sources failed! Exited with error code $exitCode.")
+      }
+    }
+    Set-Location "$PSCustomScriptRoot"
+  }
+}
+
+# Optionally prefer the CMake bundled with Visual Studio by putting its folder
+# in front of PATH (only when it exists and is not already the one in use).
+if ($ForceCmakeFromVS) {
+  $vsfound = getLatestVisualStudioWithDesktopWorkloadPath
+  $cmakePath = "${vsfound}/Common7/IDE/CommonExtensions/Microsoft/CMake/CMake/bin"
+  $vsCmakePath = "${vsfound}/Common7/IDE/CommonExtensions/Microsoft/CMake/CMake/bin/cmake.exe"
+  $CMAKE_EXE = Get-Command "cmake" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+  if ((Test-Path "$vsCmakePath") -and -not ($vsCmakePath -eq $CMAKE_EXE)) {
+    Write-Host "Adding CMake from Visual Studio to PATH"
+    $env:PATH = '{0}{1}{2}' -f "$cmakePath", [IO.Path]::PathSeparator, $env:PATH
+  }
+  elseif ($vsCmakePath -eq $CMAKE_EXE) {
+    Write-Host "CMake from Visual Studio was already the preferred choice" -ForegroundColor Yellow
+  }
+  else {
+    Write-Host "Unable to find CMake integrated in Visual Studio" -ForegroundColor Red
+  }
+}
+
+# CMake is a hard requirement: resolve it (again, PATH may have changed above)
+# and make sure it can actually be executed.
+$CMAKE_EXE = Get-Command "cmake" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+if (-Not $CMAKE_EXE) {
+  MyThrow("Could not find CMake, please install it")
+}
+else {
+  Write-Host "Using CMake from ${CMAKE_EXE}"
+  $proc = Start-Process -NoNewWindow -PassThru -FilePath ${CMAKE_EXE} -ArgumentList "--version"
+  $handle = $proc.Handle
+  $proc.WaitForExit()
+  $exitCode = $proc.ExitCode
+  if (-Not ($exitCode -eq 0)) {
+    MyThrow("CMake version check failed! Exited with error code $exitCode.")
+  }
+}
+
+# Ninja is optional: try Visual Studio's copy (when forced), then PATH, then a
+# downloaded portable copy; if none can be run, fall back to msbuild/make by
+# setting $DoNotUseNinja.
+if (-Not $DoNotUseNinja) {
+  if ($ForceNinjaFromVS) {
+    $vsfound = getLatestVisualStudioWithDesktopWorkloadPath
+    $ninjaPath = "${vsfound}/Common7/IDE/CommonExtensions/Microsoft/CMake/Ninja"
+    $vsninjaPath = "${vsfound}/Common7/IDE/CommonExtensions/Microsoft/CMake/Ninja/ninja.exe"
+    $NINJA_EXE = Get-Command "ninja" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+    # Visual Studio's Ninja is prepended to PATH so that it wins over other copies.
+    if ((Test-Path "$vsninjaPath") -and -not ($vsninjaPath -eq $NINJA_EXE) -and (-not $DoNotUseNinja)) {
+      Write-Host "Adding Ninja from Visual Studio to PATH"
+      $env:PATH = '{0}{1}{2}' -f "$ninjaPath", [IO.Path]::PathSeparator, $env:PATH
+    }
+    elseif ($vsninjaPath -eq $NINJA_EXE) {
+      Write-Host "Ninja from Visual Studio was already the preferred choice" -ForegroundColor Yellow
+    }
+    else {
+      Write-Host "Unable to find Ninja integrated in Visual Studio" -ForegroundColor Red
+    }
+  }
+  $NINJA_EXE = Get-Command "ninja" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+  if (-Not $NINJA_EXE) {
+    # DownloadNinja is defined outside this part of the file; the code below
+    # expects its result in a "ninja" folder under the tool root, which is
+    # appended to (not prepended to) PATH.
+    DownloadNinja
+    $NinjaPath = Join-Path (${PSCustomScriptRoot}) 'ninja'
+    $env:PATH = '{0}{1}{2}' -f $env:PATH, [IO.Path]::PathSeparator, "$NinjaPath"
+    $NINJA_EXE = Get-Command "ninja" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+    if (-Not $NINJA_EXE) {
+      $DoNotUseNinja = $true
+      Write-Host "Could not find Ninja, unable to download a portable ninja, using msbuild or make backends as a fallback" -ForegroundColor Yellow
+    }
+  }
+  if ($NINJA_EXE) {
+    Write-Host "Using Ninja from ${NINJA_EXE}"
+    Write-Host -NoNewLine "Ninja version "
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath ${NINJA_EXE} -ArgumentList "--version"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    # A Ninja that cannot be executed is treated like a missing one.
+    if (-Not ($exitCode -eq 0)) {
+      $DoNotUseNinja = $true
+      Write-Host "Unable to run Ninja previously found, using msbuild or make backends as a fallback" -ForegroundColor Yellow
+    }
+    else {
+      $generator = "Ninja"
+    }
+  }
+}
+
+# Visual Studio environment and CMake generator selection.
+# When no 64 bit-targeting cl.exe is on PATH, the variables of the VS developer
+# prompt are imported into this process; then, if Ninja is not used, the
+# matching "Visual Studio NN YYYY" generator is chosen.
+if (-Not $DoNotSetupVS) {
+  $CL_EXE = Get-Command "cl" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+  if ((-Not $CL_EXE) -or ($CL_EXE -match "HostX86\\x86") -or ($CL_EXE -match "HostX64\\x86")) {
+    $vsfound = getLatestVisualStudioWithDesktopWorkloadPath
+    Write-Host "Found VS in ${vsfound}"
+    Push-Location "${vsfound}/Common7/Tools"
+    # Run VsDevCmd.bat in cmd.exe and dump the resulting environment with "set";
+    # every NAME=VALUE line is copied into the current PowerShell environment.
+    cmd.exe /c "VsDevCmd.bat -arch=x64 & set" |
+    ForEach-Object {
+      if ($_ -match "=") {
+        # Split on the first "=" only, so that values which themselves contain
+        # "=" are imported whole instead of being cut at the second "=".
+        $v = $_ -split "=", 2
+        Set-Item -force -path "ENV:\$($v[0])"  -value "$($v[1])"
+      }
+    }
+    Pop-Location
+    Write-Host "Visual Studio Command Prompt variables set"
+  }
+
+  $tokens = getLatestVisualStudioWithDesktopWorkloadVersion
+  $tokens = $tokens.split('.')
+  if ($DoNotUseNinja) {
+    $debugConfig = " --config Debug "
+    $releaseConfig = " --config Release "
+    if ($Use32bitTriplet) {
+      $targetArchitecture = "`"Win32`""
+    }
+    else {
+      $targetArchitecture = "`"x64`""
+    }
+    # Visual Studio major version -> CMake generator name
+    $vsGenerators = @{
+      "14" = "Visual Studio 14 2015"
+      "15" = "Visual Studio 15 2017"
+      "16" = "Visual Studio 16 2019"
+      "17" = "Visual Studio 17 2022"
+    }
+    $vsMajorVersion = "$($tokens[0])"
+    if ($vsGenerators.ContainsKey($vsMajorVersion)) {
+      $generator = $vsGenerators[$vsMajorVersion]
+      # 64 bit host tools are always used; the target architecture follows -Use32bitTriplet.
+      $AdditionalBuildSetup = $AdditionalBuildSetup + " -T `"host=x64`" -A $targetArchitecture"
+    }
+    else {
+      MyThrow("Unknown Visual Studio version, unsupported configuration")
+    }
+  }
+  if (-Not $UseVCPKG) {
+    $dllfolder = "../3rdparty/pthreads/bin"
+  }
+}
+# Neither Visual Studio nor Ninja: fall back to plain makefiles.
+if ($DoNotSetupVS -and $DoNotUseNinja) {
+  $generator = "Unix Makefiles"
+}
+Write-Host "Setting up environment to use CMake generator: $generator"
+
+# CUDA environment: make nvcc reachable and define the environment variables
+# derived from CUDA_PATH when they are missing.
+if (-Not $IsMacOS -and $EnableCUDA) {
+  $NVCC_EXE = Get-Command "nvcc" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+  if (-Not $NVCC_EXE) {
+    # nvcc is not on PATH: append CUDA_PATH/bin when CUDA_PATH is defined,
+    # otherwise only warn (the build is not aborted here).
+    if (Test-Path env:CUDA_PATH) {
+      $env:PATH = '{0}{1}{2}' -f $env:PATH, [IO.Path]::PathSeparator, "${env:CUDA_PATH}/bin"
+      Write-Host "Found cuda in ${env:CUDA_PATH}"
+    }
+    else {
+      Write-Host "Unable to find CUDA, if necessary please install it or define a CUDA_PATH env variable pointing to the install folder" -ForegroundColor Yellow
+    }
+  }
+
+  # Existing values of CUDA_TOOLKIT_ROOT_DIR and CUDACXX are never overwritten.
+  if (Test-Path env:CUDA_PATH) {
+    if (-Not(Test-Path env:CUDA_TOOLKIT_ROOT_DIR)) {
+      $env:CUDA_TOOLKIT_ROOT_DIR = "${env:CUDA_PATH}"
+      Write-Host "Added missing env variable CUDA_TOOLKIT_ROOT_DIR" -ForegroundColor Yellow
+    }
+    if (-Not(Test-Path env:CUDACXX)) {
+      $env:CUDACXX = "${env:CUDA_PATH}/bin/nvcc"
+      Write-Host "Added missing env variable CUDACXX" -ForegroundColor Yellow
+    }
+  }
+}
+
+# Locate vcpkg. Unless a local copy is forced, the search order is:
+#   1. VCPKG_ROOT (only when no folder suffix was requested)
+#   2. $env:WORKSPACE/vcpkg<suffix>
+#   3. RUNVCPKG_VCPKG_ROOT_OUT
+# If none of them yields a path, a copy in the current folder is used and
+# cloned first when missing. $vcpkg_root_set_by_this_script records whether
+# VCPKG_ROOT was defined by this script rather than by the user.
+$vcpkg_root_set_by_this_script = $false
+
+if ($UseVCPKG -And -Not $ForceLocalVCPKG) {
+  if ((Test-Path env:VCPKG_ROOT) -and $VCPKGSuffix -eq "") {
+    $vcpkg_path = "$env:VCPKG_ROOT"
+    $vcpkg_path = Resolve-Path $vcpkg_path
+    Write-Host "Found vcpkg in VCPKG_ROOT: $vcpkg_path"
+    $AdditionalBuildSetup = $AdditionalBuildSetup + " -DENABLE_VCPKG_INTEGRATION:BOOL=ON"
+  }
+  elseif (-not($null -eq ${env:WORKSPACE}) -and (Test-Path "${env:WORKSPACE}/vcpkg${VCPKGSuffix}")) {
+    $vcpkg_path = "${env:WORKSPACE}/vcpkg${VCPKGSuffix}"
+    $vcpkg_path = Resolve-Path $vcpkg_path
+    $env:VCPKG_ROOT = "$vcpkg_path"
+    $vcpkg_root_set_by_this_script = $true
+    Write-Host "Found vcpkg in WORKSPACE/vcpkg${VCPKGSuffix}: $vcpkg_path"
+    $AdditionalBuildSetup = $AdditionalBuildSetup + " -DENABLE_VCPKG_INTEGRATION:BOOL=ON"
+  }
+  # NOTE(review): this reads a PowerShell variable, not an environment variable
+  # (no "env:" prefix, unlike WORKSPACE above) - confirm this is intended.
+  elseif (-not($null -eq ${RUNVCPKG_VCPKG_ROOT_OUT})) {
+    if ((Test-Path "${RUNVCPKG_VCPKG_ROOT_OUT}") -and $UseVCPKG) {
+      $vcpkg_path = "${RUNVCPKG_VCPKG_ROOT_OUT}"
+      $vcpkg_path = Resolve-Path $vcpkg_path
+      $env:VCPKG_ROOT = "$vcpkg_path"
+      $vcpkg_root_set_by_this_script = $true
+      Write-Host "Found vcpkg in RUNVCPKG_VCPKG_ROOT_OUT: $vcpkg_path"
+      $AdditionalBuildSetup = $AdditionalBuildSetup + " -DENABLE_VCPKG_INTEGRATION:BOOL=ON"
+    }
+  }
+}
+# Fallback (and the only path taken with -ForceLocalVCPKG): local copy in $PWD.
+if (($null -eq $vcpkg_path) -and $UseVCPKG) {
+  if (-Not (Test-Path "$PWD/vcpkg${VCPKGSuffix}")) {
+    # A shallow clone is used only when no specific OpenCV version is forced
+    # (presumably because pinning a version needs the full history - confirm).
+    $shallow_copy = ""
+    if(($ForceOpenCVVersion -eq 0)) {
+      $shallow_copy = " --depth 1 "
+    }
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "clone $shallow_copy https://github.com/microsoft/vcpkg vcpkg${VCPKGSuffix}"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-not ($exitCode -eq 0)) {
+      MyThrow("Cloning vcpkg sources failed! Exited with error code $exitCode.")
+    }
+  }
+  $vcpkg_path = "$PWD/vcpkg${VCPKGSuffix}"
+  $vcpkg_path = Resolve-Path $vcpkg_path
+  $env:VCPKG_ROOT = "$vcpkg_path"
+  $vcpkg_root_set_by_this_script = $true
+  Write-Host "Found vcpkg in $PWD/vcpkg${VCPKGSuffix}: $vcpkg_path"
+  $AdditionalBuildSetup = $AdditionalBuildSetup + " -DENABLE_VCPKG_INTEGRATION:BOOL=ON"
+}
+
+# Records whether this script checked out a vcpkg branch; when true the previous checkout is restored at the end of the script.
+$vcpkg_branch_set_by_this_script = $false
+
+# Git maintenance of the vcpkg checkout; skipped when the folder is not a git working copy.
+if ($UseVCPKG -and (Test-Path "$vcpkg_path/.git")) {
+  Push-Location $vcpkg_path
+  if ($VCPKGFork -ne "") {
+    # Register the requested fork as remote "vcpkgfork" unless a remote with that name already exists
+    $vcpkgfork_already_setup = $false
+    $remotes = & $GIT_EXE 'remote'
+    ForEach ($remote in $remotes) {
+      if ($remote -eq "vcpkgfork") {
+        $vcpkgfork_already_setup = $true
+        Write-Host "remote vcpkgfork already setup"
+      }
+    }
+    if (-Not $vcpkgfork_already_setup) {
+      $git_args = "remote add vcpkgfork https://github.com/${VCPKGFork}/vcpkg"
+      Write-Host "setting up remote vcpkgfork"
+      $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "$git_args"
+      $handle = $proc.Handle
+      $proc.WaitForExit()
+      $exitCode = $proc.ExitCode
+      if (-Not ($exitCode -eq 0)) {
+        MyThrow("Adding remote https://github.com/${VCPKGFork}/vcpkg failed! Exited with error code $exitCode.")
+      }
+    }
+    # The fork is fetched on every run, whether or not the remote was just added
+    $git_args = "fetch vcpkgfork"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "$git_args"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Fetching from remote https://github.com/${VCPKGFork}/vcpkg failed! Exited with error code $exitCode.")
+    }
+  }
+  if ($VCPKGBranch -ne "") {
+    # With a fork the branch is checked out from the fork remote, otherwise by its plain name
+    if ($VCPKGFork -ne "") {
+      $git_args = "checkout vcpkgfork/$VCPKGBranch"
+    }
+    else {
+      $git_args = "checkout $VCPKGBranch"
+    }
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "$git_args"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Checking out branch $VCPKGBranch failed! Exited with error code $exitCode.")
+    }
+    $vcpkg_branch_set_by_this_script = $true
+  }
+  # Pull and re-bootstrap only when updates are allowed and no fork is in use
+  if (-Not $DoNotUpdateVCPKG -and $VCPKGFork -eq "") {
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "pull"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Updating vcpkg sources failed! Exited with error code $exitCode.")
+    }
+    # $PWD is the vcpkg folder here because of the Push-Location above
+    $VcpkgBootstrapScript = Join-Path $PWD "bootstrap-vcpkg${bootstrap_ext}"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $VcpkgBootstrapScript -ArgumentList "-disableMetrics"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Bootstrapping vcpkg failed! Exited with error code $exitCode.")
+    }
+  }
+  Pop-Location
+}
+
+# Warn about long vcpkg paths on Windows and, unless interactivity is disabled, let the user abort.
+# $vcpkg_path holds the object returned by Resolve-Path, so it is converted to a string before being
+# measured: .Length on a single non-string object is PowerShell's intrinsic member and yields 1
+# rather than the number of characters, which would keep this check from ever firing.
+if ($UseVCPKG -and ("$vcpkg_path".Length -gt 40) -and ($IsWindows -or $IsWindowsPowerShell)) {
+  Write-Host "vcpkg path is very long and might fail. Please move it or" -ForegroundColor Yellow
+  Write-Host "the entire tool folder to a shorter path, like C:\src" -ForegroundColor Yellow
+  Write-Host "You can use the subst command to ease the process if necessary" -ForegroundColor Yellow
+  if (-Not $global:DisableInteractive) {
+    $Result = Read-Host "Do you still want to continue? (yes/no)"
+    # Any answer other than no/n (in any letter case) continues the build
+    if (($Result -eq 'No') -or ($Result -eq 'N') -or ($Result -eq 'no') -or ($Result -eq 'n')) {
+      MyThrow("Build aborted")
+    }
+  }
+}
+
+# Binary cache removal was requested but vcpkg is off: only inform the user
+if ((-Not $UseVCPKG) -and $ForceVCPKGCacheRemoval) {
+  Write-Host "VCPKG is not enabled, so local vcpkg binary cache will not be deleted even if requested" -ForegroundColor Yellow
+}
+
+# Installer packaging is forwarded to CMake as an option
+if ($BuildInstaller) {
+  Write-Host "You requested to build an installer, so enabling this option if supported" -ForegroundColor Yellow
+  $AdditionalBuildSetup += " -DENABLE_INSTALLER=ON"
+}
+
+# A forced OpenCV major version (2 or 3) replaces the OpenCV4 default when vcpkg is used
+if ($UseVCPKG) {
+  if ($ForceOpenCVVersion -eq 2) {
+    Write-Host "You requested OpenCV version 2, so vcpkg will install that version" -ForegroundColor Yellow
+    Write-Host "This requires using vcpkg.json.opencv23 as manifest file" -ForegroundColor Yellow
+    $AdditionalBuildSetup += " -DVCPKG_USE_OPENCV4=OFF -DVCPKG_USE_OPENCV2=ON"
+  }
+  if ($ForceOpenCVVersion -eq 3) {
+    Write-Host "You requested OpenCV version 3, so vcpkg will install that version" -ForegroundColor Yellow
+    Write-Host "This requires using vcpkg.json.opencv23 as manifest file" -ForegroundColor Yellow
+    $AdditionalBuildSetup += " -DVCPKG_USE_OPENCV4=OFF -DVCPKG_USE_OPENCV3=ON"
+  }
+}
+
+# Swap in the OpenCV 2/3 manifest; the regular one is parked as vcpkg.json.bak
+if ($ForceOpenCVVersion -gt 0) {
+  Move-Item -Path "$PSCustomScriptRoot/vcpkg.json" -Destination "$PSCustomScriptRoot/vcpkg.json.bak"
+  Move-Item -Path "$PSCustomScriptRoot/vcpkg.json.opencv23" -Destination "$PSCustomScriptRoot/vcpkg.json"
+}
+
+# Delete the per-user vcpkg binary cache when requested
+if ($UseVCPKG -and $ForceVCPKGCacheRemoval) {
+  if ($IsWindows -or $IsWindowsPowerShell) {
+    $vcpkgbinarycachepath = "$env:LOCALAPPDATA/vcpkg/archive"
+  }
+  elseif ($IsLinux -or $IsMacOS) {
+    # Linux and macOS use the same location under the home directory
+    $vcpkgbinarycachepath = "$env:HOME/.cache/vcpkg/archive"
+  }
+  else {
+    MyThrow("Unknown OS, unsupported")
+  }
+  Write-Host "Removing local vcpkg binary cache from $vcpkgbinarycachepath" -ForegroundColor Yellow
+  # Best effort: errors (e.g. a missing folder) are silenced
+  Remove-Item -Path $vcpkgbinarycachepath -Recurse -Force -ErrorAction SilentlyContinue
+}
+
+# Translate the script switches into CMake cache definitions appended to $AdditionalBuildSetup
+if (-Not $DisableDLLcopy) {
+  $AdditionalBuildSetup += " -DX_VCPKG_APPLOCAL_DEPS_INSTALL=ON"
+}
+
+if ($ForceCPP) {
+  $AdditionalBuildSetup += " -DBUILD_AS_CPP:BOOL=ON"
+}
+
+# The following features are switched OFF explicitly when they were not enabled
+if (-Not $EnableCUDA) {
+  $AdditionalBuildSetup += " -DENABLE_CUDA:BOOL=OFF"
+}
+
+if (-Not $EnableCUDNN) {
+  $AdditionalBuildSetup += " -DENABLE_CUDNN:BOOL=OFF"
+}
+
+if (-Not $EnableOPENCV) {
+  $AdditionalBuildSetup += " -DENABLE_OPENCV:BOOL=OFF"
+}
+
+if (-Not $EnableOPENCV_CUDA) {
+  $AdditionalBuildSetup += " -DVCPKG_BUILD_OPENCV_WITH_CUDA:BOOL=OFF"
+}
+
+if ($EnableCSharpWrapper) {
+  $AdditionalBuildSetup += " -DENABLE_CSHARP_WRAPPER:BOOL=ON"
+}
+
+if (-Not $InstallDARKNETthroughVCPKG) {
+  $AdditionalBuildSetup += " -DENABLE_DEPLOY_CUSTOM_CMAKE_MODULES:BOOL=ON"
+}
+
+# Choose the vcpkg buildtrees location: the default folder inside the vcpkg checkout,
+# or a user supplied directory (created on demand)
+if ($UseVCPKG) {
+  if ($ForceVCPKGBuildtreesPath -eq "") {
+    $AdditionalBuildSetup += " -DVCPKG_INSTALL_OPTIONS=--clean-buildtrees-after-build"
+    $vcpkgbuildtreespath = "$vcpkg_path/buildtrees"
+  }
+  else {
+    $AdditionalBuildSetup += " -DVCPKG_INSTALL_OPTIONS=--clean-buildtrees-after-build;--x-buildtrees-root=`"$ForceVCPKGBuildtreesPath`""
+    New-Item -Path $ForceVCPKGBuildtreesPath -ItemType directory -Force | Out-Null
+    $vcpkgbuildtreespath = "$ForceVCPKGBuildtreesPath"
+  }
+}
+
+# Install darknet (or only its dependencies, in manifest mode) through vcpkg instead of building it with CMake here
+if ($InstallDARKNETthroughVCPKG) {
+  if ($ForceVCPKGDarknetHEAD) {
+    $headMode = " --head "
+  }
+  # $features is the classic-mode feature list; the $feature_manifest_* values are the manifest-mode equivalents
+  $features = "opencv-base"
+  $feature_manifest_opencv = "--x-feature=opencv-base"
+  if ($EnableCUDA) {
+    $features = $features + ",cuda"
+    $feature_manifest_cuda = "--x-feature=cuda"
+  }
+  if ($EnableCUDNN) {
+    $features = $features + ",cudnn"
+    $feature_manifest_cudnn = "--x-feature=cudnn"
+  }
+  # Bootstrap vcpkg when its executable is not present yet
+  if (-not (Test-Path "${env:VCPKG_ROOT}/vcpkg${exe_ext}")) {
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath ${env:VCPKG_ROOT}/bootstrap-vcpkg${bootstrap_ext} -ArgumentList "-disableMetrics"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Bootstrapping vcpkg failed! Exited with error code $exitCode.")
+    }
+  }
+  if ($InstallDARKNETdependenciesThroughVCPKGManifest) {
+    Write-Host "Running vcpkg in manifest mode to install darknet dependencies"
+    Write-Host "vcpkg install --x-no-default-features $feature_manifest_opencv $feature_manifest_cuda $feature_manifest_cudnn $headMode"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath "${env:VCPKG_ROOT}/vcpkg${exe_ext}" -ArgumentList " install --x-no-default-features $feature_manifest_opencv $feature_manifest_cuda $feature_manifest_cudnn $headMode "
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      # This branch installs the dependencies only, so the message says so
+      MyThrow("Installing darknet dependencies through vcpkg failed! Exited with error code $exitCode.")
+    }
+  }
+  else {
+    Write-Host "Running vcpkg to install darknet"
+    Write-Host "vcpkg install darknet[${features}] $headMode --recurse"
+    # The manifest is parked as vcpkg.json.bak while vcpkg runs and restored afterwards.
+    # NOTE(review): presumably this keeps vcpkg from switching to manifest mode when it lives in a subfolder of darknet - confirm
+    Move-Item $PSCustomScriptRoot/vcpkg.json $PSCustomScriptRoot/vcpkg.json.bak
+    Push-Location ${env:VCPKG_ROOT}
+    if ($ForceVCPKGDarknetHEAD) {
+      # Remove an installed darknet first so that the HEAD version gets installed below
+      $proc = Start-Process -NoNewWindow -PassThru -FilePath "${env:VCPKG_ROOT}/vcpkg${exe_ext}" -ArgumentList " remove darknet --recurse "
+      $handle = $proc.Handle
+      $proc.WaitForExit()
+      $exitCode = $proc.ExitCode
+      if (-Not ($exitCode -eq 0)) {
+        MyThrow("Removing darknet through vcpkg failed! Exited with error code $exitCode.")
+      }
+    }
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath "${env:VCPKG_ROOT}/vcpkg${exe_ext}" -ArgumentList "  upgrade --no-dry-run "
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Upgrading vcpkg installed ports failed! Exited with error code $exitCode.")
+    }
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath "${env:VCPKG_ROOT}/vcpkg${exe_ext}" -ArgumentList " install darknet[${features}] $headMode --recurse "
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      # This branch installs the darknet port itself
+      MyThrow("Installing darknet through vcpkg failed! Exited with error code $exitCode.")
+    }
+    Move-Item $PSCustomScriptRoot/vcpkg.json.bak $PSCustomScriptRoot/vcpkg.json
+  }
+}
+# Regular path: configure and build darknet with CMake (optional debug build first, then release)
+else {
+  # The debug configuration is built only together with vcpkg
+  if ($BuildDebug -and $UseVCPKG) {
+    $debug_build_folder = "$PSCustomScriptRoot/build_debug"
+    if (-Not $DoNotDeleteBuildFolder) {
+      Write-Host "Removing folder $debug_build_folder" -ForegroundColor Yellow
+      Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $debug_build_folder
+    }
+
+    New-Item -Path $debug_build_folder -ItemType directory -Force | Out-Null
+    Set-Location $debug_build_folder
+    # The source tree is the parent of the build folder ("-S ..")
+    $cmake_args = "-G `"$generator`" ${DebugBuildSetup} ${AdditionalBuildSetup} -S .."
+    Write-Host "Configuring debug CMake project" -ForegroundColor Green
+    Write-Host "CMake args: $cmake_args"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList $cmake_args
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Config failed! Exited with error code $exitCode.")
+    }
+    Write-Host "Building debug CMake project" -ForegroundColor Green
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList "--build . ${debugConfig} --parallel ${NumberOfBuildWorkers} --target install"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Build failed! Exited with error code $exitCode.")
+    }
+    # Copy the .pdb files found directly in the debug build folder next to the installed binaries
+    Get-ChildItem "${debug_build_folder}" -Filter *.pdb |
+    Foreach-Object {
+      Write-Host "-- Copying $_ to $DebugInstallPrefix/bin"
+      Copy-Item $_ $DebugInstallPrefix/bin
+    }
+  }
+  $release_build_folder = "$PSCustomScriptRoot/build_release"
+  if (-Not $DoNotDeleteBuildFolder) {
+    Write-Host "Removing folder $release_build_folder" -ForegroundColor Yellow
+    Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $release_build_folder
+  }
+
+  New-Item -Path $release_build_folder -ItemType directory -Force | Out-Null
+  Set-Location $release_build_folder
+  $cmake_args = "-G `"$generator`" ${ReleaseBuildSetup} ${AdditionalBuildSetup} -S .."
+  Write-Host "Configuring release CMake project" -ForegroundColor Green
+  Write-Host "CMake args: $cmake_args"
+  $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList $cmake_args
+  $handle = $proc.Handle
+  $proc.WaitForExit()
+  $exitCode = $proc.ExitCode
+  if (-Not ($exitCode -eq 0)) {
+    MyThrow("Config failed! Exited with error code $exitCode.")
+  }
+  Write-Host "Building release CMake project" -ForegroundColor Green
+  # When an installer is requested the "install" target is not built here; the "package" target is built further below instead
+  if ($BuildInstaller) {
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList "--build . ${releaseConfig} --parallel ${NumberOfBuildWorkers}"
+  }
+  else {
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList "--build . ${releaseConfig} --parallel ${NumberOfBuildWorkers} --target install"
+  }
+  $handle = $proc.Handle
+  $proc.WaitForExit()
+  $exitCode = $proc.ExitCode
+  if (-Not ($exitCode -eq 0)) {
+    MyThrow("Build failed! Exited with error code $exitCode.")
+  }
+  # Without vcpkg, DLLs from the build output folder are copied one level up (unless DLL copying is disabled)
+  if (-Not $UseVCPKG -And -Not $DisableDLLcopy) {
+    $dllfiles = Get-ChildItem ./${dllfolder}/*.dll
+    if ($dllfiles) {
+      Copy-Item $dllfiles ..
+    }
+  }
+  if ($BuildInstaller) {
+    Write-Host "Building package with CPack" -ForegroundColor Green
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $CMAKE_EXE -ArgumentList "--build . --target package"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      MyThrow("Packaging failed! Exited with error code $exitCode.")
+    }
+  }
+}
+
+# NOTE(review): the matching Push-Location is not in the visible part of the script - presumably issued before the build; confirm
+Pop-Location
+Write-Host "Build complete!" -ForegroundColor Green
+
+# Buildtrees removal: only meaningful with vcpkg, otherwise just inform the user
+if ($ForceVCPKGBuildtreesRemoval -and (-Not $UseVCPKG)) {
+  Write-Host "VCPKG is not enabled, so local vcpkg buildtrees folder will not be deleted even if requested" -ForegroundColor Yellow
+}
+
+if ($UseVCPKG -and $ForceVCPKGBuildtreesRemoval) {
+  Write-Host "Removing local vcpkg buildtrees folder from $vcpkgbuildtreespath" -ForegroundColor Yellow
+  Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $vcpkgbuildtreespath
+}
+
+# Packages folder removal follows the same pattern
+if ($ForceVCPKGPackagesRemoval -and (-Not $UseVCPKG)) {
+  Write-Host "VCPKG is not enabled, so local vcpkg packages folder will not be deleted even if requested" -ForegroundColor Yellow
+}
+
+if ($UseVCPKG -and $ForceVCPKGPackagesRemoval) {
+  $vcpkgpackagespath = "$vcpkg_path/packages"
+  Write-Host "Removing local vcpkg packages folder from $vcpkgpackagespath" -ForegroundColor Yellow
+  Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $vcpkgpackagespath
+}
+
+if ($DownloadWeights) {
+  Write-Host "Downloading weights..." -ForegroundColor Yellow
+  & $PSScriptRoot/scripts/download_weights.ps1
+  Write-Host "Weights downloaded" -ForegroundColor Green
+}
+
+# Clear the environment variables that this script itself assigned
+if ($vcpkg_root_set_by_this_script) {
+  $env:VCPKG_ROOT = $null
+}
+if ($vcpkg_triplet_set_by_this_script) {
+  $env:VCPKG_DEFAULT_TRIPLET = $null
+}
+if ($vcpkg_host_triplet_set_by_this_script) {
+  $env:VCPKG_DEFAULT_HOST_TRIPLET = $null
+}
+
+# Undo the manifest swap made for a forced OpenCV version: put the OpenCV 2/3 manifest and the regular one back
+if($ForceOpenCVVersion -gt 0) {
+  Move-Item $PSCustomScriptRoot/vcpkg.json $PSCustomScriptRoot/vcpkg.json.opencv23
+  Move-Item $PSCustomScriptRoot/vcpkg.json.bak $PSCustomScriptRoot/vcpkg.json
+}
+
+# Undo the temporary vcpkg branch checkout made by this script and drop the fork remote again
+if ($vcpkg_branch_set_by_this_script) {
+  Push-Location $vcpkg_path
+  # "checkout -" switches back to whatever was checked out before
+  $git_args = "checkout -"
+  $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "$git_args"
+  $handle = $proc.Handle
+  $proc.WaitForExit()
+  $exitCode = $proc.ExitCode
+  if (-Not ($exitCode -eq 0)) {
+    MyThrow("Checking out previous branch failed! Exited with error code $exitCode.")
+  }
+  if ($VCPKGFork -ne "") {
+    $git_args = "remote rm vcpkgfork"
+    $proc = Start-Process -NoNewWindow -PassThru -FilePath $GIT_EXE -ArgumentList "$git_args"
+    $handle = $proc.Handle
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+    if (-Not ($exitCode -eq 0)) {
+      # Report the step that actually failed (the remote removal, not the checkout)
+      MyThrow("Removing remote vcpkgfork failed! Exited with error code $exitCode.")
+    }
+  }
+  Pop-Location
+}
+
+# Stop the transcript with errors silenced, then set the error preference to "Continue"
+$ErrorActionPreference = "SilentlyContinue"
+Stop-Transcript | out-null
+$ErrorActionPreference = "Continue"
diff --git a/darknet-master/cmake/Modules/FindCUDNN.cmake b/darknet-master/cmake/Modules/FindCUDNN.cmake
new file mode 100644
index 0000000..7a692b0
--- /dev/null
+++ b/darknet-master/cmake/Modules/FindCUDNN.cmake
@@ -0,0 +1,104 @@
+# Distributed under the OSI-approved BSD 3-Clause License.
+# Copyright Stefano Sinigardi
+
+#.rst:
+# FindCUDNN
+# --------
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# This module will set the following variables in your project::
+#
+#  ``CUDNN_FOUND``
+#    True if CUDNN found on the local system
+#
+#  ``CUDNN_INCLUDE_DIRS``
+#    Location of CUDNN header files.
+#
+#  ``CUDNN_LIBRARIES``
+#    The CUDNN libraries.
+#
+#  ``CuDNN::CuDNN``
+#    The CUDNN target
+#
+
+include(FindPackageHandleStandardArgs)
+
+# Locate the cuDNN header and library; the CUDA/cuDNN related environment variables are used as hints
+find_path(CUDNN_INCLUDE_DIR NAMES cudnn.h cudnn_v8.h cudnn_v7.h
+  HINTS $ENV{CUDA_PATH} $ENV{CUDA_TOOLKIT_ROOT_DIR} $ENV{CUDA_HOME} $ENV{CUDNN_ROOT_DIR} /usr/include
+  PATH_SUFFIXES cuda/include include)
+find_library(CUDNN_LIBRARY NAMES cudnn cudnn8 cudnn7
+  HINTS $ENV{CUDA_PATH} $ENV{CUDA_TOOLKIT_ROOT_DIR} $ENV{CUDA_HOME} $ENV{CUDNN_ROOT_DIR} /usr/lib/x86_64-linux-gnu/
+  PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 cuda/lib/x64)
+# Read the first main header that exists (unversioned name first, then the _v8 and _v7 variants)
+if(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn.h")
+  file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_HEADER_CONTENTS)
+elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_v8.h")
+  file(READ ${CUDNN_INCLUDE_DIR}/cudnn_v8.h CUDNN_HEADER_CONTENTS)
+elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_v7.h")
+  file(READ ${CUDNN_INCLUDE_DIR}/cudnn_v7.h CUDNN_HEADER_CONTENTS)
+endif()
+# When a separate version header exists, append it so the version macros can be matched below
+if(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version.h")
+  file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" CUDNN_VERSION_H_CONTENTS)
+  string(APPEND CUDNN_HEADER_CONTENTS "${CUDNN_VERSION_H_CONTENTS}")
+  unset(CUDNN_VERSION_H_CONTENTS)
+elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version_v8.h")
+  file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version_v8.h" CUDNN_VERSION_H_CONTENTS)
+  string(APPEND CUDNN_HEADER_CONTENTS "${CUDNN_VERSION_H_CONTENTS}")
+  unset(CUDNN_VERSION_H_CONTENTS)
+elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version_v7.h")
+  file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version_v7.h" CUDNN_VERSION_H_CONTENTS)
+  string(APPEND CUDNN_HEADER_CONTENTS "${CUDNN_VERSION_H_CONTENTS}")
+  unset(CUDNN_VERSION_H_CONTENTS)
+endif()
+# Extract CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL: MATCH isolates the "define ..." text,
+# REPLACE then reduces it to the captured number
+if(CUDNN_HEADER_CONTENTS)
+  string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+               _CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+               _CUDNN_VERSION_MAJOR "${_CUDNN_VERSION_MAJOR}")
+  string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+               _CUDNN_VERSION_MINOR "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+               _CUDNN_VERSION_MINOR "${_CUDNN_VERSION_MINOR}")
+  string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+               _CUDNN_VERSION_PATCH "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+               _CUDNN_VERSION_PATCH "${_CUDNN_VERSION_PATCH}")
+  # "?" marks a header in which no major version could be matched
+  if(NOT _CUDNN_VERSION_MAJOR)
+    set(CUDNN_VERSION "?")
+  else()
+    set(CUDNN_VERSION "${_CUDNN_VERSION_MAJOR}.${_CUDNN_VERSION_MINOR}.${_CUDNN_VERSION_PATCH}")
+  endif()
+endif()
+
+# Publish the documented result variables
+set(CUDNN_INCLUDE_DIRS ${CUDNN_INCLUDE_DIR})
+set(CUDNN_LIBRARIES ${CUDNN_LIBRARY})
+mark_as_advanced(CUDNN_LIBRARY CUDNN_INCLUDE_DIR)
+
+find_package_handle_standard_args(CUDNN
+      REQUIRED_VARS  CUDNN_INCLUDE_DIR CUDNN_LIBRARY
+      VERSION_VAR    CUDNN_VERSION
+)
+
+if(WIN32)
+  # On Windows the runtime DLL is searched in the "bin" folder next to the include directory.
+  # The DLL name carries the major version parsed from the header, which this module stores
+  # in _CUDNN_VERSION_MAJOR; CUDNN_VERSION_MAJOR is never set here and expanded to "cudnn64_.dll".
+  set(CUDNN_DLL_DIR ${CUDNN_INCLUDE_DIR})
+  list(TRANSFORM CUDNN_DLL_DIR APPEND "/../bin")
+  find_file(CUDNN_LIBRARY_DLL NAMES cudnn64_${_CUDNN_VERSION_MAJOR}.dll PATHS ${CUDNN_DLL_DIR})
+endif()
+
+# Define the CuDNN::CuDNN imported target once cuDNN has been found
+if(CUDNN_FOUND AND NOT TARGET CuDNN::CuDNN)
+  if(EXISTS "${CUDNN_LIBRARY_DLL}")
+    # A DLL is available: shared library with a separate import library
+    add_library(CuDNN::CuDNN SHARED IMPORTED)
+    set_target_properties(CuDNN::CuDNN PROPERTIES
+      IMPORTED_LOCATION "${CUDNN_LIBRARY_DLL}"
+      IMPORTED_IMPLIB   "${CUDNN_LIBRARY}")
+  else()
+    # Library type not known: point directly at the library that was found
+    add_library(CuDNN::CuDNN UNKNOWN IMPORTED)
+    set_target_properties(CuDNN::CuDNN PROPERTIES
+      IMPORTED_LOCATION "${CUDNN_LIBRARY}")
+  endif()
+  # Properties shared by both target flavours
+  set_target_properties(CuDNN::CuDNN PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES     "${CUDNN_INCLUDE_DIR}"
+    IMPORTED_LINK_INTERFACE_LANGUAGES "C")
+endif()
diff --git a/darknet-master/cmake/Modules/FindPThreads4W.cmake b/darknet-master/cmake/Modules/FindPThreads4W.cmake
new file mode 100644
index 0000000..ef20cc2
--- /dev/null
+++ b/darknet-master/cmake/Modules/FindPThreads4W.cmake
@@ -0,0 +1,79 @@
+# Distributed under the OSI-approved BSD 3-Clause License.
+# Copyright Stefano Sinigardi
+
+#.rst:
+# FindPThreads4W
+# ------------
+#
+# Find the PThreads4W includes and library.
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# This script defines the following variables:
+#
+# ``PThreads4W_FOUND``
+#   True if PThreads4W library found
+#
+# ``PThreads4W_VERSION``
+#   Containing the PThreads4W version tag (manually defined)
+#
+# ``PThreads4W_INCLUDE_DIR``
+#   Location of PThreads4W headers
+#
+# ``PThreads4W_LIBRARY``
+#   List of libraries to link with when using PThreads4W (no exception handling)
+#
+# Result Targets
+# ^^^^^^^^^^^^^^^^
+#
+# This script defines the following targets:
+#
+# ``PThreads4W::PThreads4W``
+#   Target to use PThreads4W (no exception handling)
+#
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
+
+# Allow the include directory to be set manually
+if(NOT PThreads4W_INCLUDE_DIR)
+  find_path(PThreads4W_INCLUDE_DIR NAMES pthread.h)
+endif()
+
+# The version is hard-coded here, it is not detected from the installation
+set(PThreads4W_MAJOR_VERSION 2)
+set(PThreads4W_MINOR_VERSION 0)
+set(PThreads4W_PATCH_VERSION 0)
+set(PThreads4W_VERSION "${PThreads4W_MAJOR_VERSION}.${PThreads4W_MINOR_VERSION}.${PThreads4W_PATCH_VERSION}")
+
+# Allow libraries to be set manually
+if(NOT PThreads4W_LIBRARY)
+  find_library(PThreads4W_LIBRARY NAMES pthreadVC${PThreads4W_MAJOR_VERSION})
+endif()
+
+find_package_handle_standard_args(PThreads4W DEFAULT_MSG PThreads4W_LIBRARY PThreads4W_INCLUDE_DIR)
+mark_as_advanced(PThreads4W_INCLUDE_DIR PThreads4W_LIBRARY )
+
+# The DLL is searched in the "bin" folder next to the include directory
+set(PThreads4W_DLL_DIR ${PThreads4W_INCLUDE_DIR})
+list(TRANSFORM PThreads4W_DLL_DIR APPEND "/../bin")
+message(STATUS "PThreads4W_DLL_DIR: ${PThreads4W_DLL_DIR}")
+
+find_file(PThreads4W_LIBRARY_DLL NAMES pthreadVC${PThreads4W_MAJOR_VERSION}.dll PATHS ${PThreads4W_DLL_DIR})
+
+# Define the PThreads4W::PThreads4W imported target (Release configuration only).
+# The DLL test uses PThreads4W_LIBRARY_DLL, the variable filled by find_file() in this module;
+# PThreads4W_LIBRARY_RELEASE_DLL is never set, so testing it made the SHARED branch unreachable.
+if( PThreads4W_FOUND AND NOT TARGET PThreads4W::PThreads4W )
+  if( EXISTS "${PThreads4W_LIBRARY_DLL}" )
+    # DLL found: shared library plus import library
+    add_library( PThreads4W::PThreads4W      SHARED IMPORTED )
+    set_target_properties( PThreads4W::PThreads4W PROPERTIES
+      IMPORTED_LOCATION_RELEASE         "${PThreads4W_LIBRARY_DLL}"
+      IMPORTED_IMPLIB                   "${PThreads4W_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES     "${PThreads4W_INCLUDE_DIR}"
+      IMPORTED_CONFIGURATIONS           Release
+      IMPORTED_LINK_INTERFACE_LANGUAGES "C" )
+  else()
+    # No DLL: link directly against the library that was found
+    add_library( PThreads4W::PThreads4W      UNKNOWN IMPORTED )
+    set_target_properties( PThreads4W::PThreads4W PROPERTIES
+      IMPORTED_LOCATION_RELEASE         "${PThreads4W_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES     "${PThreads4W_INCLUDE_DIR}"
+      IMPORTED_CONFIGURATIONS           Release
+      IMPORTED_LINK_INTERFACE_LANGUAGES "C" )
+  endif()
+endif()
diff --git a/darknet-master/cmake/Modules/FindStb.cmake b/darknet-master/cmake/Modules/FindStb.cmake
new file mode 100644
index 0000000..1db213a
--- /dev/null
+++ b/darknet-master/cmake/Modules/FindStb.cmake
@@ -0,0 +1,30 @@
+# Distributed under the OSI-approved BSD 3-Clause License.
+# Copyright Stefano Sinigardi
+
+#.rst:
+# FindStb
+# ------------
+#
+# Find the Stb include headers.
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# This module defines the following variables:
+#
+# ``Stb_FOUND``
+#   True if Stb library found
+#
+# ``Stb_INCLUDE_DIR``
+#   Location of Stb headers
+#
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
+
+# Keep a user supplied Stb_INCLUDE_DIR; otherwise look for stb_image.h
+if(NOT Stb_INCLUDE_DIR)
+  find_path(Stb_INCLUDE_DIR
+    NAMES stb_image.h
+    PATHS ${Stb_DIR}
+    PATH_SUFFIXES include)
+endif()
+
+# Sets Stb_FOUND; the standard failure message is used
+find_package_handle_standard_args(Stb REQUIRED_VARS Stb_INCLUDE_DIR)
+mark_as_advanced(Stb_INCLUDE_DIR)
diff --git a/darknet-master/darknet.py b/darknet-master/darknet.py
new file mode 100644
index 0000000..59953bc
--- /dev/null
+++ b/darknet-master/darknet.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+
+"""
+Python 3 wrapper for identifying objects in images
+
+Running the script requires opencv-python to be installed (`pip install opencv-python`)
+Directly viewing or returning bounding-boxed images requires scikit-image to be installed (`pip install scikit-image`)
+Use pip3 instead of pip on some systems to be sure to install modules for python3
+"""
+
+import ctypes as ct
+import random
+import os
+import cv2
+import numpy as np
+
+
+class BOX(ct.Structure):
+    """ctypes structure holding a bounding box as four floats: x, y, w, h."""
+    # bbox2points() in this module interprets (x, y) as the box centre and (w, h) as its size.
+    # NOTE(review): field order/types presumably mirror the C `box` struct - confirm against darknet.h
+    _fields_ = (
+        ("x", ct.c_float),
+        ("y", ct.c_float),
+        ("w", ct.c_float),
+        ("h", ct.c_float),
+    )
+
+
+# Pointer type aliases shared by the structure and function declarations below
+FloatPtr = ct.POINTER(ct.c_float)
+IntPtr = ct.POINTER(ct.c_int)
+
+
+class DETECTION(ct.Structure):
+    """ctypes structure describing one detection returned by the library."""
+    # Fields read by this module: bbox, prob (indexed by class index) and best_class_idx
+    # (-1 is skipped by remove_negatives_faster).
+    # NOTE(review): field order/types presumably must match the C `detection` struct - confirm against darknet.h
+    _fields_ = (
+        ("bbox", BOX),
+        ("classes", ct.c_int),
+        ("best_class_idx", ct.c_int),
+        ("prob", FloatPtr),
+        ("mask", FloatPtr),
+        ("objectness", ct.c_float),
+        ("sort_class", ct.c_int),
+        ("uc", FloatPtr),
+        ("points", ct.c_int),
+        ("embeddings", FloatPtr),
+        ("embedding_size", ct.c_int),
+        ("sim", ct.c_float),
+        ("track_id", ct.c_int),
+    )
+
+
+# Pointer to DETECTION
+DETECTIONPtr = ct.POINTER(DETECTION)
+
+
+class DETNUMPAIR(ct.Structure):
+    """ctypes structure pairing an int `num` with a DETECTION pointer `dets`."""
+    # NOTE(review): presumably `num` is the number of entries behind `dets` - confirm against the C API
+    _fields_ = (
+        ("num", ct.c_int),
+        ("dets", DETECTIONPtr),
+    )
+
+
+# Pointer to DETNUMPAIR
+DETNUMPAIRPtr = ct.POINTER(DETNUMPAIR)
+
+
+class IMAGE(ct.Structure):
+    """ctypes structure for an image: int fields w, h, c and a float pointer `data`."""
+    # detect_image() passes image.w and image.h to get_network_boxes.
+    # NOTE(review): presumably w/h/c are width/height/channels of the buffer behind `data` - confirm against darknet.h
+    _fields_ = (
+        ("w", ct.c_int),
+        ("h", ct.c_int),
+        ("c", ct.c_int),
+        ("data", FloatPtr),
+    )
+
+
+class METADATA(ct.Structure):
+    """ctypes structure with the class count and the array of class names."""
+    # load_network() reads names[0 .. classes-1] and decodes each entry as ASCII
+    _fields_ = (
+        ("classes", ct.c_int),
+        ("names", ct.POINTER(ct.c_char_p)),
+    )
+
+
+def network_width(net):
+    return lib.network_width(net)
+
+
+def network_height(net):
+    return lib.network_height(net)
+
+
+def bbox2points(bbox):
+    """
+    From bounding box yolo format
+    to corner points cv2 rectangle
+    """
+    x, y, w, h = bbox
+    xmin = round(x - (w / 2))
+    xmax = round(x + (w / 2))
+    ymin = round(y - (h / 2))
+    ymax = round(y + (h / 2))
+    return xmin, ymin, xmax, ymax
+
+
+def class_colors(names):
+    """
+    Create a dict with one random BGR color for each
+    class name
+    """
+    return {name: (
+        random.randint(0, 255),
+        random.randint(0, 255),
+        random.randint(0, 255)) for name in names}
+
+
+def load_network(config_file, data_file, weights, batch_size=1):
+    """
+    load model description and weights from config files
+    args:
+        config_file (str): path to .cfg model file
+        data_file (str): path to .data model file
+        weights (str): path to weights
+    returns:
+        network: trained model
+        class_names
+        class_colors
+    """
+    network = load_net_custom(
+        config_file.encode("ascii"),
+        weights.encode("ascii"), 0, batch_size)
+    metadata = load_meta(data_file.encode("ascii"))
+    class_names = [metadata.names[i].decode("ascii") for i in range(metadata.classes)]
+    colors = class_colors(class_names)
+    return network, class_names, colors
+
+
+def print_detections(detections, coordinates=False):
+    print("\nObjects:")
+    for label, confidence, bbox in detections:
+        x, y, w, h = bbox
+        if coordinates:
+            print("{}: {}%    (left_x: {:.0f}   top_y:  {:.0f}   width:   {:.0f}   height:  {:.0f})".format(label, confidence, x, y, w, h))
+        else:
+            print("{}: {}%".format(label, confidence))
+
+
+def draw_boxes(detections, image, colors):
+    import cv2
+    for label, confidence, bbox in detections:
+        left, top, right, bottom = bbox2points(bbox)
+        cv2.rectangle(image, (left, top), (right, bottom), colors[label], 1)
+        cv2.putText(image, "{} [{:.2f}]".format(label, float(confidence)),
+                    (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
+                    colors[label], 2)
+    return image
+
+
+def decode_detection(detections):
+    decoded = []
+    for label, confidence, bbox in detections:
+        confidence = str(round(confidence * 100, 2))
+        decoded.append((str(label), confidence, bbox))
+    return decoded
+
+
+# https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
+# Malisiewicz et al.
+def non_max_suppression_fast(detections, overlap_thresh):
+    boxes = []
+    for detection in detections:
+        _, _, _, (x, y, w, h) = detection
+        x1 = x - w / 2
+        y1 = y - h / 2
+        x2 = x + w / 2
+        y2 = y + h / 2
+        boxes.append(np.array([x1, y1, x2, y2]))
+    boxes_array = np.array(boxes)
+
+    # initialize the list of picked indexes
+    pick = []
+    # grab the coordinates of the bounding boxes
+    x1 = boxes_array[:, 0]
+    y1 = boxes_array[:, 1]
+    x2 = boxes_array[:, 2]
+    y2 = boxes_array[:, 3]
+    # compute the area of the bounding boxes and sort the bounding
+    # boxes by the bottom-right y-coordinate of the bounding box
+    area = (x2 - x1 + 1) * (y2 - y1 + 1)
+    idxs = np.argsort(y2)
+    # keep looping while some indexes still remain in the indexes
+    # list
+    while len(idxs) > 0:
+        # grab the last index in the indexes list and add the
+        # index value to the list of picked indexes
+        last = len(idxs) - 1
+        i = idxs[last]
+        pick.append(i)
+        # find the largest (x, y) coordinates for the start of
+        # the bounding box and the smallest (x, y) coordinates
+        # for the end of the bounding box
+        xx1 = np.maximum(x1[i], x1[idxs[:last]])
+        yy1 = np.maximum(y1[i], y1[idxs[:last]])
+        xx2 = np.minimum(x2[i], x2[idxs[:last]])
+        yy2 = np.minimum(y2[i], y2[idxs[:last]])
+        # compute the width and height of the bounding box
+        w = np.maximum(0, xx2 - xx1 + 1)
+        h = np.maximum(0, yy2 - yy1 + 1)
+        # compute the ratio of overlap
+        overlap = (w * h) / area[idxs[:last]]
+        # delete all indexes from the index list that have
+        idxs = np.delete(idxs, np.concatenate(([last],
+                                               np.where(overlap > overlap_thresh)[0])))
+        # return only the bounding boxes that were picked using the
+        # integer data type
+    return [detections[i] for i in pick]
+
+
+def remove_negatives(detections, class_names, num):
+    """
+    Collect (name, prob, (x, y, w, h)) for every class whose prob is above 0 in the first `num` detections
+    """
+    kept = []
+    for det_index in range(num):
+        for class_index, label in enumerate(class_names):
+            det = detections[det_index]
+            score = det.prob[class_index]
+            if score > 0:
+                kept.append((label, score, (det.bbox.x, det.bbox.y, det.bbox.w, det.bbox.h)))
+    return kept
+
+
+def remove_negatives_faster(detections, class_names, num):
+    """
+    Keep one (name, prob, (x, y, w, h)) per detection, for its best_class_idx; -1 means skip (handy for yolo9000)
+    """
+    kept = []
+    for det_index in range(num):
+        best = detections[det_index].best_class_idx
+        if best != -1:
+            det = detections[det_index]
+            box = det.bbox
+            entry = (class_names[best], det.prob[best], (box.x, box.y, box.w, box.h))
+            kept.append(entry)
+    return kept
+
+
+def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45):
+    """Run the network on `image`; return the decoded predictions sorted ascending by element 1 of each entry."""
+    pnum = ct.pointer(ct.c_int(0))  # out-parameter: receives the number of detections
+    predict_image(network, image)
+    detections = get_network_boxes(network, image.w, image.h,
+                                   thresh, hier_thresh, None, 0, pnum, 0)
+    num = pnum[0]
+    try:
+        if nms:  # a falsy nms (0 / None) skips non-maximum suppression
+            do_nms_sort(detections, num, len(class_names), nms)
+        predictions = decode_detection(remove_negatives(detections, class_names, num))
+        return sorted(predictions, key=lambda x: x[1])
+    finally:
+        # release the native detections array even if post-processing raised
+        free_detections(detections, num)
+
+
+cwd = os.path.dirname(os.path.abspath(__file__))  # absolute: an empty dirname would otherwise give "/libdarknet.so"
+if os.name == "posix":
+    lib = ct.CDLL(os.path.join(cwd, "libdarknet.so"), ct.RTLD_GLOBAL)
+elif os.name == "nt":
+    # put this module's directory first on PATH before asking for darknet.dll by bare name
+    os.environ["PATH"] = os.path.pathsep.join((cwd, os.environ["PATH"]))
+    lib = ct.CDLL("darknet.dll", winmode = 0, mode = ct.RTLD_GLOBAL)
+else:
+    lib = None  # Intellisense
+    print("Unsupported OS")
+    raise SystemExit  # what exit() raises, without depending on the site-provided helper
+
+lib.network_width.argtypes = (ct.c_void_p,)  # the network handle is passed as an untyped pointer
+lib.network_width.restype = ct.c_int
+lib.network_height.argtypes = (ct.c_void_p,)
+lib.network_height.restype = ct.c_int
+
+copy_image_from_bytes = lib.copy_image_from_bytes  # no restype assigned: the ctypes default (c_int) applies
+copy_image_from_bytes.argtypes = (IMAGE, ct.c_char_p)
+
+predict = lib.network_predict_ptr
+predict.argtypes = (ct.c_void_p, FloatPtr)
+predict.restype = FloatPtr
+
+set_gpu = lib.cuda_set_device  # argtypes/restype left at the ctypes defaults
+init_cpu = lib.init_cpu  # argtypes/restype left at the ctypes defaults
+
+make_image = lib.make_image
+make_image.argtypes = (ct.c_int, ct.c_int, ct.c_int)
+make_image.restype = IMAGE
+
+get_network_boxes = lib.get_network_boxes
+get_network_boxes.argtypes = (ct.c_void_p, ct.c_int, ct.c_int, ct.c_float, ct.c_float, IntPtr, ct.c_int, IntPtr,
+                              ct.c_int)
+get_network_boxes.restype = DETECTIONPtr
+
+make_network_boxes = lib.make_network_boxes
+make_network_boxes.argtypes = (ct.c_void_p,)
+make_network_boxes.restype = DETECTIONPtr
+
+free_detections = lib.free_detections
+free_detections.argtypes = (DETECTIONPtr, ct.c_int)
+
+free_batch_detections = lib.free_batch_detections
+free_batch_detections.argtypes = (DETNUMPAIRPtr, ct.c_int)
+
+free_ptrs = lib.free_ptrs
+free_ptrs.argtypes = (ct.POINTER(ct.c_void_p), ct.c_int)
+
+network_predict = lib.network_predict_ptr  # same C symbol as `predict` above
+network_predict.argtypes = (ct.c_void_p, FloatPtr)
+
+reset_rnn = lib.reset_rnn
+reset_rnn.argtypes = (ct.c_void_p,)
+
+load_net = lib.load_network
+load_net.argtypes = (ct.c_char_p, ct.c_char_p, ct.c_int)
+load_net.restype = ct.c_void_p  # the returned handle is what the other bindings take as their c_void_p argument
+
+load_net_custom = lib.load_network_custom
+load_net_custom.argtypes = (ct.c_char_p, ct.c_char_p, ct.c_int, ct.c_int)
+load_net_custom.restype = ct.c_void_p
+
+free_network_ptr = lib.free_network_ptr
+free_network_ptr.argtypes = (ct.c_void_p,)
+free_network_ptr.restype = ct.c_void_p  # NOTE(review): confirm the C function returns a pointer; if it is void, restype should be None
+
+do_nms_obj = lib.do_nms_obj
+do_nms_obj.argtypes = (DETECTIONPtr, ct.c_int, ct.c_int, ct.c_float)
+
+do_nms_sort = lib.do_nms_sort
+do_nms_sort.argtypes = (DETECTIONPtr, ct.c_int, ct.c_int, ct.c_float)
+
+free_image = lib.free_image
+free_image.argtypes = (IMAGE,)
+
+letterbox_image = lib.letterbox_image
+letterbox_image.argtypes = (IMAGE, ct.c_int, ct.c_int)
+letterbox_image.restype = IMAGE
+
+load_meta = lib.get_metadata  # its prototype is configured through lib.get_metadata on the next two lines
+lib.get_metadata.argtypes = (ct.c_char_p,)
+lib.get_metadata.restype = METADATA
+
+load_image = lib.load_image_color
+load_image.argtypes = (ct.c_char_p, ct.c_int, ct.c_int)
+load_image.restype = IMAGE
+
+rgbgr_image = lib.rgbgr_image
+rgbgr_image.argtypes = (IMAGE,)
+
+predict_image = lib.network_predict_image
+predict_image.argtypes = (ct.c_void_p, IMAGE)
+predict_image.restype = FloatPtr
+
+predict_image_letterbox = lib.network_predict_image_letterbox
+predict_image_letterbox.argtypes = (ct.c_void_p, IMAGE)
+predict_image_letterbox.restype = FloatPtr
+
+network_predict_batch = lib.network_predict_batch
+network_predict_batch.argtypes = (ct.c_void_p, IMAGE, ct.c_int, ct.c_int, ct.c_int,
+                                  ct.c_float, ct.c_float, IntPtr, ct.c_int, ct.c_int)
+network_predict_batch.restype = DETNUMPAIRPtr
diff --git a/darknet-master/darknet_images.py b/darknet-master/darknet_images.py
new file mode 100644
index 0000000..bfdd88c
--- /dev/null
+++ b/darknet-master/darknet_images.py
@@ -0,0 +1,237 @@
+import argparse
+import os
+import glob
+import random
+import time
+import cv2
+import numpy as np
+import darknet
+
+
+def parser():
+    """Declare the command-line options of this script and return the parsed arguments."""
+    arg_parser = argparse.ArgumentParser(description="YOLO Object Detection")
+    arg_parser.add_argument("--input", type=str, default="",
+                            help="image source. It can be a single image, a "
+                            "txt with paths to them, or a folder. Image valid "
+                            "formats are jpg, jpeg or png. If no input is given, image paths are asked for interactively")
+    arg_parser.add_argument("--batch_size", default=1, type=int,
+                            help="number of images to be processed at the same time")
+    arg_parser.add_argument("--weights", default="yolov4.weights",
+                            help="yolo weights path")
+    arg_parser.add_argument("--dont_show", action='store_true',
+                            help="window inference display. For headless systems")
+    arg_parser.add_argument("--ext_output", action='store_true',
+                            help="display bbox coordinates of detected objects")
+    arg_parser.add_argument("--save_labels", action='store_true',
+                            help="save detections bbox for each image in yolo format")
+    arg_parser.add_argument("--config_file", default="./cfg/yolov4.cfg",
+                            help="path to config file")
+    arg_parser.add_argument("--data_file", default="./cfg/coco.data",
+                            help="path to data file")
+    arg_parser.add_argument("--thresh", type=float, default=.25,
+                            help="remove detections with lower confidence")
+    return arg_parser.parse_args()
+
+
+def check_arguments_errors(args):
+    """Validate the parsed CLI arguments: threshold range first, then that every given path exists."""
+    assert 0 < args.thresh < 1, "Threshold should be a float between zero and one (non-inclusive)"
+    checks = [("Invalid config path {}", args.config_file), ("Invalid weight path {}", args.weights),
+              ("Invalid data file path {}", args.data_file)]
+    if args.input:  # an empty --input is accepted and not checked
+        checks.append(("Invalid image path {}", args.input))
+    for message, path in checks:
+        if not os.path.exists(path):
+            raise ValueError(message.format(os.path.abspath(path)))
+
+
+def check_batch_shape(images, batch_size):
+    """Return the shape shared by all images; raise ValueError unless there are exactly batch_size same-shaped images."""
+    shapes = [image.shape for image in images]
+    if not shapes:
+        raise ValueError("No images given")
+    if len(set(shapes)) > 1:
+        raise ValueError("Images don't have same shape")
+    if len(shapes) != batch_size:  # fewer images would make batch_detection index past the list; more was already rejected
+        raise ValueError("Number of images ({}) differs from batch size ({})".format(len(shapes), batch_size))
+    return shapes[0]
+
+
+def load_images(images_path):
+    """
+    Resolve images_path to a list of image paths.
+    A path ending in .jpg, .jpeg or .png (any letter case) is returned as a one-item list
+    A path ending in .txt is read and its non-blank lines are returned as paths
+    Anything else is treated as a folder: its *.jpg, *.png and *.jpeg entries are returned
+    """
+    extension = os.path.splitext(images_path)[1].lower()
+    if extension in ('.jpg', '.jpeg', '.png'):
+        return [images_path]
+    if extension == ".txt":
+        with open(images_path, "r") as f:
+            return [line for line in f.read().splitlines() if line.strip()]
+    # folder case: results keep the jpg, png, jpeg grouping order
+    return glob.glob(
+        os.path.join(images_path, "*.jpg")) + \
+        glob.glob(os.path.join(images_path, "*.png")) + \
+        glob.glob(os.path.join(images_path, "*.jpeg"))
+
+
+def prepare_batch(images, network):
+    """Resize every image to the network input size and pack them all as one flat float32 array divided by 255."""
+    net_size = (darknet.network_width(network), darknet.network_height(network))
+
+    def to_planar_rgb(bgr_image):
+        # channel swap (BGR2RGB), resize to the network input, then move the channel axis first
+        rgb = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+        resized = cv2.resize(rgb, net_size, interpolation=cv2.INTER_LINEAR)
+        return resized.transpose(2, 0, 1)
+
+    # the channel-first arrays of all images are joined along axis 0, one image after another
+    stacked = np.concatenate([to_planar_rgb(image) for image in images], axis=0)
+    flat = np.ascontiguousarray(stacked.flat, dtype=np.float32)
+    scaled = flat/255.0
+    return scaled
+
+def image_detection(image_or_path, network, class_names, class_colors, thresh):
+    """Detect objects in an image (path or loaded array); return (network-sized image with boxes drawn, detections)."""
+    width = darknet.network_width(network)
+    height = darknet.network_height(network)
+    image = cv2.imread(image_or_path) if isinstance(image_or_path, str) else image_or_path
+    if image is None:  # cv2.imread reports an unreadable file by returning None
+        raise ValueError("Unable to read image {}".format(image_or_path))
+    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image_resized = cv2.resize(image_rgb, (width, height),
+                               interpolation=cv2.INTER_LINEAR)
+    # Darknet doesn't accept numpy images: copy the pixels into a native image
+    darknet_image = darknet.make_image(width, height, 3)
+    try:
+        darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
+        detections = darknet.detect_image(network, class_names, darknet_image, thresh=thresh)
+    finally:
+        darknet.free_image(darknet_image)  # always release the native buffer
+    image = darknet.draw_boxes(detections, image_resized, class_colors)
+    # swap the R and B channels once more before returning
+    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB), detections
+
+
+def batch_detection(network, images, class_names, class_colors,
+                    thresh=0.25, hier_thresh=.5, nms=.45, batch_size=4):
+    """Run one batched forward pass; return (images with boxes drawn in place, list of per-image predictions)."""
+    image_height, image_width, _ = check_batch_shape(images, batch_size)
+    batch_array = prepare_batch(images, network)
+    batch_pointer = batch_array.ctypes.data_as(darknet.FloatPtr)  # FloatPtr: the float-pointer type darknet.py declares for network inputs
+    darknet_images = darknet.IMAGE(image_width, image_height, 3, batch_pointer)
+    batch_detections = darknet.network_predict_batch(network, darknet_images, batch_size, image_width,
+                                                     image_height, thresh, hier_thresh, None, 0, 0)
+    batch_predictions = []
+    for idx in range(batch_size):
+        num, detections = batch_detections[idx].num, batch_detections[idx].dets
+        if nms:
+            darknet.do_nms_obj(detections, num, len(class_names), nms)
+        predictions = darknet.remove_negatives(detections, class_names, num)
+        images[idx] = darknet.draw_boxes(predictions, images[idx], class_colors)
+        batch_predictions.append(predictions)
+    darknet.free_batch_detections(batch_detections, batch_size)
+    return images, batch_predictions
+
+
+def image_classification(image, network, class_names):
+    """Classify an image; return (class name, score) pairs ordered from the highest score to the lowest."""
+    net_size = (darknet.network_width(network), darknet.network_height(network))
+    resized_rgb = cv2.resize(cv2.cvtColor(image, cv2.COLOR_BGR2RGB), net_size,
+                             interpolation=cv2.INTER_LINEAR)
+    native_image = darknet.make_image(net_size[0], net_size[1], 3)
+    darknet.copy_image_from_bytes(native_image, resized_rgb.tobytes())
+    scores = darknet.predict_image(network, native_image)
+    # read one score per class name from the returned pointer
+    ranked = [(name, scores[idx]) for idx, name in enumerate(class_names)]
+    darknet.free_image(native_image)
+    return sorted(ranked, key=lambda pair: -pair[1])
+
+
+def convert2relative(image, bbox):
+    """
+    Scale bbox (x, y, w, h) by the image size: x and w are divided by its width, y and h by its height
+    """
+    img_h, img_w, _ = image.shape
+    box_x, box_y, box_w, box_h = bbox
+    return box_x/img_w, box_y/img_h, box_w/img_w, box_h/img_h
+
+
+def save_annotations(name, image, detections, class_names):
+    """
+    Write one "class_index x y w h confidence" line per detection to <name without extension>.txt
+    """
+    target = os.path.splitext(name)[0] + ".txt"
+    line_format = "{} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}\n"
+    with open(target, "w") as out:
+        for label, confidence, bbox in detections:
+            rel_x, rel_y, rel_w, rel_h = convert2relative(image, bbox)
+            out.write(line_format.format(class_names.index(label), rel_x, rel_y, rel_w, rel_h, float(confidence)))
+
+
+def batch_detection_example():
+    """Detect on three sample images in one batch, write the annotated copies and print the detections."""
+    args = parser()
+    check_arguments_errors(args)
+    sample_paths = ['data/horses.jpg', 'data/horses.jpg', 'data/eagle.jpg']
+    batch_size = 3
+    random.seed(3)  # deterministic bbox colors
+    network, class_names, class_colors = darknet.load_network(
+        args.config_file,
+        args.data_file,
+        args.weights,
+        batch_size=batch_size
+    )
+    frames = [cv2.imread(path) for path in sample_paths]
+    frames, detections = batch_detection(network, frames, class_names, class_colors, batch_size=batch_size)
+    for path, frame in zip(sample_paths, frames):
+        cv2.imwrite(path.replace("data/", ""), frame)
+    print(detections)
+
+
+def main():
+    """Load the network, then detect on each input image (or on prompted paths) until done or 'q' is pressed."""
+    args = parser()
+    check_arguments_errors(args)
+
+    random.seed(3)  # deterministic bbox colors
+    network, class_names, class_colors = darknet.load_network(
+        args.config_file,
+        args.data_file,
+        args.weights,
+        batch_size=args.batch_size
+    )
+
+    images = load_images(args.input)
+
+    def image_names():
+        # with --input: every resolved path once; without it: keep asking for paths
+        if args.input:
+            yield from images
+        else:
+            while True:
+                yield input("Enter Image Path: ")
+
+    for image_name in image_names():
+        started = time.time()
+        image, detections = image_detection(
+            image_name, network, class_names, class_colors, args.thresh
+            )
+        if args.save_labels:
+            save_annotations(image_name, image, detections, class_names)
+        darknet.print_detections(detections, args.ext_output)
+        fps = int(1/(time.time() - started))
+        print("FPS: {}".format(fps))
+        if not args.dont_show:
+            cv2.imshow('Inference', image)
+            if cv2.waitKey() & 0xFF == ord('q'):
+                break
+
+
+if __name__ == "__main__":
+    # uncomment the next line for an example of batch processing
+    # batch_detection_example()
+    main()
diff --git a/darknet-master/darknet_video.py b/darknet-master/darknet_video.py
new file mode 100644
index 0000000..c6682a2
--- /dev/null
+++ b/darknet-master/darknet_video.py
@@ -0,0 +1,213 @@
+import random
+import os
+import cv2
+import time
+import darknet
+import argparse
+import threading
+import queue
+
+
+def parser():
+    """Declare this script's command-line options and return the parsed arguments."""
+    cli = argparse.ArgumentParser(description="YOLO Object Detection")
+    # (flag, add_argument keyword arguments), registered in this order
+    options = (
+        ("--input", dict(type=str, default=0, help="video source. If empty, uses webcam 0 stream")),
+        ("--out_filename", dict(type=str, default="", help="inference video name. Not saved if empty")),
+        ("--weights", dict(default="yolov4.weights", help="yolo weights path")),
+        ("--dont_show", dict(action="store_true", help="window inference display. For headless systems")),
+        ("--ext_output", dict(action="store_true", help="display bbox coordinates of detected objects")),
+        ("--config_file", dict(default="./cfg/yolov4.cfg", help="path to config file")),
+        ("--data_file", dict(default="./cfg/coco.data", help="path to data file")),
+        ("--thresh", dict(type=float, default=.25, help="remove detections with confidence below this value")),
+    )
+
+    for flag, settings in options:
+        cli.add_argument(flag, **settings)
+
+    return cli.parse_args()
+
+
+def str2int(video_path):
+    """
+    Return int(video_path) if it converts (webcam index 0, 1 ...); on ValueError return video_path unchanged
+    """
+    try:
+        webcam_index = int(video_path)
+    except ValueError:
+        return video_path
+    return webcam_index
+
+
+def check_arguments_errors(args):
+    """Validate the parsed arguments: AssertionError for a bad threshold, ValueError for a path that does not exist."""
+    assert 0 < args.thresh < 1, "Threshold should be a float between zero and one (non-inclusive)"
+    checks = [("Invalid config path {}", args.config_file), ("Invalid weight path {}", args.weights),
+              ("Invalid data file path {}", args.data_file)]
+    if isinstance(str2int(args.input), str) and "://" not in args.input:  # webcam indexes and stream URLs are not files
+        checks.append(("Invalid video path {}", args.input))
+    for message, path in checks:
+        if not os.path.exists(path):
+            raise ValueError(message.format(os.path.abspath(path)))
+
+
+def set_saved_video(output_video, size, fps):
+    """Create a cv2.VideoWriter for output_video with the MJPG fourcc, the given fps and frame size."""
+    return cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*"MJPG"), fps, size)
+
+
+def convert2relative(bbox, preproc_h, preproc_w):
+    """
+    Divide bbox x and w by preproc_w, and y and h by preproc_h (relative coordinates, as YOLO annotations use)
+    """
+    box_x, box_y, box_w, box_h = bbox
+    return box_x / preproc_w, box_y / preproc_h, box_w / preproc_w, box_h / preproc_h
+
+
+def convert2original(image, bbox, preproc_h, preproc_w):
+    """Rescale bbox (x, y, w, h) from the preproc_w x preproc_h frame to the size of `image`, truncating to int."""
+    box_x, box_y, box_w, box_h = bbox
+
+    image_h, image_w, _channels = image.shape
+
+    # relative value = value / preprocessed size; original value = relative value * image size
+    return (
+        int(box_x / preproc_w * image_w),
+        int(box_y / preproc_h * image_h),
+        int(box_w / preproc_w * image_w),
+        int(box_h / preproc_h * image_h),
+    )
+
+
+# @TODO - cfati: Unused
+def convert4cropping(image, bbox, preproc_h, preproc_w):
+    """
+    Turn bbox (x, y, w, h), read as centre and size in the preprocessed frame, into (left, top, right, bottom) on `image`
+    """
+    box_x, box_y, box_w, box_h = bbox
+    rel_x, rel_w = box_x / preproc_w, box_w / preproc_w
+    rel_y, rel_h = box_y / preproc_h, box_h / preproc_h
+
+    image_h, image_w, _channels = image.shape
+
+    left = int((rel_x - rel_w / 2.) * image_w)
+    right = int((rel_x + rel_w / 2.) * image_w)
+    top = int((rel_y - rel_h / 2.) * image_h)
+    bottom = int((rel_y + rel_h / 2.) * image_h)
+
+    # clamp: left/top are only bounded below (0), right/bottom only above (last pixel index)
+    left = max(left, 0)
+    right = min(right, image_w - 1)
+    top = max(top, 0)
+    bottom = min(bottom, image_h - 1)
+
+    return (left, top, right, bottom)
+
+
+def video_capture(stop_flag, input_path, raw_frame_queue, preprocessed_frame_queue, preproc_h, preproc_w):
+    cap = cv2.VideoCapture(input_path)
+    while cap.isOpened() and not stop_flag.is_set():
+        ret, frame = cap.read()
+        if not ret:  # no frame could be read: leave the loop
+            break
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frame_resized = cv2.resize(frame_rgb, (preproc_w, preproc_h),
+                                   interpolation=cv2.INTER_LINEAR)
+        raw_frame_queue.put(frame)  # the untouched frame, consumed by drawing()
+        img_for_detect = darknet.make_image(preproc_w, preproc_h, 3)  # a new native image per frame
+        darknet.copy_image_from_bytes(img_for_detect, frame_resized.tobytes())
+        preprocessed_frame_queue.put(img_for_detect)  # released later by inference() via darknet.free_image
+    stop_flag.set()  # capture is over: signal the other threads to stop too
+    cap.release()
+
+
+def inference(stop_flag, preprocessed_frame_queue, detections_queue, fps_queue,
+              network, class_names, threshold):
+    while not stop_flag.is_set():
+        darknet_image = preprocessed_frame_queue.get()  # blocks until a preprocessed frame is available
+        prev_time = time.time()
+        detections = darknet.detect_image(network, class_names, darknet_image, thresh=threshold)
+        fps = 1 / (time.time() - prev_time)  # rate of the detect_image call alone
+        detections_queue.put(detections)
+        fps_queue.put(int(fps))  # truncated to int, so it is 0 when detection takes longer than a second
+        print("FPS: {:.2f}".format(fps))
+        darknet.print_detections(detections, args.ext_output)  # `args` is the module-level namespace set under __main__
+        darknet.free_image(darknet_image)  # releases the native image allocated in video_capture()
+
+
+def drawing(stop_flag, input_video_fps, queues, preproc_h, preproc_w, vid_h, vid_w):
+    """Thread target: draw each frame's detections, show and/or save the frame, stop on Esc (key code 27)."""
+    random.seed(3)  # deterministic bbox colors
+    raw_frame_queue, preprocessed_frame_queue, detections_queue, fps_queue = queues
+    video = set_saved_video(args.out_filename, (vid_w, vid_h), input_video_fps)
+    fps = 1
+    while not stop_flag.is_set():
+        frame = raw_frame_queue.get()
+        detections = detections_queue.get()
+        fps = fps_queue.get()
+        if frame is not None:
+            detections_adjusted = [(str(label), confidence, convert2original(frame, bbox, preproc_h, preproc_w))
+                                   for label, confidence, bbox in detections]
+            image = darknet.draw_boxes(detections_adjusted, frame, class_colors)
+            if not args.dont_show:
+                cv2.imshow("Inference", image)
+            if args.out_filename:  # the parser default is "", which means "do not save"
+                video.write(image)
+            # waitKey(0) waits for a key press without limit, so the delay is kept at 1 ms or more
+            if cv2.waitKey(max(fps, 1)) == 27:
+                break
+    stop_flag.set()
+    video.release()
+    cv2.destroyAllWindows()
+    timeout = 1 / (fps if fps > 0 else 0.5)
+    for q in (preprocessed_frame_queue, detections_queue, fps_queue):
+        try:
+            q.get(block=True, timeout=timeout)
+        except queue.Empty:
+            pass
+
+
+if __name__ == "__main__":
+    args = parser()  # module-level name: inference() and drawing() read `args` as a global
+    check_arguments_errors(args)
+    network, class_names, class_colors = darknet.load_network(  # drawing() reads class_colors as a global
+        args.config_file,
+        args.data_file,
+        args.weights,
+        batch_size=1)
+    darknet_width = darknet.network_width(network)
+    darknet_height = darknet.network_height(network)
+    input_path = str2int(args.input)
+    cap = cv2.VideoCapture(input_path)  # Open video twice :(
+    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    video_fps = int(cap.get(cv2.CAP_PROP_FPS))
+    cap.release()  # only the properties were needed here; video_capture() opens the source again
+    del cap
+
+    ExecUnit = threading.Thread
+    Queue = queue.Queue
+    stop_flag = threading.Event()  # shared by the three threads; each loop polls it
+
+    raw_frame_queue = Queue()  # unbounded
+    preprocessed_frame_queue = Queue(maxsize=1)  # bounded: put() blocks while an item is waiting
+    detections_queue = Queue(maxsize=1)
+    fps_queue = Queue(maxsize=1)
+
+    exec_units = (
+        ExecUnit(target=video_capture, args=(stop_flag, input_path, raw_frame_queue, preprocessed_frame_queue,
+                                             darknet_height, darknet_width)),
+        ExecUnit(target=inference, args=(stop_flag, preprocessed_frame_queue, detections_queue, fps_queue,
+                                         network, class_names, args.thresh)),
+        ExecUnit(target=drawing, args=(stop_flag, video_fps,
+                                       (raw_frame_queue, preprocessed_frame_queue, detections_queue, fps_queue),
+                                       darknet_height, darknet_width, video_height, video_width)),
+    )
+    for exec_unit in exec_units:
+        exec_unit.start()
+    for exec_unit in exec_units:  # wait for capture, inference and drawing to finish
+        exec_unit.join()
+
+    print("\nDone.")
+
diff --git a/darknet-master/docker-compose.yml b/darknet-master/docker-compose.yml
new file mode 100644
index 0000000..968fbcc
--- /dev/null
+++ b/darknet-master/docker-compose.yml
@@ -0,0 +1,20 @@
+version: '2'
+
+services:
+  yolo-gpu:
+    build: 
+      context: .
+      dockerfile: Dockerfile.gpu
+    image: yolo:gpu
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+  yolo-cpu:
+    build: 
+      context: .
+      dockerfile: Dockerfile.cpu
+    image: yolo:cpu
\ No newline at end of file
diff --git a/darknet-master/image_yolov3.sh b/darknet-master/image_yolov3.sh
new file mode 100644
index 0000000..49cc5eb
--- /dev/null
+++ b/darknet-master/image_yolov3.sh
@@ -0,0 +1,6 @@
+
+
+./darknet detector test ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights data/dog.jpg -i 0 -thresh 0.25
+
+
+
diff --git a/darknet-master/image_yolov4.sh b/darknet-master/image_yolov4.sh
new file mode 100644
index 0000000..be3fd10
--- /dev/null
+++ b/darknet-master/image_yolov4.sh
@@ -0,0 +1,6 @@
+
+
+./darknet detector test ./cfg/coco.data ./cfg/yolov4.cfg ./yolov4.weights data/dog.jpg -i 0 -thresh 0.25
+
+
+
diff --git a/darknet-master/include/darknet.h b/darknet-master/include/darknet.h
new file mode 100644
index 0000000..55ab50d
--- /dev/null
+++ b/darknet-master/include/darknet.h
@@ -0,0 +1,1125 @@
+#ifndef DARKNET_API
+#define DARKNET_API
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define inline __inline
+#endif
+
+#if defined(DEBUG) && !defined(_CRTDBG_MAP_ALLOC)
+#define _CRTDBG_MAP_ALLOC
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <pthread.h>
+
+#ifndef LIB_API  // skipped entirely when LIB_API is already defined
+#ifdef LIB_EXPORTS  // LIB_EXPORTS defined: LIB_API marks symbols as exported
+#if defined(_MSC_VER)
+#define LIB_API __declspec(dllexport)
+#else
+#define LIB_API __attribute__((visibility("default")))
+#endif
+#else  // LIB_EXPORTS not defined: LIB_API expands to nothing in both compiler branches below
+#if defined(_MSC_VER)
+#define LIB_API
+#else
+#define LIB_API
+#endif
+#endif  // LIB_EXPORTS
+#endif  // LIB_API
+
+#define SECRET_NUM -1234
+
+typedef enum { UNUSED_DEF_VAL } UNUSED_ENUM_TYPE;
+
+#ifdef GPU
+
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#ifdef CUDNN
+#include <cudnn.h>
+#endif  // CUDNN
+#endif  // GPU
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct network;
+typedef struct network network;
+
+struct network_state;
+typedef struct network_state network_state;
+
+struct layer;
+typedef struct layer layer;
+
+struct image;
+typedef struct image image;
+
+struct detection;
+typedef struct detection detection;
+
+struct load_args;
+typedef struct load_args load_args;
+
+struct data;
+typedef struct data data;
+
+struct metadata;
+typedef struct metadata metadata;
+
+struct tree;
+typedef struct tree tree;
+
+extern int gpu_index;
+
+// option_list.h
+typedef struct metadata {
+    int classes;
+    char **names;
+} metadata;
+
+
+// tree.h
+typedef struct tree {
+    int *leaf;
+    int n;
+    int *parent;
+    int *child;
+    int *group;
+    char **name;
+
+    int groups;
+    int *group_size;
+    int *group_offset;
+} tree;
+
+
+// activations.h
+typedef enum {
+    LOGISTIC, RELU, RELU6, RELIE, LINEAR, RAMP, TANH, PLSE, REVLEAKY, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, GELU, SWISH, MISH, HARD_MISH, NORM_CHAN, NORM_CHAN_SOFTMAX, NORM_CHAN_SOFTMAX_MAXVAL
+}ACTIVATION;
+
+// parser.h
+typedef enum {
+    IOU, GIOU, MSE, DIOU, CIOU
+} IOU_LOSS;
+
+// parser.h
+typedef enum {
+    DEFAULT_NMS, GREEDY_NMS, DIOU_NMS, CORNERS_NMS
+} NMS_KIND;
+
+// parser.h - each enumerator below is a distinct single bit (values 1, 2 and 4)
+typedef enum {
+    YOLO_CENTER = 1 << 0, YOLO_LEFT_TOP = 1 << 1, YOLO_RIGHT_BOTTOM = 1 << 2
+} YOLO_POINT;
+
+// parser.h
+typedef enum {
+    NO_WEIGHTS, PER_FEATURE, PER_CHANNEL
+} WEIGHTS_TYPE_T;
+
+// parser.h
+typedef enum {
+    NO_NORMALIZATION, RELU_NORMALIZATION, SOFTMAX_NORMALIZATION
+} WEIGHTS_NORMALIZATION_T;
+
+// image.h
+typedef enum{
+    PNG, BMP, TGA, JPG
+} IMTYPE;
+
+// activations.h
+typedef enum{
+    MULT, ADD, SUB, DIV
+} BINARY_ACTIVATION;
+
+// blas.h
+typedef struct contrastive_params {
+    float sim;
+    float exp_sim;
+    float P;
+    int i, j;
+    int time_step_i, time_step_j;
+} contrastive_params;
+
+
+// layer.h
+typedef enum {
+    CONVOLUTIONAL,
+    DECONVOLUTIONAL,
+    CONNECTED,
+    MAXPOOL,
+    LOCAL_AVGPOOL,
+    SOFTMAX,
+    DETECTION,
+    DROPOUT,
+    CROP,
+    ROUTE,
+    COST,
+    NORMALIZATION,
+    AVGPOOL,
+    LOCAL,
+    SHORTCUT,
+    SCALE_CHANNELS,
+    SAM,
+    ACTIVE,
+    RNN,
+    GRU,
+    LSTM,
+    CONV_LSTM,
+    HISTORY,
+    CRNN,
+    BATCHNORM,
+    NETWORK,
+    XNOR,
+    REGION,
+    YOLO,
+    GAUSSIAN_YOLO,
+    ISEG,
+    REORG,
+    REORG_OLD,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
+    EMPTY,
+    BLANK,
+    CONTRASTIVE,
+    IMPLICIT
+} LAYER_TYPE;
+
+// layer.h
+typedef enum{
+    SSE, MASKED, L1, SEG, SMOOTH,WGAN
+} COST_TYPE;
+
+// layer.h
+typedef struct update_args {
+    int batch;
+    float learning_rate;
+    float momentum;
+    float decay;
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+    int t;
+} update_args;
+
+// layer.h
+struct layer {
+    LAYER_TYPE type;
+    ACTIVATION activation;
+    ACTIVATION lstm_activation;
+    COST_TYPE cost_type;
+    void(*forward)   (struct layer, struct network_state);
+    void(*backward)  (struct layer, struct network_state);
+    void(*update)    (struct layer, int, float, float, float);
+    void(*forward_gpu)   (struct layer, struct network_state);
+    void(*backward_gpu)  (struct layer, struct network_state);
+    void(*update_gpu)    (struct layer, int, float, float, float, float);
+    layer *share_layer;
+    int train;
+    int avgpool;
+    int batch_normalize;
+    int shortcut;
+    int batch;
+    int dynamic_minibatch;
+    int forced;
+    int flipped;
+    int inputs;
+    int outputs;
+    float mean_alpha;
+    int nweights;
+    int nbiases;
+    int extra;
+    int truths;
+    int h, w, c;
+    int out_h, out_w, out_c;
+    int n;
+    int max_boxes;
+    int truth_size;
+    int groups;
+    int group_id;
+    int size;
+    int side;
+    int stride;
+    int stride_x;
+    int stride_y;
+    int dilation;
+    int antialiasing;
+    int maxpool_depth;
+    int maxpool_zero_nonmax;
+    int out_channels;
+    float reverse;
+    int coordconv;
+    int flatten;
+    int spatial;
+    int pad;
+    int sqrt;
+    int flip;
+    int index;
+    int scale_wh;
+    int binary;
+    int xnor;
+    int peephole;
+    int use_bin_output;
+    int keep_delta_gpu;
+    int optimized_memory;
+    int steps;
+    int history_size;
+    int bottleneck;
+    float time_normalizer;
+    int state_constrain;
+    int hidden;
+    int truth;
+    float smooth;
+    float dot;
+    int deform;
+    int grad_centr;
+    int sway;
+    int rotate;
+    int stretch;
+    int stretch_sway;
+    float angle;
+    float jitter;
+    float resize;
+    float saturation;
+    float exposure;
+    float shift;
+    float ratio;
+    float learning_rate_scale;
+    float clip;
+    int focal_loss;
+    float *classes_multipliers;
+    float label_smooth_eps;
+    int noloss;
+    int softmax;
+    int classes;
+    int detection;
+    int embedding_layer_id;
+    float *embedding_output;
+    int embedding_size;
+    float sim_thresh;
+    int track_history_size;
+    int dets_for_track;
+    int dets_for_show;
+    float track_ciou_norm;
+    int coords;
+    int background;
+    int rescore;
+    int objectness;
+    int does_cost;
+    int joint;
+    int noadjust;
+    int reorg;
+    int log;
+    int tanh;
+    int *mask;
+    int total;
+    float bflops;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int t;
+
+    float alpha;
+    float beta;
+    float kappa;
+
+    float coord_scale;
+    float object_scale;
+    float noobject_scale;
+    float mask_scale;
+    float class_scale;
+    int bias_match;
+    float random;
+    float ignore_thresh;
+    float truth_thresh;
+    float iou_thresh;
+    float thresh;
+    float focus;
+    int classfix;
+    int absolute;
+    int assisted_excitation;
+
+    int onlyforward;
+    int stopbackward;
+    int train_only_bn;
+    int dont_update;
+    int burnin_update;
+    int dontload;
+    int dontsave;
+    int dontloadscales;
+    int numload;
+
+    float temperature;
+    float probability;
+    float dropblock_size_rel;
+    int dropblock_size_abs;
+    int dropblock;
+    float scale;
+
+    int receptive_w;
+    int receptive_h;
+    int receptive_w_scale;
+    int receptive_h_scale;
+
+    char  * cweights;
+    int   * indexes;
+    int   * input_layers;
+    int   * input_sizes;
+    float **layers_output;
+    float **layers_delta;
+    WEIGHTS_TYPE_T weights_type;
+    WEIGHTS_NORMALIZATION_T weights_normalization;
+    int   * map;
+    int   * counts;
+    float ** sums;
+    float * rand;
+    float * cost;
+    int *labels;
+    int *class_ids;
+    int contrastive_neg_max;
+    float *cos_sim;
+    float *exp_cos_sim;
+    float *p_constrastive;
+    contrastive_params *contrast_p_gpu;
+    float * state;
+    float * prev_state;
+    float * forgot_state;
+    float * forgot_delta;
+    float * state_delta;
+    float * combine_cpu;
+    float * combine_delta_cpu;
+
+    float *concat;
+    float *concat_delta;
+
+    float *binary_weights;
+
+    float *biases;
+    float *bias_updates;
+
+    float *scales;
+    float *scale_updates;
+
+    float *weights_ema;
+    float *biases_ema;
+    float *scales_ema;
+
+    float *weights;
+    float *weight_updates;
+
+    float scale_x_y;
+    int objectness_smooth;
+    int new_coords;
+    int show_details;
+    float max_delta;
+    float uc_normalizer;
+    float iou_normalizer;
+    float obj_normalizer;
+    float cls_normalizer;
+    float delta_normalizer;
+    IOU_LOSS iou_loss;
+    IOU_LOSS iou_thresh_kind;
+    NMS_KIND nms_kind;
+    float beta_nms;
+    YOLO_POINT yolo_point;
+
+    char *align_bit_weights_gpu;
+    float *mean_arr_gpu;
+    float *align_workspace_gpu;
+    float *transposed_align_workspace_gpu;
+    int align_workspace_size;
+
+    char *align_bit_weights;
+    float *mean_arr;
+    int align_bit_weights_size;
+    int lda_align;
+    int new_lda;
+    int bit_align;
+
+    float *col_image;
+    float * delta;
+    float * output;
+    float * activation_input;
+    int delta_pinned;
+    int output_pinned;
+    float * loss;
+    float * squared;
+    float * norms;
+
+    float * spatial_mean;
+    float * mean;
+    float * variance;
+
+    float * mean_delta;
+    float * variance_delta;
+
+    float * rolling_mean;
+    float * rolling_variance;
+
+    float * x;
+    float * x_norm;
+
+    float * m;
+    float * v;
+
+    float * bias_m;
+    float * bias_v;
+    float * scale_m;
+    float * scale_v;
+
+
+    float *z_cpu;
+    float *r_cpu;
+    float *h_cpu;
+    float *stored_h_cpu;
+    float * prev_state_cpu;
+
+    float *temp_cpu;
+    float *temp2_cpu;
+    float *temp3_cpu;
+
+    float *dh_cpu;
+    float *hh_cpu;
+    float *prev_cell_cpu;
+    float *cell_cpu;
+    float *f_cpu;
+    float *i_cpu;
+    float *g_cpu;
+    float *o_cpu;
+    float *c_cpu;
+    float *stored_c_cpu;
+    float *dc_cpu;
+
+    float *binary_input;
+    uint32_t *bin_re_packed_input;
+    char *t_bit_input;
+
+    struct layer *input_layer;
+    struct layer *self_layer;
+    struct layer *output_layer;
+
+    struct layer *reset_layer;
+    struct layer *update_layer;
+    struct layer *state_layer;
+
+    struct layer *input_gate_layer;
+    struct layer *state_gate_layer;
+    struct layer *input_save_layer;
+    struct layer *state_save_layer;
+    struct layer *input_state_layer;
+    struct layer *state_state_layer;
+
+    struct layer *input_z_layer;
+    struct layer *state_z_layer;
+
+    struct layer *input_r_layer;
+    struct layer *state_r_layer;
+
+    struct layer *input_h_layer;
+    struct layer *state_h_layer;
+
+    struct layer *wz;
+    struct layer *uz;
+    struct layer *wr;
+    struct layer *ur;
+    struct layer *wh;
+    struct layer *uh;
+    struct layer *uo;
+    struct layer *wo;
+    struct layer *vo;
+    struct layer *uf;
+    struct layer *wf;
+    struct layer *vf;
+    struct layer *ui;
+    struct layer *wi;
+    struct layer *vi;
+    struct layer *ug;
+    struct layer *wg;
+
+    tree *softmax_tree;
+
+    size_t workspace_size;
+
+//#ifdef GPU
+    int *indexes_gpu;
+
+    int stream;
+    int wait_stream_id;
+
+    float *z_gpu;
+    float *r_gpu;
+    float *h_gpu;
+    float *stored_h_gpu;
+    float *bottelneck_hi_gpu;
+    float *bottelneck_delta_gpu;
+
+    float *temp_gpu;
+    float *temp2_gpu;
+    float *temp3_gpu;
+
+    float *dh_gpu;
+    float *hh_gpu;
+    float *prev_cell_gpu;
+    float *prev_state_gpu;
+    float *last_prev_state_gpu;
+    float *last_prev_cell_gpu;
+    float *cell_gpu;
+    float *f_gpu;
+    float *i_gpu;
+    float *g_gpu;
+    float *o_gpu;
+    float *c_gpu;
+    float *stored_c_gpu;
+    float *dc_gpu;
+
+    // adam
+    float *m_gpu;
+    float *v_gpu;
+    float *bias_m_gpu;
+    float *scale_m_gpu;
+    float *bias_v_gpu;
+    float *scale_v_gpu;
+
+    float * combine_gpu;
+    float * combine_delta_gpu;
+
+    float * forgot_state_gpu;
+    float * forgot_delta_gpu;
+    float * state_gpu;
+    float * state_delta_gpu;
+    float * gate_gpu;
+    float * gate_delta_gpu;
+    float * save_gpu;
+    float * save_delta_gpu;
+    float * concat_gpu;
+    float * concat_delta_gpu;
+
+    float *binary_input_gpu;
+    float *binary_weights_gpu;
+    float *bin_conv_shortcut_in_gpu;
+    float *bin_conv_shortcut_out_gpu;
+
+    float * mean_gpu;
+    float * variance_gpu;
+    float * m_cbn_avg_gpu;
+    float * v_cbn_avg_gpu;
+
+    float * rolling_mean_gpu;
+    float * rolling_variance_gpu;
+
+    float * variance_delta_gpu;
+    float * mean_delta_gpu;
+
+    float * col_image_gpu;
+
+    float * x_gpu;
+    float * x_norm_gpu;
+    float * weights_gpu;
+    float * weight_updates_gpu;
+    float * weight_deform_gpu;
+    float * weight_change_gpu;
+
+    float * weights_gpu16;
+    float * weight_updates_gpu16;
+
+    float * biases_gpu;
+    float * bias_updates_gpu;
+    float * bias_change_gpu;
+
+    float * scales_gpu;
+    float * scale_updates_gpu;
+    float * scale_change_gpu;
+
+    float * input_antialiasing_gpu;
+    float * output_gpu;
+    float * output_avg_gpu;
+    float * activation_input_gpu;
+    float * loss_gpu;
+    float * delta_gpu;
+    float * cos_sim_gpu;
+    float * rand_gpu;
+    float * drop_blocks_scale;
+    float * drop_blocks_scale_gpu;
+    float * squared_gpu;
+    float * norms_gpu;
+
+    float *gt_gpu;
+    float *a_avg_gpu;
+
+    int *input_sizes_gpu;
+    float **layers_output_gpu;
+    float **layers_delta_gpu;
+#ifdef CUDNN
+    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
+    cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16;
+    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
+    cudnnTensorDescriptor_t dsrcTensorDesc16, ddstTensorDesc16;
+    cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc, normDstTensorDescF16;
+    cudnnFilterDescriptor_t weightDesc, weightDesc16;
+    cudnnFilterDescriptor_t dweightDesc, dweightDesc16;
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16;
+    cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16;
+    cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16;
+    cudnnPoolingDescriptor_t poolingDesc;
+#else   // CUDNN
+    void* srcTensorDesc, *dstTensorDesc;
+    void* srcTensorDesc16, *dstTensorDesc16;
+    void* dsrcTensorDesc, *ddstTensorDesc;
+    void* dsrcTensorDesc16, *ddstTensorDesc16;
+    void* normTensorDesc, *normDstTensorDesc, *normDstTensorDescF16;
+    void* weightDesc, *weightDesc16;
+    void* dweightDesc, *dweightDesc16;
+    void* convDesc;
+    UNUSED_ENUM_TYPE fw_algo, fw_algo16;
+    UNUSED_ENUM_TYPE bd_algo, bd_algo16;
+    UNUSED_ENUM_TYPE bf_algo, bf_algo16;
+    void* poolingDesc;
+#endif  // CUDNN
+//#endif  // GPU
+};
+
+
+// network.h - learning_rate_policy: closed set of learning-rate schedule identifiers (type of network::policy)
+typedef enum {
+    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM, SGDR
+} learning_rate_policy;
+
+// network.h - network: top-level model descriptor holding the layer array, training hyper-parameters, augmentation settings and work buffers
+typedef struct network {
+    int n; // presumably the number of entries in `layers` - TODO confirm
+    int batch;
+    uint64_t *seen;
+    float *badlabels_reject_threshold;
+    float *delta_rolling_max;
+    float *delta_rolling_avg;
+    float *delta_rolling_std;
+    int weights_reject_freq;
+    int equidistant_point;
+    float badlabels_rejection_percentage;
+    float num_sigmas_reject_badlabels;
+    float ema_alpha;
+    int *cur_iteration;
+    float loss_scale;
+    int *t;
+    float epoch;
+    int subdivisions;
+    layer *layers;
+    float *output;
+    learning_rate_policy policy; // one of the learning_rate_policy enumerators
+    int benchmark_layers;
+    int *total_bbox;
+    int *rewritten_bbox;
+
+    float learning_rate;
+    float learning_rate_min;
+    float learning_rate_max;
+    int batches_per_cycle;
+    int batches_cycle_mult;
+    float momentum;
+    float decay;
+    float gamma;
+    float scale;
+    float power;
+    int time_steps;
+    int step;
+    int max_batches;
+    int num_boxes;
+    int train_images_num;
+    float *seq_scales;
+    float *scales;
+    int   *steps;
+    int num_steps;
+    int burn_in;
+    int cudnn_half;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int inputs;
+    int outputs;
+    int truths;
+    int notruth;
+    int h, w, c; // presumably input height, width and channel count - TODO confirm
+    int max_crop;
+    int min_crop;
+    float max_ratio;
+    float min_ratio;
+    int center;
+    int flip; // horizontal flip 50% probability augmentation for classifier training (default = 1)
+    int gaussian_noise;
+    int blur;
+    int mixup;
+    float label_smooth_eps;
+    int resize_step;
+    int attention;
+    int adversarial;
+    float adversarial_lr;
+    float max_chart_loss;
+    int letter_box;
+    int mosaic_bound;
+    int contrastive;
+    int contrastive_jit_flip;
+    int contrastive_color;
+    int unsupervised;
+    float angle;
+    float aspect;
+    float exposure;
+    float saturation;
+    float hue;
+    int random;
+    int track;
+    int augment_speed;
+    int sequential_subdivisions;
+    int init_sequential_subdivisions;
+    int current_subdivision;
+    int try_fix_nan;
+
+    int gpu_index;
+    tree *hierarchy;
+
+    float *input;
+    float *truth;
+    float *delta;
+    float *workspace;
+    int train;
+    int index;
+    float *cost;
+    float clip;
+
+//#ifdef GPU   (guard is commented out, so the members below are declared in every build)
+    //float *input_gpu;
+    //float *truth_gpu;
+    float *delta_gpu;
+    float *output_gpu;
+
+    float *input_state_gpu;
+    float *input_pinned_cpu;
+    int input_pinned_cpu_flag;
+
+    float **input_gpu;
+    float **truth_gpu;
+    float **input16_gpu;
+    float **output16_gpu;
+    size_t *max_input16_size;
+    size_t *max_output16_size;
+    int wait_stream;
+
+    void *cuda_graph;
+    void *cuda_graph_exec;
+    int use_cuda_graph;
+    int *cuda_graph_ready;
+
+    float *global_delta_gpu;
+    float *state_delta_gpu;
+    size_t max_delta_gpu_size;
+//#endif  // GPU
+    int optimized_memory;
+    int dynamic_minibatch;
+    size_t workspace_size_limit;
+} network;
+
+// network.h - network_state: buffer pointers and flags bundled with a by-value copy of the network struct
+typedef struct network_state {
+    float *truth;
+    float *input;
+    float *delta;
+    float *workspace;
+    int train;
+    int index;
+    network net; // embedded by value, not a pointer
+} network_state;
+
+//typedef struct {
+//    int w;
+//    int h;
+//    float scale;
+//    float rad;
+//    float dx;
+//    float dy;
+//    float aspect;
+//} augment_args;
+
+// image.h - image: three int dimensions plus a float pixel buffer
+typedef struct image {
+    int w;
+    int h;
+    int c;
+    float *data; // presumably w*h*c floats; element layout is not visible here - TODO confirm
+} image;
+
+//typedef struct {
+//    int w;
+//    int h;
+//    int c;
+//    float *data;
+//} image;
+
+// box.h - box: rectangle given as position (x, y) and size (w, h); whether (x, y) is the centre or a corner is not visible here - TODO confirm
+typedef struct box {
+    float x, y, w, h;
+} box;
+
+// box.h - boxabs: rectangle given by its four edge coordinates
+typedef struct boxabs {
+    float left, right, top, bot;
+} boxabs;
+
+// box.h - dxrep: four float components; presumably derivatives w.r.t. top/bottom/left/right - TODO confirm
+typedef struct dxrep {
+    float dt, db, dl, dr;
+} dxrep;
+
+// box.h - ious: the four overlap metrics (iou, giou, diou, ciou) together with a dxrep for iou and one for giou
+typedef struct ious {
+    float iou, giou, diou, ciou;
+    dxrep dx_iou;
+    dxrep dx_giou;
+} ious;
+
+
+// box.h - detection: one detected object (bounding box, class data, objectness and tracking fields)
+typedef struct detection{
+    box bbox;
+    int classes;
+    int best_class_idx;
+    float *prob; // presumably `classes` entries - TODO confirm
+    float *mask;
+    float objectness;
+    int sort_class;
+    float *uc; // Gaussian_YOLOv3 - tx,ty,tw,th uncertainty
+    int points; // bit-0 - center, bit-1 - top-left-corner, bit-2 - bottom-right-corner
+    float *embeddings;  // embeddings for tracking
+    int embedding_size;
+    float sim;
+    int track_id;
+} detection;
+
+// network.c -batch inference: a detection pointer paired with an int (presumably the element count - TODO confirm); pdet_num_pair is the pointer alias
+typedef struct det_num_pair {
+    int num;
+    detection *dets;
+} det_num_pair, *pdet_num_pair;
+
+// matrix.h - matrix: row/column counts plus a pointer-to-pointer float table
+typedef struct matrix {
+    int rows, cols;
+    float **vals; // presumably indexed vals[row][col] - TODO confirm
+} matrix;
+
+// data.h - data: two matrices (X and y) held by value, plus size, flag and box fields
+typedef struct data {
+    int w, h;
+    matrix X;
+    matrix y;
+    int shallow; // NOTE(review): presumably marks a copy that does not own its buffers - confirm
+    int *num_boxes;
+    box **boxes;
+} data;
+
+// data.h - data_type: closed set of dataset kinds (type of load_args::type)
+typedef enum {
+    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA, SEGMENTATION_DATA, INSTANCE_DATA, ISEG_DATA
+} data_type;
+
+// data.h - load_args: parameter bundle passed by value to load_data() / load_data_in_thread() and returned by get_base_args()
+typedef struct load_args {
+    int threads;
+    char **paths;
+    char *path;
+    int n;
+    int m;
+    char **labels;
+    int h;
+    int w;
+    int c; // color depth
+    int out_w;
+    int out_h;
+    int nh;
+    int nw;
+    int num_boxes;
+    int truth_size;
+    int min, max, size;
+    int classes;
+    int background;
+    int scale;
+    int center;
+    int coords;
+    int mini_batch;
+    int track;
+    int augment_speed;
+    int letter_box;
+    int mosaic_bound;
+    int show_imgs;
+    int dontuse_opencv;
+    int contrastive;
+    int contrastive_jit_flip;
+    int contrastive_color;
+    float jitter;
+    float resize;
+    int flip;
+    int gaussian_noise;
+    int blur;
+    int mixup;
+    float label_smooth_eps;
+    float angle;
+    float aspect;
+    float saturation;
+    float exposure;
+    float hue;
+    data *d; // NOTE(review): presumably the destination filled by the loader - confirm
+    image *im;
+    image *resized;
+    data_type type; // one of the data_type enumerators
+    tree *hierarchy;
+} load_args;
+
+// data.h - box_label: one labelled box carrying ids, an x/y/w/h form and a left/right/top/bottom form
+typedef struct box_label {
+    int id;
+    int track_id;
+    float x, y, w, h;
+    float left, right, top, bottom;
+} box_label;
+
+// list.h
+//typedef struct node {
+//    void *val;
+//    struct node *next;
+//    struct node *prev;
+//} node;
+
+// list.h
+//typedef struct list {
+//    int size;
+//    node *front;
+//    node *back;
+//} list;
+// -----------------------------------------------------
+
+
+// parser.c
+LIB_API network *load_network(char *cfg, char *weights, int clear);
+LIB_API network *load_network_custom(char *cfg, char *weights, int clear, int batch);
+LIB_API void free_network(network net);
+LIB_API void free_network_ptr(network* net);
+
+// network.c
+LIB_API load_args get_base_args(network *net);
+
+// box.h
+LIB_API void do_nms_sort(detection *dets, int total, int classes, float thresh);
+LIB_API void do_nms_obj(detection *dets, int total, int classes, float thresh);
+LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1);
+
+// network.h
+LIB_API float *network_predict(network net, float *input);
+LIB_API float *network_predict_ptr(network *net, float *input);
+#ifdef CUDA_OPENGL_INTEGRATION
+LIB_API float *network_predict_gl_texture(network *net, uint32_t texture_id);
+#endif // CUDA_OPENGL_INTEGRATION
+
+LIB_API void set_batch_network(network *net, int b);
+LIB_API detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
+LIB_API det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter);
+LIB_API void free_detections(detection *dets, int n);
+LIB_API void free_batch_detections(det_num_pair *det_num_pairs, int n);
+LIB_API void fuse_conv_batchnorm(network net);
+LIB_API void calculate_binary_weights(network net);
+LIB_API char *detection_to_json(detection *dets, int nboxes, int classes, char **names, long long int frame_id, char *filename);
+
+LIB_API layer* get_network_layer(network* net, int i);
+//LIB_API detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
+LIB_API detection *make_network_boxes(network *net, float thresh, int *num);
+LIB_API void reset_rnn(network *net);
+LIB_API float *network_predict_image(network *net, image im);
+LIB_API float *network_predict_image_letterbox(network *net, image im);
+LIB_API float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float thresh_calc_avg_iou, const float iou_thresh, const int map_points, int letter_box, network *existing_net);
+LIB_API void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dont_show, int calc_map, float thresh, float iou_thresh, int mjpeg_port, int show_imgs, int benchmark_layers, char* chart_path, int mAP_epochs);
+LIB_API void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh,
+    float hier_thresh, int dont_show, int ext_output, int save_labels, char *outfile, int letter_box, int benchmark_layers);
+LIB_API int network_width(network *net);
+LIB_API int network_height(network *net);
+LIB_API void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm);
+
+// image.h
+LIB_API void make_image_red(image im);
+LIB_API image make_attention_image(int img_size, float *original_delta_cpu, float *original_input_cpu, int w, int h, int c, float alpha);
+LIB_API image resize_image(image im, int w, int h);
+LIB_API void quantize_image(image im);
+LIB_API void copy_image_from_bytes(image im, char *pdata);
+LIB_API image letterbox_image(image im, int w, int h);
+LIB_API void rgbgr_image(image im);
+LIB_API image make_image(int w, int h, int c);
+LIB_API image load_image_color(char *filename, int w, int h);
+LIB_API void free_image(image m);
+LIB_API image crop_image(image im, int dx, int dy, int w, int h);
+LIB_API image resize_min(image im, int min);
+
+// layer.h
+LIB_API void free_layer_custom(layer l, int keep_cudnn_desc);
+LIB_API void free_layer(layer l);
+
+// data.c
+LIB_API void free_data(data d);
+LIB_API pthread_t load_data(load_args args);
+LIB_API void free_load_threads(void *ptr);
+LIB_API pthread_t load_data_in_thread(load_args args);
+LIB_API void *load_thread(void *ptr);
+
+// dark_cuda.h
+LIB_API void cuda_pull_array(float *x_gpu, float *x, size_t n);
+LIB_API void cuda_pull_array_async(float *x_gpu, float *x, size_t n);
+LIB_API void cuda_set_device(int n);
+LIB_API void *cuda_get_context();
+
+// utils.h
+LIB_API void free_ptrs(void **ptrs, int n);
+LIB_API void top_k(float *a, int n, int k, int *index);
+
+// tree.h
+LIB_API tree *read_tree(char *filename);
+
+// option_list.h
+LIB_API metadata get_metadata(char *file);
+
+
+// http_stream.h
+LIB_API void delete_json_sender();
+LIB_API void send_json_custom(char const* send_buf, int port, int timeout);
+LIB_API double get_time_point();
+void start_timer();
+void stop_timer();
+double get_time();
+void stop_timer_and_show();
+void stop_timer_and_show_name(char *name);
+void show_total_time();
+
+LIB_API void set_track_id(detection *new_dets, int new_dets_num, float thresh, float sim_thresh, float track_ciou_norm, int deque_size, int dets_for_track, int dets_for_show);
+LIB_API int fill_remaining_id(detection *new_dets, int new_dets_num, int new_track_id, float thresh);
+
+
+// gemm.h
+LIB_API void init_cpu();
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+#endif  // DARKNET_API
diff --git a/darknet-master/include/yolo_v2_class.hpp b/darknet-master/include/yolo_v2_class.hpp
new file mode 100644
index 0000000..1d70a2c
--- /dev/null
+++ b/darknet-master/include/yolo_v2_class.hpp
@@ -0,0 +1,1053 @@
+#ifndef YOLO_V2_CLASS_HPP
+#define YOLO_V2_CLASS_HPP
+
+#ifndef LIB_API   // keep a LIB_API definition that was supplied before this header
+#ifdef LIB_EXPORTS   // LIB_EXPORTS defined: mark API symbols as exported
+#if defined(_MSC_VER)
+#define LIB_API __declspec(dllexport)
+#else
+#define LIB_API __attribute__((visibility("default")))
+#endif
+#else   // LIB_EXPORTS not defined: LIB_API expands to nothing in both compiler branches
+#if defined(_MSC_VER)
+#define LIB_API
+#else
+#define LIB_API
+#endif
+#endif
+#endif
+
+#define C_SHARP_MAX_OBJECTS 1000   // capacity of bbox_t_container::candidates
+
+struct bbox_t {                    // one detection result as exposed by the Detector API
+    unsigned int x, y, w, h;       // (x,y) - top-left corner, (w, h) - width & height of bounded box
+    float prob;                    // confidence - probability that the object was found correctly
+    unsigned int obj_id;           // class of object - from range [0, classes-1]
+    unsigned int track_id;         // tracking id for video (0 - untracked, 1 - inf - tracked object)
+    unsigned int frames_counter;   // counter of frames on which the object was detected
+    float x_3d, y_3d, z_3d;        // center of object (in Meters) if ZED 3D Camera is used
+};
+
+struct image_t {                  // image handed to the Detector API: dimensions plus a float buffer
+    int h;                        // height
+    int w;                        // width
+    int c;                        // number of channels (3 - for RGB)
+    float *data;                  // pointer to the image data
+};
+
+struct bbox_t_container {         // fixed-capacity bbox_t array taken by reference by detect_image() / detect_mat()
+    bbox_t candidates[C_SHARP_MAX_OBJECTS];
+};
+
+#ifdef __cplusplus
+#include <memory>
+#include <vector>
+#include <deque>
+#include <algorithm>
+#include <chrono>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <cmath>
+
+#ifdef OPENCV
+#include <opencv2/opencv.hpp>            // C++
+#include <opencv2/highgui/highgui_c.h>   // C
+#include <opencv2/imgproc/imgproc_c.h>   // C
+#endif
+
+extern "C" LIB_API int init(const char *configurationFilename, const char *weightsFilename, int gpu, int batch_size);
+extern "C" LIB_API int detect_image(const char *filename, bbox_t_container &container);
+extern "C" LIB_API int detect_mat(const uint8_t* data, const size_t data_length, bbox_t_container &container);
+extern "C" LIB_API int dispose();
+extern "C" LIB_API int get_device_count();
+extern "C" LIB_API int get_device_name(int gpu, char* deviceName);
+extern "C" LIB_API bool built_with_cuda();
+extern "C" LIB_API bool built_with_cudnn();
+extern "C" LIB_API bool built_with_opencv();
+extern "C" LIB_API void send_json_custom(char const* send_buf, int port, int timeout);
+
+// Detector: C++ front end of the detection library. Members marked LIB_API are only declared here;
+// the members with inline bodies are thin helpers layered on top of them.
+class Detector {
+    std::shared_ptr<void> detector_gpu_ptr;                // opaque (void) handle; presumably the implementation state - TODO confirm
+    std::deque<std::vector<bbox_t>> prev_bbox_vec_deque;   // earlier results; presumably used by tracking_id() - TODO confirm
+    std::string _cfg_filename, _weight_filename;
+public:
+    const int cur_gpu_id;
+    float nms = .4;
+    bool wait_stream;
+
+    LIB_API Detector(std::string cfg_filename, std::string weight_filename, int gpu_id = 0, int batch_size = 1);
+    LIB_API ~Detector();
+
+    LIB_API std::vector<bbox_t> detect(std::string image_filename, float thresh = 0.2, bool use_mean = false);
+    LIB_API std::vector<bbox_t> detect(image_t img, float thresh = 0.2, bool use_mean = false);
+    LIB_API std::vector<std::vector<bbox_t>> detectBatch(image_t img, int batch_size, int width, int height, float thresh, bool make_nms = true);
+    static LIB_API image_t load_image(std::string image_filename);
+    static LIB_API void free_image(image_t m);
+    LIB_API int get_net_width() const;
+    LIB_API int get_net_height() const;
+    LIB_API int get_net_color_depth() const;
+
+    LIB_API std::vector<bbox_t> tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history = true,
+                                                int const frames_story = 5, int const max_dist = 40);
+
+    LIB_API void *get_cuda_context();
+
+    // Runs detect() on `img`, then scales every box by init_w / img.w and init_h / img.h.
+    // Throws std::runtime_error when img.data is NULL.
+    std::vector<bbox_t> detect_resized(image_t img, int init_w, int init_h, float thresh = 0.2, bool use_mean = false)
+    {
+        if (img.data == NULL)
+            throw std::runtime_error("Image is empty");
+        auto detection_boxes = detect(img, thresh, use_mean);
+        float wk = (float)init_w / img.w, hk = (float)init_h / img.h;
+        for (auto &i : detection_boxes) i.x *= wk, i.w *= wk, i.y *= hk, i.h *= hk;
+        return detection_boxes;
+    }
+
+#ifdef OPENCV
+    // Detects on a cv::Mat: converts it at network resolution, then maps the boxes back to mat's size.
+    std::vector<bbox_t> detect(cv::Mat mat, float thresh = 0.2, bool use_mean = false)
+    {
+        if(mat.data == NULL)
+            throw std::runtime_error("Image is empty");
+        auto image_ptr = mat_to_image_resize(mat);
+        return detect_resized(*image_ptr, mat.cols, mat.rows, thresh, use_mean);
+    }
+
+    // Converts `mat` to an image_t of network width x height; yields a null pointer for an empty Mat.
+    std::shared_ptr<image_t> mat_to_image_resize(cv::Mat mat) const
+    {
+        if (mat.data == NULL) return std::shared_ptr<image_t>(NULL);
+
+        cv::Size network_size = cv::Size(get_net_width(), get_net_height());
+        cv::Mat det_mat;
+        if (mat.size() != network_size)
+            cv::resize(mat, det_mat, network_size);
+        else
+            det_mat = mat;  // only reference is copied
+
+        return mat_to_image(det_mat);
+    }
+
+    // Converts 1-, 3- or 4-channel input to 3 channels (other counts only log a warning) and copies it
+    // into an image_t whose buffer is released with free_image() by the shared_ptr deleter.
+    static std::shared_ptr<image_t> mat_to_image(cv::Mat img_src)
+    {
+        cv::Mat img;
+        if (img_src.channels() == 4) cv::cvtColor(img_src, img, cv::COLOR_RGBA2BGR);
+        else if (img_src.channels() == 3) cv::cvtColor(img_src, img, cv::COLOR_RGB2BGR);
+        else if (img_src.channels() == 1) cv::cvtColor(img_src, img, cv::COLOR_GRAY2BGR);
+        else std::cerr << " Warning: img_src.channels() is not 1, 3 or 4. It is = " << img_src.channels() << std::endl;
+        std::shared_ptr<image_t> image_ptr(new image_t, [](image_t *img) { free_image(*img); delete img; });
+        *image_ptr = mat_to_image_custom(img);
+        return image_ptr;
+    }
+
+private:
+
+    // Copies interleaved 8-bit pixels into planar float storage (index k*w*h + y*w + x), divided by 255.
+    static image_t mat_to_image_custom(cv::Mat mat)
+    {
+        int w = mat.cols;
+        int h = mat.rows;
+        int c = mat.channels();
+        image_t im = make_image_custom(w, h, c);
+        unsigned char *data = (unsigned char *)mat.data;
+        int step = mat.step;
+        for (int y = 0; y < h; ++y) {
+            for (int k = 0; k < c; ++k) {
+                for (int x = 0; x < w; ++x) {
+                    im.data[k*w*h + y*w + x] = data[y*step + x*c + k] / 255.0f;
+                }
+            }
+        }
+        return im;
+    }
+
+    // Returns an image_t with the given dimensions and no pixel buffer (data == 0).
+    static image_t make_empty_image(int w, int h, int c)
+    {
+        image_t out;
+        out.data = 0;
+        out.h = h;
+        out.w = w;
+        out.c = c;
+        return out;
+    }
+
+    // Returns an image_t with a zero-initialised buffer of h*w*c floats obtained from calloc().
+    static image_t make_image_custom(int w, int h, int c)
+    {
+        image_t out = make_empty_image(w, h, c);
+        out.data = (float *)calloc(h*w*c, sizeof(float));
+        return out;
+    }
+
+#endif    // OPENCV
+
+public:
+
+    // Serialises the boxes to JSON and hands the text to send_json_custom(); always returns true.
+    // The text is built in a std::ostringstream, so long file or class names cannot overrun a fixed buffer;
+    // array entries are comma-separated and an obj_id outside obj_names yields an empty name.
+    // NOTE: i.x / i.y are written unchanged under the "center_x" / "center_y" keys; strings are not JSON-escaped.
+    bool send_json_http(std::vector<bbox_t> cur_bbox_vec, std::vector<std::string> obj_names, int frame_id,
+        std::string filename = std::string(), int timeout = 400000, int port = 8070)
+    {
+        std::ostringstream out;
+        out.imbue(std::locale::classic());                 // '.' as decimal point, no digit grouping
+        out.setf(std::ios::fixed, std::ios::floatfield);   // fixed notation, as printf's %f
+        out << "{\n \"frame_id\":" << frame_id << ", \n";
+        if (!filename.empty()) out << " \"filename\":\"" << filename << "\", \n";
+        out << " \"objects\": [ \n";
+
+        bool first = true;
+        for (auto const &i : cur_bbox_vec) {
+            if (!first) out << ",\n";   // JSON array elements must be comma-separated
+            first = false;
+            std::string const name = (i.obj_id < obj_names.size()) ? obj_names[i.obj_id] : std::string();
+            out.precision(6);           // six decimals for the confidence, like %f
+            out << "  {\"class_id\":" << i.obj_id << ", \"name\":\"" << name
+                << "\", \"absolute_coordinates\":{\"center_x\":" << i.x << ", \"center_y\":" << i.y
+                << ", \"width\":" << i.w << ", \"height\":" << i.h << "}, \"confidence\":" << i.prob;
+            if (!std::isnan(i.z_3d)) {
+                out.precision(2);       // two decimals for the metric coordinates, like %.2f
+                out << "\n    , \"coordinates_in_meters\":{\"x_3d\":" << i.x_3d << ", \"y_3d\":" << i.y_3d
+                    << ", \"z_3d\":" << i.z_3d << "}";
+            }
+            out << "}";
+        }
+        if (!first) out << "\n";        // newline after the last entry, as before
+        out << "\n ] \n}";
+        send_json_custom(out.str().c_str(), port, timeout);
+        return true;
+    }
+};
+// --------------------------------------------------------------------------------
+
+
+#if defined(TRACK_OPTFLOW) && defined(OPENCV) && defined(GPU)
+
+#include <opencv2/cudaoptflow.hpp>
+#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/core/cuda.hpp>
+
+class Tracker_optflow {
+public:
+    const int gpu_count;
+    const int gpu_id;
+    const int flow_error;
+
+
+    Tracker_optflow(int _gpu_id = 0, int win_size = 15, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
+        gpu_count(cv::cuda::getCudaEnabledDeviceCount()), gpu_id(std::min(_gpu_id, gpu_count-1)),
+        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
+    {
+        int const old_gpu_id = cv::cuda::getDevice();
+        cv::cuda::setDevice(gpu_id);
+
+        stream = cv::cuda::Stream();
+
+        sync_PyrLKOpticalFlow_gpu = cv::cuda::SparsePyrLKOpticalFlow::create();
+        sync_PyrLKOpticalFlow_gpu->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
+        sync_PyrLKOpticalFlow_gpu->setMaxLevel(max_level);        // +- 3 pt
+        sync_PyrLKOpticalFlow_gpu->setNumIters(iterations);    // 2000, def: 30
+
+        cv::cuda::setDevice(old_gpu_id);
+    }
+
+    // just to avoid extra allocations
+    cv::cuda::GpuMat src_mat_gpu;
+    cv::cuda::GpuMat dst_mat_gpu, dst_grey_gpu;
+    cv::cuda::GpuMat prev_pts_flow_gpu, cur_pts_flow_gpu;
+    cv::cuda::GpuMat status_gpu, err_gpu;
+
+    cv::cuda::GpuMat src_grey_gpu;    // used in both functions
+    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow_gpu;
+    cv::cuda::Stream stream;
+
+    std::vector<bbox_t> cur_bbox_vec;
+    std::vector<bool> good_bbox_vec_flags;
+    cv::Mat prev_pts_flow_cpu;
+
+    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
+    {
+        cur_bbox_vec = _cur_bbox_vec;
+        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
+        cv::Mat prev_pts, cur_pts_flow_cpu;
+
+        for (auto &i : cur_bbox_vec) {
+            float x_center = (i.x + i.w / 2.0F);
+            float y_center = (i.y + i.h / 2.0F);
+            prev_pts.push_back(cv::Point2f(x_center, y_center));
+        }
+
+        if (prev_pts.rows == 0)
+            prev_pts_flow_cpu = cv::Mat();
+        else
+            cv::transpose(prev_pts, prev_pts_flow_cpu);
+
+        if (prev_pts_flow_gpu.cols < prev_pts_flow_cpu.cols) {
+            prev_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());
+            cur_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());
+
+            status_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_8UC1);
+            err_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_32FC1);
+        }
+
+        prev_pts_flow_gpu.upload(cv::Mat(prev_pts_flow_cpu), stream);
+    }
+
+
+    // Loads a new reference frame and a fresh set of boxes to track.
+    //
+    // src_mat       - frame with 1 channel (used as-is), 3 channels (converted
+    //                 with CV_BGR2GRAY) or 4 channels (converted with CV_BGRA2GRAY)
+    // _cur_bbox_vec - boxes handed on to update_cur_bbox_vec()
+    //
+    // For any other channel count a warning is printed and the tracker state is
+    // left untouched. The CUDA device that was current on entry is current again
+    // when the function returns.
+    void update_tracking_flow(cv::Mat src_mat, std::vector<bbox_t> _cur_bbox_vec)
+    {
+        // Reject unsupported layouts before touching the GPU, so the early return
+        // has no device state to restore.
+        int const channels = src_mat.channels();
+        if (channels != 1 && channels != 3 && channels != 4) {
+            std::cerr << " Warning: src_mat.channels() is not: 1, 3 or 4. It is = " << src_mat.channels() << " \n";
+            return;
+        }
+
+        int const old_gpu_id = cv::cuda::getDevice();
+        if (old_gpu_id != gpu_id)
+            cv::cuda::setDevice(gpu_id);
+
+        // Allocate the device buffers once, on the first frame.
+        if (src_mat_gpu.cols == 0) {
+            src_mat_gpu = cv::cuda::GpuMat(src_mat.size(), src_mat.type());
+            src_grey_gpu = cv::cuda::GpuMat(src_mat.size(), CV_8UC1);
+        }
+
+        src_mat_gpu.upload(src_mat, stream);
+        if (channels == 1) {
+            // Queue the copy on the same stream as the upload so it is ordered after it.
+            src_mat_gpu.copyTo(src_grey_gpu, stream);
+        }
+        else if (channels == 3) {
+            cv::cuda::cvtColor(src_mat_gpu, src_grey_gpu, CV_BGR2GRAY, 1, stream);
+        }
+        else {
+            cv::cuda::cvtColor(src_mat_gpu, src_grey_gpu, CV_BGRA2GRAY, 1, stream);
+        }
+
+        update_cur_bbox_vec(_cur_bbox_vec);
+
+        if (old_gpu_id != gpu_id)
+            cv::cuda::setDevice(old_gpu_id);
+    }
+
+
+    // Tracks the current boxes from the stored reference frame into dst_mat.
+    //
+    // dst_mat     - new frame; it is always converted with CV_BGR2GRAY
+    // check_error - not used by the active code (only by a commented-out line below)
+    //
+    // Returns the boxes that were tracked successfully, shifted by the measured
+    // displacement. Returns the stored boxes unchanged when the flow object is
+    // not initialised or when the frame size differs from the reference frame.
+    std::vector<bbox_t> tracking_flow(cv::Mat dst_mat, bool check_error = true)
+    {
+        if (sync_PyrLKOpticalFlow_gpu.empty()) {
+            std::cout << "sync_PyrLKOpticalFlow_gpu isn't initialized \n";
+            return cur_bbox_vec;
+        }
+
+        // Work on this tracker's device; the previous device is restored before returning.
+        int const old_gpu_id = cv::cuda::getDevice();
+        if(old_gpu_id != gpu_id)
+            cv::cuda::setDevice(gpu_id);
+
+        // Allocate the destination buffers once, on the first call.
+        if (dst_mat_gpu.cols == 0) {
+            dst_mat_gpu = cv::cuda::GpuMat(dst_mat.size(), dst_mat.type());
+            dst_grey_gpu = cv::cuda::GpuMat(dst_mat.size(), CV_8UC1);
+        }
+
+        //dst_grey_gpu.upload(dst_mat, stream);    // use BGR
+        dst_mat_gpu.upload(dst_mat, stream);
+        cv::cuda::cvtColor(dst_mat_gpu, dst_grey_gpu, CV_BGR2GRAY, 1, stream);
+
+        // Frame size changed: adopt the new frame as the reference and skip tracking this time.
+        if (src_grey_gpu.rows != dst_grey_gpu.rows || src_grey_gpu.cols != dst_grey_gpu.cols) {
+            stream.waitForCompletion();
+            src_grey_gpu = dst_grey_gpu.clone();
+            cv::cuda::setDevice(old_gpu_id);
+            return cur_bbox_vec;
+        }
+
+        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
+        sync_PyrLKOpticalFlow_gpu->calc(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, err_gpu, stream);    // OpenCV 3.x
+
+        cv::Mat cur_pts_flow_cpu;
+        cur_pts_flow_gpu.download(cur_pts_flow_cpu, stream);
+
+        // The new frame becomes the reference frame for the next call.
+        dst_grey_gpu.copyTo(src_grey_gpu, stream);
+
+        cv::Mat err_cpu, status_cpu;
+        err_gpu.download(err_cpu, stream);
+        status_gpu.download(status_cpu, stream);
+
+        // Everything above was queued on `stream`; wait before reading the host copies.
+        stream.waitForCompletion();
+
+        std::vector<bbox_t> result_bbox_vec;
+
+        // Boxes are only evaluated when there is exactly one error/status entry per box;
+        // otherwise the result stays empty.
+        if (err_cpu.cols == cur_bbox_vec.size() && status_cpu.cols == cur_bbox_vec.size())
+        {
+            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
+            {
+                cv::Point2f cur_key_pt = cur_pts_flow_cpu.at<cv::Point2f>(0, i);
+                cv::Point2f prev_key_pt = prev_pts_flow_cpu.at<cv::Point2f>(0, i);
+
+                float moved_x = cur_key_pt.x - prev_key_pt.x;
+                float moved_y = cur_key_pt.y - prev_key_pt.y;
+
+                // A box is kept only if it moved less than 100 px on each axis, is still
+                // flagged good, has an error below flow_error, a non-zero status, and a
+                // shifted origin that stays positive. Any failure clears its flag.
+                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
+                    if (err_cpu.at<float>(0, i) < flow_error && status_cpu.at<unsigned char>(0, i) != 0 &&
+                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
+                    {
+                        // + 0.5 rounds the shift instead of truncating it
+                        cur_bbox_vec[i].x += moved_x + 0.5;
+                        cur_bbox_vec[i].y += moved_y + 0.5;
+                        result_bbox_vec.push_back(cur_bbox_vec[i]);
+                    }
+                    else good_bbox_vec_flags[i] = false;
+                else good_bbox_vec_flags[i] = false;
+
+                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
+            }
+        }
+
+        // The points found in this frame are the starting points for the next call.
+        cur_pts_flow_gpu.swap(prev_pts_flow_gpu);
+        cur_pts_flow_cpu.copyTo(prev_pts_flow_cpu);
+
+        if (old_gpu_id != gpu_id)
+            cv::cuda::setDevice(old_gpu_id);
+
+        return result_bbox_vec;
+    }
+
+};
+
+#elif defined(TRACK_OPTFLOW) && defined(OPENCV)
+
+//#include <opencv2/optflow.hpp>
+#include <opencv2/video/tracking.hpp>
+
+// CPU optical-flow tracker: follows the centre point of every bounding box
+// from the stored reference frame into the next frame using
+// cv::SparsePyrLKOpticalFlow.
+class Tracker_optflow {
+public:
+    const int flow_error;   // a box is dropped when its flow error is >= this value
+
+
+    // win_size    - side of the square search window passed to setWinSize()
+    // max_level   - value passed to setMaxLevel()
+    // iterations  - accepted but not used by this implementation
+    // _flow_error - error threshold; values <= 0 select win_size * 4
+    Tracker_optflow(int win_size = 15, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
+        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
+    {
+        (void)iterations;
+        sync_PyrLKOpticalFlow = cv::SparsePyrLKOpticalFlow::create();
+        sync_PyrLKOpticalFlow->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
+        sync_PyrLKOpticalFlow->setMaxLevel(max_level);        // +- 3 pt
+
+    }
+
+    // just to avoid extra allocations
+    cv::Mat dst_grey;
+    cv::Mat prev_pts_flow, cur_pts_flow;
+    cv::Mat status, err;
+
+    cv::Mat src_grey;    // used in both functions
+    cv::Ptr<cv::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow;
+
+    std::vector<bbox_t> cur_bbox_vec;
+    std::vector<bool> good_bbox_vec_flags;
+
+    // Writes a greyscale version of `src` into `grey`.
+    // 1 channel is copied, 3 channels use CV_BGR2GRAY, 4 channels use CV_BGRA2GRAY.
+    // Returns false and leaves `grey` untouched for any other channel count.
+    static bool to_grey(const cv::Mat &src, cv::Mat &grey)
+    {
+        switch (src.channels()) {
+        case 1:
+            src.copyTo(grey);
+            return true;
+        case 3:
+            cv::cvtColor(src, grey, CV_BGR2GRAY, 1);
+            return true;
+        case 4:
+            cv::cvtColor(src, grey, CV_BGRA2GRAY, 1);
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    // Replaces the tracked boxes, marks all of them as good and stores their
+    // centre points as a 1 x N row in prev_pts_flow (empty when there are no boxes).
+    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
+    {
+        cur_bbox_vec = _cur_bbox_vec;
+        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
+        cv::Mat prev_pts;   // N x 1 while it is being filled
+
+        for (auto &i : cur_bbox_vec) {
+            float x_center = (i.x + i.w / 2.0F);
+            float y_center = (i.y + i.h / 2.0F);
+            prev_pts.push_back(cv::Point2f(x_center, y_center));
+        }
+
+        if (prev_pts.rows == 0)
+            prev_pts_flow = cv::Mat();
+        else
+            cv::transpose(prev_pts, prev_pts_flow);
+    }
+
+
+    // Stores new_src_mat (1, 3 or 4 channels) as the greyscale reference frame
+    // and resets the tracked boxes. For other channel counts a warning is
+    // printed and nothing is changed.
+    void update_tracking_flow(cv::Mat new_src_mat, std::vector<bbox_t> _cur_bbox_vec)
+    {
+        if (!to_grey(new_src_mat, src_grey)) {
+            std::cerr << " Warning: new_src_mat.channels() is not: 1, 3 or 4. It is = " << new_src_mat.channels() << " \n";
+            return;
+        }
+        update_cur_bbox_vec(_cur_bbox_vec);
+    }
+
+
+    // Tracks the current boxes from the reference frame into new_dst_mat.
+    //
+    // new_dst_mat - new frame with 1, 3 or 4 channels, the same layouts that
+    //               update_tracking_flow() accepts
+    // check_error - not used by the active code
+    //
+    // Returns the boxes that were tracked successfully, shifted by the measured
+    // displacement. Returns the stored boxes unchanged when the flow object is
+    // missing, the channel count is unsupported, the frame size differs from
+    // the reference frame, or there are no points to track.
+    std::vector<bbox_t> tracking_flow(cv::Mat new_dst_mat, bool check_error = true)
+    {
+        (void)check_error;
+
+        if (sync_PyrLKOpticalFlow.empty()) {
+            std::cout << "sync_PyrLKOpticalFlow isn't initialized \n";
+            return cur_bbox_vec;
+        }
+
+        if (!to_grey(new_dst_mat, dst_grey)) {
+            std::cerr << " Warning: new_dst_mat.channels() is not: 1, 3 or 4. It is = " << new_dst_mat.channels() << " \n";
+            return cur_bbox_vec;
+        }
+
+        // Frame size changed: adopt the new frame as the reference and skip tracking this time.
+        if (src_grey.rows != dst_grey.rows || src_grey.cols != dst_grey.cols) {
+            src_grey = dst_grey.clone();
+            return cur_bbox_vec;
+        }
+
+        if (prev_pts_flow.cols < 1) {
+            return cur_bbox_vec;
+        }
+
+        sync_PyrLKOpticalFlow->calc(src_grey, dst_grey, prev_pts_flow, cur_pts_flow, status, err);    // OpenCV 3.x
+
+        // The new frame becomes the reference frame for the next call.
+        dst_grey.copyTo(src_grey);
+
+        std::vector<bbox_t> result_bbox_vec;
+
+        // Boxes are only evaluated when there is exactly one error/status entry
+        // per box. total() and single-index at() are used so the check and the
+        // reads agree whether calc() produced row or column vectors.
+        size_t const box_count = cur_bbox_vec.size();
+        if (err.total() == box_count && status.total() == box_count)
+        {
+            for (size_t i = 0; i < box_count; ++i)
+            {
+                int const idx = static_cast<int>(i);
+                cv::Point2f cur_key_pt = cur_pts_flow.at<cv::Point2f>(0, idx);
+                cv::Point2f prev_key_pt = prev_pts_flow.at<cv::Point2f>(0, idx);
+
+                float moved_x = cur_key_pt.x - prev_key_pt.x;
+                float moved_y = cur_key_pt.y - prev_key_pt.y;
+
+                // fabsf keeps the comparison in floating point regardless of which
+                // abs() overloads are visible.
+                bool const small_move = fabsf(moved_x) < 100 && fabsf(moved_y) < 100;
+                bool const flow_ok = err.at<float>(idx) < flow_error && status.at<unsigned char>(idx) != 0;
+                bool const stays_positive = ((float)cur_bbox_vec[i].x + moved_x) > 0 &&
+                    ((float)cur_bbox_vec[i].y + moved_y) > 0;
+
+                if (small_move && good_bbox_vec_flags[i] && flow_ok && stays_positive)
+                {
+                    // + 0.5 rounds the shift instead of truncating it
+                    cur_bbox_vec[i].x += moved_x + 0.5;
+                    cur_bbox_vec[i].y += moved_y + 0.5;
+                    result_bbox_vec.push_back(cur_bbox_vec[i]);
+                }
+                else {
+                    good_bbox_vec_flags[i] = false;
+                }
+            }
+        }
+
+        // The points found in this frame are the starting points for the next call.
+        prev_pts_flow = cur_pts_flow.clone();
+
+        return result_bbox_vec;
+    }
+
+};
+#else
+
+// Empty placeholder used when none of the preceding preprocessor branches is
+// selected, so the type name Tracker_optflow is still declared.
+class Tracker_optflow {};
+
+#endif    // defined(TRACK_OPTFLOW) && defined(OPENCV)
+
+
+#ifdef OPENCV
+
+// Maps an object id to a deterministic colour: one of six base colours
+// scaled by a brightness factor in the range 150..249.
+static cv::Scalar obj_id_to_color(int obj_id) {
+    int const colors[6][3] = { { 1,0,1 },{ 0,0,1 },{ 0,1,1 },{ 0,1,0 },{ 1,1,0 },{ 1,0,0 } };
+    // Hash in unsigned arithmetic: a negative or very large obj_id can then
+    // neither overflow a signed int nor produce a negative table index.
+    // For ids whose product fits in an int the result is unchanged.
+    unsigned int const hash = static_cast<unsigned int>(obj_id) * 123457u;
+    unsigned int const offset = hash % 6;
+    int const color_scale = 150 + static_cast<int>(hash % 100);
+    cv::Scalar color(colors[offset][0], colors[offset][1], colors[offset][2]);
+    color *= color_scale;
+    return color;
+}
+
+// Keeps a row of fixed-size thumbnails ("preview boxes") of detected objects
+// and draws them into a frame, bottom_offset pixels above its bottom edge.
+class preview_boxes_t {
+    enum { frames_history = 30 };    // how long to keep the history saved
+
+    // One thumbnail slot.
+    struct preview_box_track_t {
+        unsigned int track_id, obj_id, last_showed_frames_ago;
+        bool current_detection;             // true when the slot was refreshed by the latest set()
+        bbox_t bbox;
+        cv::Mat mat_obj, mat_resized_obj;   // object crop and its preview-sized copy
+        // last_showed_frames_ago == frames_history marks the slot as free
+        preview_box_track_t() : track_id(0), obj_id(0), last_showed_frames_ago(frames_history), current_detection(false) {}
+    };
+    std::vector<preview_box_track_t> preview_box_track_id;
+    size_t const preview_box_size, bottom_offset;
+    bool const one_off_detections;
+public:
+    // _preview_box_size   - side of each square thumbnail, in pixels
+    // _bottom_offset      - distance from the bottom edge of the frame to the top of the thumbnails
+    // _one_off_detections - when true, slots are not refreshed by later frames of the same track
+    preview_boxes_t(size_t _preview_box_size = 100, size_t _bottom_offset = 100, bool _one_off_detections = false) :
+        preview_box_size(_preview_box_size), bottom_offset(_bottom_offset), one_off_detections(_one_off_detections)
+    {}
+
+    // Updates the slots from the detections of one frame: ages every slot,
+    // assigns free slots to new track ids and refreshes the thumbnail of each
+    // slot whose track id appears in result_vec.
+    void set(cv::Mat src_mat, std::vector<bbox_t> result_vec)
+    {
+        // A zero-sized thumbnail cannot be laid out and would divide by zero below.
+        if (preview_box_size == 0) return;
+
+        size_t const count_preview_boxes = src_mat.cols / preview_box_size;
+        if (preview_box_track_id.size() != count_preview_boxes) preview_box_track_id.resize(count_preview_boxes);
+
+        // increment frames history
+        for (auto &i : preview_box_track_id)
+            i.last_showed_frames_ago = std::min((unsigned)frames_history, i.last_showed_frames_ago + 1);
+
+        // occupy empty boxes
+        for (auto &k : result_vec) {
+            bool found = false;
+            // find the same (track_id)
+            for (auto &i : preview_box_track_id) {
+                if (i.track_id == k.track_id) {
+                    if (!one_off_detections) i.last_showed_frames_ago = 0; // for tracked objects
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                // find empty box
+                for (auto &i : preview_box_track_id) {
+                    if (i.last_showed_frames_ago == frames_history) {
+                        if (!one_off_detections && k.frames_counter == 0) break; // don't show if obj isn't tracked yet
+                        i.track_id = k.track_id;
+                        i.obj_id = k.obj_id;
+                        i.bbox = k;
+                        i.last_showed_frames_ago = 0;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // refresh the thumbnail of every slot that has a detection in this frame
+        cv::Rect const img_rect(cv::Point2i(0, 0), src_mat.size());
+        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
+        {
+            preview_box_track_t &slot = preview_box_track_id[i];
+
+            // get object image
+            cv::Mat dst = slot.mat_resized_obj;
+            slot.current_detection = false;
+
+            for (auto &k : result_vec) {
+                if (slot.track_id == k.track_id) {
+                    if (one_off_detections && slot.last_showed_frames_ago > 0) {
+                        slot.last_showed_frames_ago = frames_history; break;
+                    }
+                    bbox_t b = k;
+                    cv::Rect r(b.x, b.y, b.w, b.h);
+                    cv::Rect rect_roi = r & img_rect;   // clip the box to the image
+                    if (rect_roi.width > 1 || rect_roi.height > 1) {
+                        cv::Mat roi = src_mat(rect_roi);
+                        // The interpolation flag is the sixth argument of cv::resize; fx and fy
+                        // stay 0 because the target size is given explicitly.
+                        cv::resize(roi, dst, cv::Size(preview_box_size, preview_box_size), 0, 0, cv::INTER_NEAREST);
+                        slot.mat_obj = roi.clone();
+                        slot.mat_resized_obj = dst.clone();
+                        slot.current_detection = true;
+                        slot.bbox = k;
+                    }
+                    break;
+                }
+            }
+        }
+    }
+
+
+    // Draws every occupied slot into draw_mat: the thumbnail, a frame in the
+    // object's colour (thicker when the slot was refreshed by the latest set()),
+    // the track id and the box size. Slots that do not fit inside draw_mat are
+    // skipped. With one_off_detections and show_small_boxes set, the stored
+    // crop is also drawn back at its original position.
+    void draw(cv::Mat draw_mat, bool show_small_boxes = false)
+    {
+        cv::Rect const img_rect(cv::Point2i(0, 0), draw_mat.size());
+
+        // draw preview box (from old or current frame)
+        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
+        {
+            auto &prev_box = preview_box_track_id[i];
+
+            // draw object image
+            cv::Mat dst = prev_box.mat_resized_obj;
+            if (prev_box.last_showed_frames_ago < frames_history &&
+                dst.size() == cv::Size(preview_box_size, preview_box_size))
+            {
+                cv::Rect dst_rect_roi(cv::Point2i(static_cast<int>(i * preview_box_size),
+                    draw_mat.rows - static_cast<int>(bottom_offset)), dst.size());
+                // Taking a ROI outside the image throws, so skip slots that do not fit
+                // (frame narrower than in set(), or bottom_offset larger than the frame).
+                if ((dst_rect_roi & img_rect) != dst_rect_roi) continue;
+
+                cv::Mat dst_roi = draw_mat(dst_rect_roi);
+                dst.copyTo(dst_roi);
+
+                cv::Scalar color = obj_id_to_color(prev_box.obj_id);
+                int thickness = (prev_box.current_detection) ? 5 : 1;
+                cv::rectangle(draw_mat, dst_rect_roi, color, thickness);
+
+                unsigned int const track_id = prev_box.track_id;
+                std::string track_id_str = (track_id > 0) ? std::to_string(track_id) : "";
+                putText(draw_mat, track_id_str, dst_rect_roi.tl() - cv::Point2i(-4, 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.9, cv::Scalar(0, 0, 0), 2);
+
+                std::string size_str = std::to_string(prev_box.bbox.w) + "x" + std::to_string(prev_box.bbox.h);
+                putText(draw_mat, size_str, dst_rect_roi.tl() + cv::Point2i(0, 12), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
+
+                // connect the thumbnail with the bottom-left corner of the object's box
+                if (!one_off_detections && prev_box.current_detection) {
+                    cv::line(draw_mat, dst_rect_roi.tl() + cv::Point2i(preview_box_size, 0),
+                        cv::Point2i(prev_box.bbox.x, prev_box.bbox.y + prev_box.bbox.h),
+                        color);
+                }
+
+                if (one_off_detections && show_small_boxes) {
+                    cv::Rect src_rect_roi(cv::Point2i(prev_box.bbox.x, prev_box.bbox.y),
+                        cv::Size(prev_box.bbox.w, prev_box.bbox.h));
+                    // the frame colour fades as the slot gets older
+                    unsigned int const color_history = (255 * prev_box.last_showed_frames_ago) / frames_history;
+                    color = cv::Scalar(255 - 3 * color_history, 255 - 2 * color_history, 255 - 1 * color_history);
+                    // paste the stored crop only when it has the box's size and lies inside draw_mat
+                    if (prev_box.mat_obj.size() == src_rect_roi.size() &&
+                        (src_rect_roi & img_rect) == src_rect_roi) {
+                        prev_box.mat_obj.copyTo(draw_mat(src_rect_roi));
+                    }
+                    cv::rectangle(draw_mat, src_rect_roi, color, thickness);
+                    putText(draw_mat, track_id_str, src_rect_roi.tl() - cv::Point2i(0, 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
+                }
+            }
+        }
+    }
+};
+
+
+// Multi-object tracker that keeps one Kalman filter per tracked box.
+// Call correct() with the detections of each frame; it returns the filtered
+// boxes of every track that has been detected often enough.
+class track_kalman_t
+{
+    int track_id_counter;   // last track id handed out
+    std::chrono::steady_clock::time_point global_last_time;    // end of the previous correct() call
+    float dT;               // seconds between the previous and the current correct() call
+
+public:
+    int max_objects;    // max objects for tracking
+    int min_frames;     // min frames to consider an object as detected
+    const float max_dist;   // max distance (in px) to track with the same ID
+    cv::Size img_size;  // max value of x,y,w,h
+
+    // Bookkeeping for one tracking slot.
+    struct tst_t {
+        int track_id;   // -1: slot is free, 0: candidate without a public id yet, > 0: confirmed track
+        int state_id;   // index of the slot, -1 when unset
+        std::chrono::steady_clock::time_point last_time;    // time of the last matching detection
+        int detection_count;
+        // detection_count is initialised as well, because correct() decrements
+        // it for every slot, including slots that were never used.
+        tst_t() : track_id(-1), state_id(-1), detection_count(0) {}
+    };
+    std::vector<tst_t> track_id_state_id_time;
+    std::vector<bbox_t> result_vec_pred;
+
+    struct one_kalman_t;
+    std::vector<one_kalman_t> kalman_vec;
+
+    // Kalman filter for a single box with state [x, y, v_x, v_y, w, h]
+    // and measurement [x, y, w, h].
+    struct one_kalman_t
+    {
+        cv::KalmanFilter kf;
+        cv::Mat state;
+        cv::Mat meas;
+        int measSize, stateSize, contrSize;
+
+        // Writes dT into the two velocity terms of the transition matrix
+        // (row 0 / column 2 and row 1 / column 3 of the 6x6 matrix).
+        void set_delta_time(float dT) {
+            kf.transitionMatrix.at<float>(2) = dT;
+            kf.transitionMatrix.at<float>(9) = dT;
+        }
+
+        // Re-creates the filter and starts it at `box` with zero velocity.
+        void set(bbox_t box)
+        {
+            initialize_kalman();
+
+            // diagonal of the 6x6 a-priori error covariance
+            kf.errorCovPre.at<float>(0) = 1; // px
+            kf.errorCovPre.at<float>(7) = 1; // px
+            kf.errorCovPre.at<float>(14) = 1;
+            kf.errorCovPre.at<float>(21) = 1;
+            kf.errorCovPre.at<float>(28) = 1; // px
+            kf.errorCovPre.at<float>(35) = 1; // px
+
+            state.at<float>(0) = box.x;
+            state.at<float>(1) = box.y;
+            state.at<float>(2) = 0;
+            state.at<float>(3) = 0;
+            state.at<float>(4) = box.w;
+            state.at<float>(5) = box.h;
+            // <<<< Initialization
+
+            kf.statePost = state;
+        }
+
+        // Kalman.correct() calculates: statePost = statePre + gain * (z(k)-measurementMatrix*statePre);
+        // corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+        //
+        // Feeds `box` as a measurement, then runs one prediction step. If the
+        // predicted width or height is 0 the filter is restarted from `box`.
+        void correct(bbox_t box) {
+            meas.at<float>(0) = box.x;
+            meas.at<float>(1) = box.y;
+            meas.at<float>(2) = box.w;
+            meas.at<float>(3) = box.h;
+
+            kf.correct(meas);
+
+            // NOTE(review): this predict() advances the filter, and
+            // track_kalman_t::predict() advances it again afterwards - confirm
+            // that two prediction steps per frame are intended.
+            bbox_t new_box = predict();
+            if (new_box.w == 0 || new_box.h == 0) {
+                set(box);
+            }
+        }
+
+        // Kalman.predict() calculates: statePre = TransitionMatrix * statePost;
+        // predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+        //
+        // Runs one prediction step and returns the predicted x, y, w, h.
+        bbox_t predict() {
+            bbox_t box;
+            state = kf.predict();
+
+            box.x = state.at<float>(0);
+            box.y = state.at<float>(1);
+            box.w = state.at<float>(4);
+            box.h = state.at<float>(5);
+            return box;
+        }
+
+        // Creates a fresh cv::KalmanFilter and fills in its model matrices.
+        void initialize_kalman()
+        {
+            kf = cv::KalmanFilter(stateSize, measSize, contrSize, CV_32F);
+
+            // Transition State Matrix A
+            // Note: set dT at each processing step!
+            // [ 1 0 dT 0  0 0 ]
+            // [ 0 1 0  dT 0 0 ]
+            // [ 0 0 1  0  0 0 ]
+            // [ 0 0 0  1  0 0 ]
+            // [ 0 0 0  0  1 0 ]
+            // [ 0 0 0  0  0 1 ]
+            cv::setIdentity(kf.transitionMatrix);
+
+            // Measure Matrix H
+            // [ 1 0 0 0 0 0 ]
+            // [ 0 1 0 0 0 0 ]
+            // [ 0 0 0 0 1 0 ]
+            // [ 0 0 0 0 0 1 ]
+            kf.measurementMatrix = cv::Mat::zeros(measSize, stateSize, CV_32F);
+            kf.measurementMatrix.at<float>(0) = 1.0f;
+            kf.measurementMatrix.at<float>(7) = 1.0f;
+            kf.measurementMatrix.at<float>(16) = 1.0f;
+            kf.measurementMatrix.at<float>(23) = 1.0f;
+
+            // Process Noise Covariance Matrix Q - result smoother with lower values (1e-2)
+            // [ Ex   0   0     0     0    0  ]
+            // [ 0    Ey  0     0     0    0  ]
+            // [ 0    0   Ev_x  0     0    0  ]
+            // [ 0    0   0     Ev_y  0    0  ]
+            // [ 0    0   0     0     Ew   0  ]
+            // [ 0    0   0     0     0    Eh ]
+            //cv::setIdentity(kf.processNoiseCov, cv::Scalar(1e-3));
+            kf.processNoiseCov.at<float>(0) = 1e-2;
+            kf.processNoiseCov.at<float>(7) = 1e-2;
+            kf.processNoiseCov.at<float>(14) = 1e-2;// 5.0f;
+            kf.processNoiseCov.at<float>(21) = 1e-2;// 5.0f;
+            kf.processNoiseCov.at<float>(28) = 5e-3;
+            kf.processNoiseCov.at<float>(35) = 5e-3;
+
+            // Measures Noise Covariance Matrix R - result smoother with higher values (1e-1)
+            cv::setIdentity(kf.measurementNoiseCov, cv::Scalar(1e-1));
+
+            //cv::setIdentity(kf.errorCovPost, cv::Scalar::all(1e-2));
+            // <<<< Kalman Filter
+
+            set_delta_time(0);
+        }
+
+
+        one_kalman_t(int _stateSize = 6, int _measSize = 4, int _contrSize = 0) :
+            kf(_stateSize, _measSize, _contrSize, CV_32F), measSize(_measSize), stateSize(_stateSize), contrSize(_contrSize)
+        {
+            state = cv::Mat(stateSize, 1, CV_32F);  // [x,y,v_x,v_y,w,h]
+            meas = cv::Mat(measSize, 1, CV_32F);    // [z_x,z_y,z_w,z_h]
+            //cv::Mat procNoise(stateSize, 1, type)
+            // [E_x,E_y,E_v_x,E_v_y,E_w,E_h]
+
+            initialize_kalman();
+        }
+    };
+    // ------------------------------------------
+
+
+
+    // _max_objects - number of tracking slots that are allocated
+    // _min_frames  - detection_count a track needs before predict() reports it
+    // _max_dist    - minimum matching radius in pixels (see get_state_id())
+    // _img_size    - tracks whose predicted x/y exceed this size are dropped
+    //
+    // global_last_time starts at construction time, so the first dT is the time
+    // since construction rather than the time since the clock's epoch.
+    track_kalman_t(int _max_objects = 1000, int _min_frames = 3, float _max_dist = 40, cv::Size _img_size = cv::Size(10000, 10000)) :
+        track_id_counter(0), global_last_time(std::chrono::steady_clock::now()), dT(0),
+        max_objects(_max_objects), min_frames(_min_frames), max_dist(_max_dist), img_size(_img_size)
+    {
+        kalman_vec.resize(max_objects);
+        track_id_state_id_time.resize(max_objects);
+        result_vec_pred.resize(max_objects);
+    }
+
+    // Stores and returns the seconds elapsed since the end of the previous correct() call.
+    float calc_dt() {
+        dT = std::chrono::duration<double>(std::chrono::steady_clock::now() - global_last_time).count();
+        return dT;
+    }
+
+    // Euclidean distance between (src_x, src_y) and (dst_x, dst_y).
+    static float get_distance(float src_x, float src_y, float dst_x, float dst_y) {
+        return sqrtf((src_x - dst_x)*(src_x - dst_x) + (src_y - dst_y)*(src_y - dst_y));
+    }
+
+    // Frees every slot whose prediction left the image, that has not been
+    // matched for 0.5 s, or whose detection_count dropped below 0.
+    void clear_old_states() {
+        // clear old bboxes
+        for (size_t state_id = 0; state_id < track_id_state_id_time.size(); ++state_id)
+        {
+            float time_sec = std::chrono::duration<double>(std::chrono::steady_clock::now() - track_id_state_id_time[state_id].last_time).count();
+            float time_wait = 0.5;    // 0.5 second
+            if (track_id_state_id_time[state_id].track_id > -1)
+            {
+                if ((result_vec_pred[state_id].x > img_size.width) ||
+                    (result_vec_pred[state_id].y > img_size.height))
+                {
+                    track_id_state_id_time[state_id].track_id = -1;
+                }
+
+                if (time_sec >= time_wait || track_id_state_id_time[state_id].detection_count < 0) {
+                    track_id_state_id_time[state_id].track_id = -1; // remove bbox
+                }
+            }
+        }
+    }
+
+    // Finds the closest active, not yet matched slot with the same obj_id as
+    // find_box. A slot qualifies when its predicted position is closer than
+    // max(max_dist, predicted width, predicted height). On success the slot is
+    // marked busy, its timestamp and detection_count are refreshed and its
+    // bookkeeping entry is returned; otherwise the result has state_id == -1.
+    tst_t get_state_id(bbox_t find_box, std::vector<bool> &busy_vec)
+    {
+        tst_t tst;
+        tst.state_id = -1;
+
+        float min_dist = std::numeric_limits<float>::max();
+
+        // Bounded by the container size, so changing the public max_objects
+        // after construction cannot index past the allocated slots.
+        for (size_t i = 0; i < track_id_state_id_time.size(); ++i)
+        {
+            if (track_id_state_id_time[i].track_id > -1 && result_vec_pred[i].obj_id == find_box.obj_id && busy_vec[i] == false)
+            {
+                bbox_t pred_box = result_vec_pred[i];
+
+                float dist = get_distance(pred_box.x, pred_box.y, find_box.x, find_box.y);
+
+                float movement_dist = std::max(max_dist, static_cast<float>(std::max(pred_box.w, pred_box.h)) );
+
+                if ((dist < movement_dist) && (dist < min_dist)) {
+                    min_dist = dist;
+                    tst.state_id = static_cast<int>(i);
+                }
+            }
+        }
+
+        if (tst.state_id > -1) {
+            track_id_state_id_time[tst.state_id].last_time = std::chrono::steady_clock::now();
+            // NOTE(review): std::max makes the count at least 10 after the first
+            // match, so min_frames values up to 10 are reached immediately -
+            // confirm whether std::min (a cap at 10) was intended.
+            track_id_state_id_time[tst.state_id].detection_count = std::max(track_id_state_id_time[tst.state_id].detection_count + 2, 10);
+            tst = track_id_state_id_time[tst.state_id];
+            busy_vec[tst.state_id] = true;
+        }
+
+        return tst;
+    }
+
+    // Claims the first free slot as a new candidate track (track_id 0,
+    // detection_count 1) and marks it busy. Returns a default tst_t
+    // (state_id == -1) when every slot is in use.
+    tst_t new_state_id(std::vector<bool> &busy_vec)
+    {
+        tst_t tst;
+        // find empty cell to add new track_id
+        auto it = std::find_if(track_id_state_id_time.begin(), track_id_state_id_time.end(), [&](tst_t &v) { return v.track_id == -1; });
+        if (it != track_id_state_id_time.end()) {
+            it->state_id = it - track_id_state_id_time.begin();
+            // the public id is assigned later, in predict(), once the track is confirmed
+            it->track_id = 0;
+            it->last_time = std::chrono::steady_clock::now();
+            it->detection_count = 1;
+            tst = *it;
+            busy_vec[it->state_id] = true;
+        }
+
+        return tst;
+    }
+
+    // Assigns every detection in result_vec to a slot (an existing match or a
+    // new one) and stores the detection in result_vec_pred for that slot.
+    // Entry i of the returned vector belongs to result_vec[i]; its state_id is
+    // -1 when no slot could be assigned.
+    std::vector<tst_t> find_state_ids(std::vector<bbox_t> result_vec)
+    {
+        std::vector<tst_t> tst_vec(result_vec.size());
+
+        std::vector<bool> busy_vec(track_id_state_id_time.size(), false);
+
+        for (size_t i = 0; i < result_vec.size(); ++i)
+        {
+            tst_t tst = get_state_id(result_vec[i], busy_vec);
+            int state_id = tst.state_id;
+            int track_id = tst.track_id;
+
+            // if new state_id
+            if (state_id < 0) {
+                tst = new_state_id(busy_vec);
+                state_id = tst.state_id;
+                track_id = tst.track_id;
+                if (state_id > -1) {
+                    kalman_vec[state_id].set(result_vec[i]);
+                }
+            }
+
+            if (state_id > -1) {
+                tst_vec[i] = tst;
+                result_vec_pred[state_id] = result_vec[i];
+                result_vec_pred[state_id].track_id = track_id;
+            }
+        }
+
+        return tst_vec;
+    }
+
+    // Advances every active filter by one prediction step and returns the
+    // predicted boxes of all tracks with detection_count >= min_frames. A track
+    // that reaches this threshold for the first time gets the next track id.
+    std::vector<bbox_t> predict()
+    {
+        clear_old_states();
+        std::vector<bbox_t> result_vec;
+
+        for (size_t i = 0; i < track_id_state_id_time.size(); ++i)
+        {
+            tst_t tst = track_id_state_id_time[i];
+            if (tst.track_id > -1) {
+                bbox_t box = kalman_vec[i].predict();
+
+                result_vec_pred[i].x = box.x;
+                result_vec_pred[i].y = box.y;
+                result_vec_pred[i].w = box.w;
+                result_vec_pred[i].h = box.h;
+
+                if (tst.detection_count >= min_frames)
+                {
+                    if (track_id_state_id_time[i].track_id == 0) {
+                        track_id_state_id_time[i].track_id = ++track_id_counter;
+                        result_vec_pred[i].track_id = track_id_counter;
+                    }
+
+                    result_vec.push_back(result_vec_pred[i]);
+                }
+            }
+        }
+
+        return result_vec;
+    }
+
+
+    // Main entry point, called once per frame with that frame's detections.
+    // Matches detections to tracks, corrects the matched filters with the time
+    // step since the previous call and returns the result of predict().
+    std::vector<bbox_t> correct(std::vector<bbox_t> result_vec)
+    {
+        calc_dt();
+        clear_old_states();
+
+        // Every slot loses one count per frame; a match in get_state_id() raises it again.
+        for (size_t i = 0; i < track_id_state_id_time.size(); ++i)
+            track_id_state_id_time[i].detection_count--;
+
+        std::vector<tst_t> tst_vec = find_state_ids(result_vec);
+
+        for (size_t i = 0; i < tst_vec.size(); ++i) {
+            tst_t tst = tst_vec[i];
+            int state_id = tst.state_id;
+            if (state_id > -1)
+            {
+                kalman_vec[state_id].set_delta_time(dT);
+                kalman_vec[state_id].correct(result_vec_pred[state_id]);
+            }
+        }
+
+        result_vec = predict();
+
+        global_last_time = std::chrono::steady_clock::now();
+
+        return result_vec;
+    }
+
+};
+// ----------------------------------------------
+#endif    // OPENCV
+
+#endif    // __cplusplus
+
+#endif    // YOLO_V2_CLASS_HPP
diff --git a/darknet-master/json_mjpeg_streams.sh b/darknet-master/json_mjpeg_streams.sh
new file mode 100644
index 0000000..0bf180b
--- /dev/null
+++ b/darknet-master/json_mjpeg_streams.sh
@@ -0,0 +1,6 @@
+# Run this file and then open URL in Chrome/Firefox in 2 tabs: http://localhost:8070 and http://localhost:8090
+# Or open: http://ip-address:8070 and http://ip-address:8090
+# to get <ip-address> run: sudo ifconfig
+
+./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights test50.mp4 -json_port 8070 -mjpeg_port 8090 -ext_output
+
diff --git a/darknet-master/net_cam_v3.sh b/darknet-master/net_cam_v3.sh
new file mode 100644
index 0000000..10fbf36
--- /dev/null
+++ b/darknet-master/net_cam_v3.sh
@@ -0,0 +1,6 @@
+#rm test_dnn_out.avi
+
+# Camera stream to analyse. Export RTSP_URL to point at your own camera so
+# that real credentials do not have to be written into this file; the
+# default is the sample address this script has always used.
+RTSP_URL="${RTSP_URL:-rtsp://admin:admin12345@192.168.0.228:554}"
+
+./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights "$RTSP_URL" -i 0 -thresh 0.25
+
+
+
diff --git a/darknet-master/net_cam_v4.sh b/darknet-master/net_cam_v4.sh
new file mode 100644
index 0000000..5b7520e
--- /dev/null
+++ b/darknet-master/net_cam_v4.sh
@@ -0,0 +1,6 @@
+#rm test_dnn_out.avi
+
+# Camera stream to analyse. Export RTSP_URL to point at your own camera so
+# that real credentials do not have to be written into this file; the
+# default is the sample address this script has always used.
+RTSP_URL="${RTSP_URL:-rtsp://admin:admin12345@192.168.0.228:554}"
+
+./darknet detector demo ./cfg/coco.data ./cfg/yolov4.cfg ./yolov4.weights "$RTSP_URL" -i 0 -thresh 0.25
+
+
+
diff --git a/darknet-master/package.xml b/darknet-master/package.xml
new file mode 100644
index 0000000..f68d3bc
--- /dev/null
+++ b/darknet-master/package.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>darknet</name>
+  <version>0.1.0</version>
+  <description>Darknet, an open source neural network framework.</description>
+
+  <author email="you@example.com">Joseph Redmon</author>
+  <maintainer email="you@example.com">Alexey Bochkovskiy</maintainer>
+
+  <license>YOLO license</license>
+
+  <buildtool_depend>cmake</buildtool_depend>
+
+  <depend>libopencv-dev</depend>
+
+  <!-- nvidia-cuda-dev provides only the libraries to link against -->
+  <exec_depend>nvidia-cuda-dev</exec_depend>
+
+  <!-- nvidia-cuda includes nvidia-cuda-dev plus the nvcc compiler -->
+  <build_depend>nvidia-cuda</build_depend>
+
+  <export>
+    <build_type>cmake</build_type>
+  </export>
+</package>
+
diff --git a/darknet-master/scripts/README.md b/darknet-master/scripts/README.md
new file mode 100644
index 0000000..91b1722
--- /dev/null
+++ b/darknet-master/scripts/README.md
@@ -0,0 +1,71 @@
+# Datasets
+
+59.26TB of research data: http://academictorrents.com/
+
+ImageNet Torrent (Stanford): http://academictorrents.com/browse.php?search=imagenet&page=0
+
+25 thousand datasets on Kaggle: https://www.kaggle.com/datasets
+
+BDD100K - Diverse Driving Video (Berkeley): https://bair.berkeley.edu/blog/2018/05/30/bdd/
+
+KITTI - for autonomous driving (Toyota): http://www.cvlibs.net/datasets/kitti/
+
+A2D2 - for autonomous driving (Audi): https://www.a2d2.audi/a2d2/en.html
+
+nuScenes (for autonomous driving): https://www.nuscenes.org/overview
+
+Pascal VOC (Oxford): http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html
+
+MS COCO (Microsoft): http://cocodataset.org/#download
+
+ImageNet (Stanford): http://imagenet.stanford.edu/download.php
+
+ImageNet (ILSVRC2012): http://www.image-net.org/challenges/LSVRC/2012/nonpub-downloads
+
+ImageNet (ILSVRC2015): http://image-net.org/small/download.php
+
+ImageNet VID: http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php
+
+Open Images (Google): https://storage.googleapis.com/openimages/web/download.html
+
+Cityscapes: https://www.cityscapes-dataset.com/
+
+Object Tracking Benchmark: http://cvlab.hanyang.ac.kr/tracker_benchmark/datasets.html
+
+MOT (Multiple object tracking benchmark): https://motchallenge.net/
+
+VOT (Visual object tracking): http://www.votchallenge.net/challenges.html
+
+FREE FLIR Thermal Dataset (infrared): https://www.flir.eu/oem/adas/adas-dataset-form/
+
+MARS: http://www.liangzheng.com.cn/Project/project_mars.html
+
+Market-1501: http://www.liangzheng.org/Project/project_reid.html
+
+German Traffic Sign Recognition Benchmark: http://benchmark.ini.rub.de/
+
+Labeled Faces in the Wild: http://vis-www.cs.umass.edu/lfw/
+
+Core50: https://vlomonaco.github.io/core50/
+
+Visual Question Answering: https://visualqa.org/download.html
+
+Large Movie Review Dataset: http://ai.stanford.edu/~amaas/data/sentiment/
+
+----
+
+Wikipedia's List of datasets: https://en.wikipedia.org/wiki/List_of_datasets_for_machine-learning_research
+
+Other datasets (Music, Natural Images, Artificial Datasets, Faces, Text, Speech, Recommendation Systems, Misc): http://deeplearning.net/datasets/
+
+25 datasets: https://www.analyticsvidhya.com/blog/2018/03/comprehensive-collection-deep-learning-datasets/
+
+List of datasets: https://riemenschneider.hayko.at/vision/dataset/index.php
+
+Another list of datasets: http://homepages.inf.ed.ac.uk/rbf/CVonline/Imagedbase.htm
+
+Pedestrian datasets for vision-based detection and tracking: https://hemprasad.wordpress.com/2014/11/08/pedestrian-datasets-for-vision-based-detection-and-tracking/
+
+TrackingNet: https://tracking-net.org/
+
+RGB, RGBD, Texture-mapped 3D mesh models: http://www.ycbbenchmarks.com/
diff --git a/darknet-master/scripts/deploy-cuda.ps1 b/darknet-master/scripts/deploy-cuda.ps1
new file mode 100644
index 0000000..ab6b4f2
--- /dev/null
+++ b/darknet-master/scripts/deploy-cuda.ps1
@@ -0,0 +1,78 @@
+#!/usr/bin/env pwsh
+
+# Downloads the CUDA network installer for Windows and runs it with an explicit
+# list of sub-packages.
+#
+#   -DisableVisualStudioFeatures  do not request the Visual Studio integration
+#   -DisableSilentMode            run the installer without the '-s' flag
+#
+# NOTE(review): $cuda_version_full and $cuda_version_short are not defined in
+# this script; presumably they are exported by utils.psm1 - confirm.
+
+param (
+  [switch]$DisableVisualStudioFeatures = $false,
+  [switch]$DisableSilentMode = $false
+)
+
+Import-Module -Name $PSScriptRoot/utils.psm1 -Force
+
+$url = "https://developer.download.nvidia.com/compute/cuda/${cuda_version_full}/network_installers/cuda_${cuda_version_full}_windows_network.exe"
+
+# Space-separated installer sub-package names, each suffixed with the short version.
+$CudaFeatures = " nvcc_${cuda_version_short} " + `
+  " cublas_${cuda_version_short} " + `
+  " cublas_dev_${cuda_version_short} " + `
+  " cuda_profiler_api_${cuda_version_short} " + `
+  " cudart_${cuda_version_short} " + `
+  " cufft_${cuda_version_short} " + `
+  " cufft_dev_${cuda_version_short} " + `
+  " cuobjdump_${cuda_version_short} " + `
+  " cupti_${cuda_version_short} " + `
+  " curand_${cuda_version_short} " + `
+  " curand_dev_${cuda_version_short} " + `
+  " cusolver_${cuda_version_short} " + `
+  " cusolver_dev_${cuda_version_short} " + `
+  " cusparse_${cuda_version_short} " + `
+  " cusparse_dev_${cuda_version_short} " + `
+  " cuxxfilt_${cuda_version_short} " + `
+  " npp_${cuda_version_short} " + `
+  " npp_dev_${cuda_version_short} " + `
+  " nsight_compute_${cuda_version_short} " + `
+  " nsight_systems_${cuda_version_short} " + `
+  " nsight_vse_${cuda_version_short} " + `
+  " nvdisasm_${cuda_version_short} " + `
+  " nvjitlink_${cuda_version_short} " + `
+  " nvjpeg_${cuda_version_short} " + `
+  " nvjpeg_dev_${cuda_version_short} " + `
+  " nvml_dev_${cuda_version_short} " + `
+  " nvprof_${cuda_version_short} " + `
+  " nvprune_${cuda_version_short} " + `
+  " nvrtc_${cuda_version_short} " + `
+  " nvrtc_dev_${cuda_version_short} " + `
+  " nvtx_${cuda_version_short} " + `
+  " occupancy_calculator_${cuda_version_short} " + `
+  " opencl_${cuda_version_short} " + `
+  " sanitizer_${cuda_version_short} " + `
+  " thrust_${cuda_version_short} " + `
+  " visual_profiler_${cuda_version_short} "
+
+if (-Not $DisableVisualStudioFeatures) {
+  # visual_profiler is already part of the base list above; it is repeated here unchanged.
+  $CudaFeatures = $CudaFeatures + "visual_studio_integration_${cuda_version_short} visual_profiler_${cuda_version_short}  "
+}
+
+if ($DisableSilentMode) {
+  $SilentFlag = ' '
+}
+else {
+  $SilentFlag = '-s '
+}
+
+try {
+  Push-Location $PSScriptRoot
+  try {
+    Write-Host "Downloading CUDA from $url..."
+    curl.exe -L -o cuda_${cuda_version_full}_windows_network.exe -s -S $url
+    # curl.exe is a native command: a failed download does not throw, it only sets $LASTEXITCODE.
+    if ($LASTEXITCODE -ne 0) {
+      Throw "Download failed! curl exited with $LASTEXITCODE."
+    }
+    Write-Host 'Installing CUDA...'
+    $proc = Start-Process -PassThru -FilePath "./cuda_${cuda_version_full}_windows_network.exe" -ArgumentList @($SilentFlag + $CudaFeatures)
+    $proc.WaitForExit()
+    $exitCode = $proc.ExitCode
+  }
+  finally {
+    # Always restore the caller's location, including when the download or installer launch throws.
+    Pop-Location
+  }
+  if ($exitCode -eq 0) {
+    Write-Host 'Installation successful!'
+  }
+  else {
+    Throw "Installation failed! Exited with $exitCode."
+  }
+}
+catch {
+  Throw "Failed to install CUDA! $($_.Exception.Message)"
+}
diff --git a/darknet-master/scripts/deploy-cuda.sh b/darknet-master/scripts/deploy-cuda.sh
new file mode 100644
index 0000000..fb85d52
--- /dev/null
+++ b/darknet-master/scripts/deploy-cuda.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  echo "Unable to deploy CUDA on macOS, please wait for a future script update"
+  exit 1
+elif [[ $(cut -f2 <<< $(lsb_release -i)) == "Ubuntu" ]]; then
+  distr_name="$(cut -f2 <<< $(lsb_release -i) | tr '[:upper:]' '[:lower:]')$(cut -f2 <<< $(lsb_release -r) | tr -d '.')"
+else
+  echo "Unable to deploy CUDA on this OS, please wait for a future script update"
+  exit 2
+fi
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+if [ -f $script_dir/requested_cuda_version.sh ]; then
+  source $script_dir/requested_cuda_version.sh
+else
+  echo "Unable to find requested_cuda_version.sh script"
+  exit 3
+fi
+
+sudo apt-key del 7fa2af80
+wget https://developer.download.nvidia.com/compute/cuda/repos/$distr_name/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get install -y --no-install-recommends build-essential g++
+sudo apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg software-properties-common wget
+sudo apt-get install -y --no-install-recommends zlib1g
+sudo apt-get dist-upgrade -y
+sudo apt-get install -y --no-install-recommends cuda-${CUDA_VERSION_DASHED}
+sudo apt-get install -y --no-install-recommends libcudnn8
+sudo apt-get install -y --no-install-recommends libcudnn8-dev
+
+sudo rm -rf /usr/local/cuda
+sudo ln -s /usr/local/cuda-${CUDA_VERSION} /usr/local/cuda
+
+sudo apt-get clean
diff --git a/darknet-master/scripts/dice_label.sh b/darknet-master/scripts/dice_label.sh
new file mode 100644
index 0000000..f19f8a4
--- /dev/null
+++ b/darknet-master/scripts/dice_label.sh
@@ -0,0 +1,20 @@
+mkdir -p images
+mkdir -p images/orig
+mkdir -p images/train
+mkdir -p images/val
+
+ffmpeg -i Face1.mp4 images/orig/face1_%6d.jpg
+ffmpeg -i Face2.mp4 images/orig/face2_%6d.jpg
+ffmpeg -i Face3.mp4 images/orig/face3_%6d.jpg
+ffmpeg -i Face4.mp4 images/orig/face4_%6d.jpg
+ffmpeg -i Face5.mp4 images/orig/face5_%6d.jpg
+ffmpeg -i Face6.mp4 images/orig/face6_%6d.jpg
+
+mogrify -resize 100x100^ -gravity center -crop 100x100+0+0 +repage images/orig/*
+
+ls images/orig/* | shuf | head -n 1000 | xargs mv -t images/val
+mv images/orig/* images/train
+
+find `pwd`/images/train > dice.train.list -name \*.jpg
+find `pwd`/images/val > dice.val.list -name \*.jpg
+
diff --git a/darknet-master/scripts/download_weights.ps1 b/darknet-master/scripts/download_weights.ps1
new file mode 100644
index 0000000..bc35260
--- /dev/null
+++ b/darknet-master/scripts/download_weights.ps1
@@ -0,0 +1,37 @@
+#!/usr/bin/env pwsh
+
+# Downloads the listed weight / pre-trained convolution files into the current
+# directory. Each entry is (source URL, local file name); entries are fetched
+# in the order written.
+$downloads = @(
+  @("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights", "yolov4-tiny.weights"),
+  @("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights", "yolov4.weights"),
+  @("https://drive.google.com/u/0/uc?id=18yYZWyKbo4XSDVyztmsEcF9B_6bxrhUY&export=download", "yolov3-tiny-prn.weights"),
+  @("https://pjreddie.com/media/files/yolov3.weights", "yolov3.weights"),
+  @("https://pjreddie.com/media/files/yolov3-openimages.weights", "yolov3-openimages.weights"),
+  @("https://pjreddie.com/media/files/yolov2.weights", "yolov2.weights"),
+  @("https://pjreddie.com/media/files/yolov3-tiny.weights", "yolov3-tiny.weights"),
+  @("https://pjreddie.com/media/files/yolov2-tiny.weights", "yolov2-tiny.weights"),
+  @("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.conv.29", "yolov4-tiny.conv.29"),
+  @("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137", "yolov4.conv.137"),
+  @("https://pjreddie.com/media/files/darknet53.conv.74", "darknet53.conv.74"),
+  @("https://pjreddie.com/media/files/darknet19_448.conv.23", "darknet19_448.conv.23")
+)
+
+foreach ($download in $downloads) {
+  $url = $download[0]
+  Invoke-WebRequest -Uri $url -OutFile $download[1]
+}
diff --git a/darknet-master/scripts/gen_anchors.py b/darknet-master/scripts/gen_anchors.py
new file mode 100644
index 0000000..a1a2343
--- /dev/null
+++ b/darknet-master/scripts/gen_anchors.py
@@ -0,0 +1,165 @@
+'''
+Created on Feb 20, 2017
+
+@author: jumabek
+'''
+from os import listdir
+from os.path import isfile, join
+import argparse
+#import cv2
+import numpy as np
+import sys
+import os
+import shutil
+import random 
+import math
+
+width_in_cfg_file = 416.
+height_in_cfg_file = 416.
+
+def IOU(x,centroids):
+    similarities = []
+    k = len(centroids)
+    for centroid in centroids:
+        c_w,c_h = centroid
+        w,h = x
+        if c_w>=w and c_h>=h:
+            similarity = w*h/(c_w*c_h)
+        elif c_w>=w and c_h<=h:
+            similarity = w*c_h/(w*h + (c_w-w)*c_h)
+        elif c_w<=w and c_h>=h:
+            similarity = c_w*h/(w*h + c_w*(c_h-h))
+        else: #means both w,h are bigger than c_w and c_h respectively
+            similarity = (c_w*c_h)/(w*h)
+        similarities.append(similarity) # will become (k,) shape
+    return np.array(similarities) 
+
+def avg_IOU(X,centroids):
+    n,d = X.shape
+    sum = 0.
+    for i in range(X.shape[0]):
+        #note IOU() will return array which contains IoU for each centroid and X[i] // slightly ineffective, but I am too lazy
+        sum+= max(IOU(X[i],centroids)) 
+    return sum/n
+
+def write_anchors_to_file(centroids,X,anchor_file):
+    f = open(anchor_file,'w')
+    
+    anchors = centroids.copy()
+    print(anchors.shape)
+
+    for i in range(anchors.shape[0]):
+        anchors[i][0]*=width_in_cfg_file/32.
+        anchors[i][1]*=height_in_cfg_file/32.
+         
+
+    widths = anchors[:,0]
+    sorted_indices = np.argsort(widths)
+
+    print('Anchors = ', anchors[sorted_indices])
+        
+    for i in sorted_indices[:-1]:
+        f.write('%0.2f,%0.2f, '%(anchors[i,0],anchors[i,1]))
+
+    #there should not be comma after last anchor, that's why
+    f.write('%0.2f,%0.2f\n'%(anchors[sorted_indices[-1:],0],anchors[sorted_indices[-1:],1]))
+    
+    f.write('%f\n'%(avg_IOU(X,centroids)))
+    print()
+
+def kmeans(X,centroids,eps,anchor_file):
+    
+    N = X.shape[0]
+    iterations = 0
+    k,dim = centroids.shape
+    prev_assignments = np.ones(N)*(-1)    
+    iter = 0
+    old_D = np.zeros((N,k))
+
+    while True:
+        D = [] 
+        iter+=1           
+        for i in range(N):
+            d = 1 - IOU(X[i],centroids)
+            D.append(d)
+        D = np.array(D) # D.shape = (N,k)
+        
+        print("iter {}: dists = {}".format(iter,np.sum(np.abs(old_D-D))))
+            
+        #assign samples to centroids 
+        assignments = np.argmin(D,axis=1)
+        
+        if (assignments == prev_assignments).all() :
+            print("Centroids = ",centroids)
+            write_anchors_to_file(centroids,X,anchor_file)
+            return
+
+        #calculate new centroids
+        centroid_sums=np.zeros((k,dim),np.float)
+        for i in range(N):
+            centroid_sums[assignments[i]]+=X[i]        
+        for j in range(k):            
+            centroids[j] = centroid_sums[j]/(np.sum(assignments==j))
+        
+        prev_assignments = assignments.copy()     
+        old_D = D.copy()  
+
+def main(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-filelist', default = '\\path\\to\\voc\\filelist\\train.txt', 
+                        help='path to filelist\n' )
+    parser.add_argument('-output_dir', default = 'generated_anchors/anchors', type = str, 
+                        help='Output anchor directory\n' )  
+    parser.add_argument('-num_clusters', default = 0, type = int, 
+                        help='number of clusters\n' )  
+
+   
+    args = parser.parse_args()
+    
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    f = open(args.filelist)
+  
+    lines = [line.rstrip('\n') for line in f.readlines()]
+    
+    annotation_dims = []
+
+    size = np.zeros((1,1,3))
+    for line in lines:
+                    
+        #line = line.replace('images','labels')
+        #line = line.replace('img1','labels')
+        line = line.replace('JPEGImages','labels')        
+        
+
+        line = line.replace('.jpg','.txt')
+        line = line.replace('.png','.txt')
+        print(line)
+        f2 = open(line)
+        for line in f2.readlines():
+            line = line.rstrip('\n')
+            w,h = line.split(' ')[3:]            
+            #print(w,h)
+            annotation_dims.append(tuple(map(float,(w,h))))
+    annotation_dims = np.array(annotation_dims)
+  
+    eps = 0.005
+    
+    if args.num_clusters == 0:
+        for num_clusters in range(1,11): #we make 1 through 10 clusters 
+            anchor_file = join( args.output_dir,'anchors%d.txt'%(num_clusters))
+
+            indices = [ random.randrange(annotation_dims.shape[0]) for i in range(num_clusters)]
+            centroids = annotation_dims[indices]
+            kmeans(annotation_dims,centroids,eps,anchor_file)
+            print('centroids.shape', centroids.shape)
+    else:
+        anchor_file = join( args.output_dir,'anchors%d.txt'%(args.num_clusters))
+        indices = [ random.randrange(annotation_dims.shape[0]) for i in range(args.num_clusters)]
+        centroids = annotation_dims[indices]
+        kmeans(annotation_dims,centroids,eps,anchor_file)
+        print('centroids.shape', centroids.shape)
+
+if __name__=="__main__":
+    main(sys.argv)
diff --git a/darknet-master/scripts/gen_tactic.sh b/darknet-master/scripts/gen_tactic.sh
new file mode 100644
index 0000000..ffa30d2
--- /dev/null
+++ b/darknet-master/scripts/gen_tactic.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Usage:
+# wget http://pjreddie.com/media/files/peek.weights
+# scripts/gen_tactic.sh < data/goal.txt
+./darknet rnn generatetactic cfg/gru.cfg peek.weights 2>/dev/null
diff --git a/darknet-master/scripts/get_coco2017.sh b/darknet-master/scripts/get_coco2017.sh
new file mode 100644
index 0000000..fed5747
--- /dev/null
+++ b/darknet-master/scripts/get_coco2017.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Downloads the COCO 2017 labels archive from Google Drive, unpacks it, then
+# downloads and unpacks the train/val image archives inside coco/images.
+#
+# Zip coco folder
+# zip -r coco.zip coco
+# tar -czvf coco.tar.gz coco
+
+# Download labels from Google Drive, accepting presented query
+filename="coco2017labels.zip"
+fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L"
+# First request only stores the cookie; the confirm token is then read back from it.
+curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
+curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=$(awk '/download/ {print $NF}' ./cookie)&id=${fileid}" -o "${filename}"
+rm ./cookie
+
+# Unzip labels
+unzip -q "${filename}"  # for coco.zip
+# tar -xzf "${filename}"  # for coco.tar.gz
+rm "${filename}"
+
+# Download and unzip images
+# Stop if the labels archive did not create coco/images; otherwise the image
+# archives below would be unpacked into the current directory.
+cd coco/images || exit 1
+f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f  # 19G, 118k images
+f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f  # 1G, 5k images
+# f="test2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f  # 7G,  41k images
+
+# cd out
+cd ../..
diff --git a/darknet-master/scripts/get_coco_dataset.sh b/darknet-master/scripts/get_coco_dataset.sh
new file mode 100644
index 0000000..d7911e6
--- /dev/null
+++ b/darknet-master/scripts/get_coco_dataset.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Clone COCO API
+git clone https://github.com/pdollar/coco
+cd coco
+
+mkdir images
+cd images
+
+# Download Images
+#very slow downloading
+#wget -c https://pjreddie.com/media/files/train2014.zip
+#wget -c https://pjreddie.com/media/files/val2014.zip
+wget -c http://images.cocodataset.org/zips/train2014.zip
+wget -c http://images.cocodataset.org/zips/val2014.zip
+
+# Unzip
+unzip -q train2014.zip
+unzip -q val2014.zip
+
+cd ..
+
+# Download COCO Metadata
+wget -c https://pjreddie.com/media/files/instances_train-val2014.zip
+wget -c https://pjreddie.com/media/files/coco/5k.part
+wget -c https://pjreddie.com/media/files/coco/trainvalno5k.part
+wget -c https://pjreddie.com/media/files/coco/labels.tgz
+tar xzf labels.tgz
+unzip -q instances_train-val2014.zip
+
+# Set Up Image Lists
+paste <(awk "{print \"$PWD\"}" <5k.part) 5k.part | tr -d '\t' > 5k.txt
+paste <(awk "{print \"$PWD\"}" <trainvalno5k.part) trainvalno5k.part | tr -d '\t' > trainvalno5k.txt
+
diff --git a/darknet-master/scripts/get_imagenet_train.sh b/darknet-master/scripts/get_imagenet_train.sh
new file mode 100644
index 0000000..b357842
--- /dev/null
+++ b/darknet-master/scripts/get_imagenet_train.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
+mkdir -p ILSVRC2012_img_train
+tar --force-local -xf ILSVRC2012_img_train.tar -C ILSVRC2012_img_train
+
+wd=`pwd`
+
+for f in ILSVRC2012_img_train/*.tar;
+do
+name=$(echo "$f" | cut -f 1 -d '.')
+mkdir "${wd}/${name}"
+tar --force-local -xf "${wd}/${f}" -C "${wd}/${name}"
+done
+
+find "${wd}/ILSVRC2012_img_train" -name \*.JPEG > imagenet1k.train.list
+
diff --git a/darknet-master/scripts/get_openimages_dataset.py b/darknet-master/scripts/get_openimages_dataset.py
new file mode 100644
index 0000000..3508b3e
--- /dev/null
+++ b/darknet-master/scripts/get_openimages_dataset.py
@@ -0,0 +1,19 @@
+import csv
+import os
+
+#select classes you want to download at https://github.com/openimages/dataset/blob/master/dict.csv
+CLASS_LIST = ('/m/01g317','/m/04yx4')
+img_name = "111111111111"
+
+#download csv from https://storage.googleapis.com/openimages/web/download.html
+with open('path\\train-annotations-bbox.csv', newline='') as csvfile:
+    bboxs = csv.reader(csvfile, delimiter=',', quotechar='|')
+    for bbox in bboxs:
+        if bbox[2] in CLASS_LIST:
+            if img_name != bbox[0]:
+                if not os.path.isfile("destination_path\\%s.jpg"%bbox[0]):
+                    os.system("gsutil cp gs://open-images-dataset/train/%s.jpg destination_path"%bbox[0])
+                    out_file = open("destination_path\\%s.txt"%bbox[0], 'w')
+                    img_name = bbox[0]
+            if img_name == bbox[0]:
+                out_file.write(str(CLASS_LIST.index(bbox[2])) + " " + str(float(bbox[4])+(float(bbox[5])-float(bbox[4]))/2) + " " + str(float(bbox[6])+(float(bbox[7])-float(bbox[6]))/2)+ " " + str(float(bbox[5])-float(bbox[4])) + " " + str(float(bbox[7])-float(bbox[6])) + '\n')
diff --git a/darknet-master/scripts/imagenet_label.sh b/darknet-master/scripts/imagenet_label.sh
new file mode 100644
index 0000000..1335c72
--- /dev/null
+++ b/darknet-master/scripts/imagenet_label.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_bbox_val_v3.tgz
+#other downloads: http://www.image-net.org/challenges/LSVRC/2012/nonpub-downloads
+#read: https://pjreddie.com/darknet/imagenet/
+
+mkdir -p labelled
+wd=`pwd`
+
+for f in val/*.xml;
+do
+label=`grep -m1 "<name>" $f | grep -oP '<name>\K[^<]*'`
+im=`echo $f | sed 's/val/imgs/; s/xml/JPEG/'`
+out=`echo $im | sed 's/JPEG/'${label}'.JPEG/; s/imgs/labelled/'`
+ln -s ${wd}/$im ${wd}/$out
+done
+
+find ${wd}/labelled -name \*.JPEG > inet.val.list
+
diff --git a/darknet-master/scripts/kitti2yolo.py b/darknet-master/scripts/kitti2yolo.py
new file mode 100644
index 0000000..4bd69db
--- /dev/null
+++ b/darknet-master/scripts/kitti2yolo.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from __future__ import division
+#
+# This is a utility for converting ground truth data from the kitti format
+# to the YOLO format.
+#
+#
+#
+# YOLO FORMAT
+# .txt for each .jpg - in the same directory and with the same name
+# <object-class> <x> <y> <width> <height>
+#
+# Where:
+#
+# <object-class> - integer number of object from 0 to (classes-1)
+# <x> <y> <width> <height> - floats relative to image width/height 0.0 to 1.0
+# eg. <x> = <absolute_x> / <image_width>
+# Note: <x> <y> - are center of rectangle (not top-left corner)
+#
+# For example for img1.jpg you will be created img1.txt containing:
+#
+#                        1 0.716797 0.395833 0.216406 0.147222
+#                        0 0.687109 0.379167 0.255469 0.158333
+#                        1 0.420312 0.395833 0.140625 0.166667
+#
+# KITTI FORMAT
+#
+# All images as .png in a separate folder to the .txt labels of the same name
+# One label line is as follows:
+#
+#    1 type Describes the type of object: Car, Van, Truck,
+#    Pedestrian, Person_sitting, Cyclist, Tram,
+#    Misc or DontCare
+#    1 truncated Float from 0 (non-truncated) to 1 (truncated), where
+#    truncated refers to the object leaving image boundaries
+#    1 occluded Integer (0,1,2,3) indicating occlusion state:
+#    0 = fully visible, 1 = partly occluded
+#    2 = largely occluded, 3 = unknown
+#    1 alpha Observation angle of object, ranging [-pi..pi]
+#    4 bbox 2D bounding box of object in the image (0-based index):
+#    contains left, top, right, bottom pixel coordinates
+#    3 dimensions 3D object dimensions: height, width, length (in meters)
+#    3 location 3D object location x,y,z in camera coordinates (in meters)
+#    1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi]
+#    1 score Only for results: Float, indicating confidence in
+#    detection, needed for p/r curves, higher is better.
+#
+# Car 0.0 0 -1.5 57.0 17.3 614.1 200.12 1.65 1.67 3.64 -0.65 1.71 46.70 -1.59
+# Cyclist 0.0 0 -2.46 665.45 160.00 717.9 217.9 1.7 0.4 1.6 2.4 1.3 22.1 -2.35
+# Pedestrian 0.00 2 0.2 42.1 17.6 433.1 24.0 1.6 0.38 0.30 -5.8 1.6 23.1 -0.03
+# DontCare -1 -1 -10 650.19 175.02 668.98 210.48 -1 -1 -1 -1000 -1000 -1000 -10
+
+# core imports
+import argparse
+import sys
+import os
+import shutil
+import cv2
+
+
+# Maps a KITTI object type (first field of a label line) to the YOLO class id
+# written to the output. Ids are strings because they are concatenated into
+# the label line. Several KITTI types share one id (Car/Van -> 0,
+# Pedestrian/Person_sitting -> 1, Tram/Misc/DontCare -> 6); ids 4 and 5 are
+# not produced by this table.
+kitti2yolotype_dict = {'Car': '0',
+                       'Van': '0',
+                       'Pedestrian': '1',
+                       'Person_sitting': '1',
+                       'Cyclist': '2',
+                       'Truck': '3',
+                       'Tram': '6',
+                       'Misc': '6',
+                       'DontCare': '6'}
+
+
+def kitti2yolo(kitti_label, img_height, img_width):
+
+    kitti_label_arr = kitti_label.split(' ')
+    x1 = float(kitti_label_arr[4])
+    y1 = float(kitti_label_arr[5])
+    x2 = float(kitti_label_arr[6])
+    y2 = float(kitti_label_arr[7])
+
+    bb_width = x2 - x1
+    bb_height = y2 - y1
+    yolo_x = (x1 + 0.5*bb_width) / img_width
+    yolo_y = (y1 + 0.5*bb_height) / img_height
+    yolo_bb_width = bb_width / img_width
+    yolo_bb_height = bb_height / img_height
+    yolo_label = kitti2yolotype_dict[kitti_label_arr[0]]
+
+    return (yolo_label + ' '
+            + str(yolo_x) + ' '
+            + str(yolo_y) + ' '
+            + str(yolo_bb_width) + ' '
+            + str(yolo_bb_height))
+
+
+def main(args):
+
+    # parse command line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--kitti",
+                        help="path to kitti-format images and labels, images\
+                        should be under images_path/images and labels should\
+                        be under images_path/labels")
+    parser.add_argument("--yolo",
+                        help="path to output yolo-ready training data")
+    # kitti paths
+    args = parser.parse_args()
+    root_path = args.kitti
+    yolo_path = args.yolo
+    if root_path is None:
+        root_path = os.getcwd()
+    if (root_path[-1] != os.sep):
+        root_path += os.sep
+    kitti_images_path = root_path + 'image_2' + os.sep
+    kitti_labels_path = root_path + 'label_2' + os.sep
+
+    # yolo paths
+    if yolo_path is None:
+        yolo_path = root_path + 'yolo_labels' + os.sep
+
+    if not os.path.exists(yolo_path):
+        os.makedirs(yolo_path)
+
+    # load each kitti label, convert to yolo and save
+    for labelfilename in os.listdir(kitti_labels_path):
+        yolo_labels = []
+        with open(kitti_labels_path + labelfilename, 'r') as kittilabelfile:
+            cvimage = cv2.imread(kitti_images_path
+                                 + labelfilename.split('.txt')[0] + '.png')
+            height, width, frame_depth = cvimage.shape
+            for kitti_label in kittilabelfile:
+                yolo_labels.append(kitti2yolo(kitti_label,
+                                              img_height=height,
+                                              img_width=width))
+        with open(yolo_path + labelfilename, 'w+') as yololabelfile:
+            for label in yolo_labels:
+                yololabelfile.write(label + '\n')
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/darknet-master/scripts/kmeansiou.c b/darknet-master/scripts/kmeansiou.c
new file mode 100644
index 0000000..30f3391
--- /dev/null
+++ b/darknet-master/scripts/kmeansiou.c
@@ -0,0 +1,391 @@
+//usr/bin/cc -Ofast -lm "${0}" -o "${0%.c}" && ./"${0%.c}" "$@"; s=$?; rm ./"${0%.c}"; exit $s
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+typedef struct matrix{
+    int rows, cols;
+    double **vals;
+} matrix;
+
+matrix csv_to_matrix(char *filename, int header);
+matrix make_matrix(int rows, int cols);
+void zero_matrix(matrix m);
+
+void copy(double *x, double *y, int n);
+double dist(double *x, double *y, int n);
+int *sample(int n);
+
+int find_int_arg(int argc, char **argv, char *arg, int def);
+int find_arg(int argc, char* argv[], char *arg);
+
+// Return the index of the row of `centers` with the smallest dist() to
+// `datum`. Ties keep the lowest index.
+int closest_center(double *datum, matrix centers)
+{
+    int winner = 0;
+    double winner_dist = dist(datum, centers.vals[0], centers.cols);
+    int c;
+    // row 0 is the starting candidate, so the scan begins at row 1
+    for(c = 1; c < centers.rows; ++c){
+        double candidate = dist(datum, centers.vals[c], centers.cols);
+        if(candidate < winner_dist){
+            winner = c;
+            winner_dist = candidate;
+        }
+    }
+    return winner;
+}
+
+// Return dist() between `datum` and whichever center closest_center() selects.
+double dist_to_closest_center(double *datum, matrix centers)
+{
+    double *nearest = centers.vals[closest_center(datum, centers)];
+    return dist(datum, nearest, centers.cols);
+}
+
+// Assign every data row to its closest center, writing the index into
+// `assignments`. Returns 1 when no assignment changed, 0 otherwise.
+int kmeans_expectation(matrix data, int *assignments, matrix centers)
+{
+    int changed = 0;
+    int row;
+    for(row = 0; row < data.rows; ++row){
+        int nearest = closest_center(data.vals[row], centers);
+        if(assignments[row] != nearest){
+            assignments[row] = nearest;
+            changed = 1;
+        }
+    }
+    return !changed;
+}
+
+// Recompute every center as the mean of the data rows assigned to it.
+// A center with no assigned rows is left at all zeros.
+void kmeans_maximization(matrix data, int *assignments, matrix centers)
+{
+    int i,j;
+    int *counts = calloc(centers.rows, sizeof(int));
+    zero_matrix(centers);
+    // accumulate per-center sums and member counts
+    for(i = 0; i < data.rows; ++i){
+        ++counts[assignments[i]];
+        for(j = 0; j < data.cols; ++j){
+            centers.vals[assignments[i]][j] += data.vals[i][j];
+        }
+    }
+    // sums -> means; skipping empty centers avoids a division by zero
+    for(i = 0; i < centers.rows; ++i){
+        if(counts[i]){
+            for(j = 0; j < centers.cols; ++j){
+                centers.vals[i][j] /= counts[i];
+            }
+        }
+    }
+    // this runs once per k-means iteration, so the scratch array must be released
+    free(counts);
+}
+
+// Return the average over all data rows of (1 - dist(row, assigned center)).
+double WCSS(matrix data, int *assignments, matrix centers)
+{
+    double total = 0;
+    int row;
+    for(row = 0; row < data.rows; ++row){
+        double *center = centers.vals[assignments[row]];
+        total += (1 - dist(data.vals[row], center, data.cols));
+    }
+    return total / data.rows;
+}
+
+// Result of one k-means run, as filled in by do_kmeans().
+typedef struct{
+    int *assignments;   // one entry per data row: index of the center it is assigned to
+    matrix centers;     // one row per cluster center
+} model;
+
+// Seed `centers` from the data: the first center is a uniformly random row,
+// every further center is a row drawn with probability proportional to its
+// dist() to the closest center chosen so far.
+void smart_centers(matrix data, matrix centers) {
+    int i,j;
+    copy(data.vals[rand()%data.rows], centers.vals[0], data.cols);
+    double *weights = calloc(data.rows, sizeof(double));
+    int clusters = centers.rows;
+    for (i = 1; i < clusters; ++i) {
+        double sum = 0;
+        // `centers` is a by-value copy, so shrinking rows only limits the
+        // search below to the centers that have already been chosen
+        centers.rows = i;
+        for (j = 0; j < data.rows; ++j) {
+            weights[j] = dist_to_closest_center(data.vals[j], centers);
+            sum += weights[j];
+        }
+        double r = sum*((double)rand()/RAND_MAX);
+        // Fallback for the case where rounding leaves r slightly above zero
+        // after all weights are subtracted: use the last row with a non-zero
+        // weight instead of leaving the center at all zeros.
+        int chosen = data.rows - 1;
+        for (j = 0; j < data.rows; ++j) {
+            if(weights[j] > 0) chosen = j;
+            r -= weights[j];
+            if(r <= 0){
+                chosen = j;
+                break;
+            }
+        }
+        copy(data.vals[chosen], centers.vals[i], data.cols);
+    }
+    free(weights);
+}
+
+// Fill `centers` with distinct data rows, taken from the front of a random
+// permutation of the row indices.
+void random_centers(matrix data, matrix centers){
+    int *order = sample(data.rows);
+    int c = 0;
+    while(c < centers.rows){
+        copy(data.vals[order[c]], centers.vals[c], data.cols);
+        ++c;
+    }
+    free(order);
+}
+
+// Run k-means on `data` with `k` centers and return the final assignments and
+// centers. Both are heap-allocated here and owned by the caller.
+model do_kmeans(matrix data, int k)
+{
+    matrix centers = make_matrix(k, data.cols);
+    // calloc: every row starts out assigned to center 0
+    int *assignments = calloc(data.rows, sizeof(int));
+    smart_centers(data, centers);
+    //random_centers(data, centers);
+    // With one center the first expectation step cannot change any assignment
+    // (all are already 0) and reports convergence immediately, so the center
+    // is moved to the mean explicitly first.
+    if(k == 1) kmeans_maximization(data, assignments, centers);
+    // alternate assignment and center update until no assignment changes
+    while(!kmeans_expectation(data, assignments, centers)){
+        kmeans_maximization(data, assignments, centers);
+    }
+    model m;
+    m.assignments = assignments;
+    m.centers = centers;
+    return m;
+}
+
+// Command line entry point.
+//
+//   <prog> <csv-file> <assignments|centers|scan> [-k <clusters>] [-h] [-c]
+//
+//   assignments  print the data rows grouped by cluster, groups separated by "-"
+//   centers      print the WCSS() value and the centers (with member counts when -c is given)
+//   scan         print the WCSS() value for 1..k clusters
+//   -k           number of clusters (default 2)
+//   -h           skip the first line of the csv file
+//   -c           prefix every center with its member count
+int main(int argc, char *argv[])
+{
+    if(argc < 3){
+        fprintf(stderr, "usage: %s <csv-file> [assignments/centers/scan] [-k <clusters>] [-h] [-c]\n", argv[0]);
+        return 0;
+    }
+    int i,j;
+    srand(time(0));
+
+    // Flags are removed from argv first, so that -h is known before the file
+    // is read and so that the positional arguments end up in argv[1]/argv[2]
+    // wherever the flags were placed.
+    int k = find_int_arg(argc, argv, "-k", 2);
+    int header = find_arg(argc, argv, "-h");
+    int count = find_arg(argc, argv, "-c");
+
+    // Removing flags leaves NULL entries at the end of argv.
+    if(!argv[1] || !argv[2]){
+        fprintf(stderr, "usage: %s <csv-file> [assignments/centers/scan] [-k <clusters>] [-h] [-c]\n", argv[0]);
+        return 0;
+    }
+    if(k < 1){
+        fprintf(stderr, "Error: -k must be at least 1\n");
+        return 1;
+    }
+
+    matrix data = csv_to_matrix(argv[1], header);
+    if(data.rows == 0){
+        fprintf(stderr, "Error: no data rows in %s\n", argv[1]);
+        return 1;
+    }
+
+    if(strcmp(argv[2], "assignments")==0){
+        model m = do_kmeans(data, k);
+        int *assignments = m.assignments;
+        for(i = 0; i < k; ++i){
+            if(i != 0) printf("-\n");
+            for(j = 0; j < data.rows; ++j){
+                if(!(assignments[j] == i)) continue;
+                printf("%f, %f\n", data.vals[j][0], data.vals[j][1]);
+            }
+        }
+    }else if(strcmp(argv[2], "centers")==0){
+        model m = do_kmeans(data, k);
+        printf("WCSS: %f\n", WCSS(data, m.assignments, m.centers));
+        int *counts = 0;
+        if(count){
+            counts = calloc(k, sizeof(int));
+            for(j = 0; j < data.rows; ++j){
+                ++counts[m.assignments[j]];
+            }
+        }
+        for(j = 0; j < m.centers.rows; ++j){
+            if(count) printf("%d, ", counts[j]);
+            printf("%f, %f\n", m.centers.vals[j][0], m.centers.vals[j][1]);
+        }
+        free(counts);
+    }else if(strcmp(argv[2], "scan")==0){
+        for(i = 1; i <= k; ++i){
+            model m = do_kmeans(data, i);
+            printf("%f\n", WCSS(data, m.assignments, m.centers));
+        }
+    }
+    return 0;
+}
+
+// Utility functions
+
+// Return a newly allocated random permutation of 0..n-1 (shuffled in place
+// from the back using rand()). The caller frees the result.
+int *sample(int n)
+{
+    int *perm = calloc(n, sizeof(int));
+    int i;
+    for(i = 0; i < n; ++i) perm[i] = i;
+    i = n;
+    while(i-- > 0){
+        int pick = rand()%(i+1);
+        int held = perm[i];
+        perm[i] = perm[pick];
+        perm[pick] = held;
+    }
+    return perm;
+}
+
+// Return 1 - IoU of two boxes given as {width, height}. The intersection is
+// min(width) * min(height). Only the first two elements are read; `n` is unused.
+double dist(double *x, double *y, int n)
+{
+    double overlap_w = y[0];
+    double overlap_h = y[1];
+    if(x[0] < y[0]) overlap_w = x[0];
+    if(x[1] < y[1]) overlap_h = x[1];
+
+    double intersection = overlap_w*overlap_h;
+    double both_areas = x[0]*x[1] + y[0]*y[1];
+    double uni = both_areas - intersection;
+    return 1 - intersection/uni;
+}
+
+// Copy the first n doubles of x into y, front to back.
+void copy(double *x, double *y, int n)
+{
+    double *src = x;
+    double *dst = y;
+    int remaining = n;
+    while(remaining-- > 0) *dst++ = *src++;
+}
+
+// Print "Error: <s>" to stderr and terminate the process with exit(-1).
+void error(char *s){
+    fprintf(stderr, "Error: %s\n", s);
+    exit(-1);
+}
+
+// Read one line of any length from fp into a newly allocated string, without
+// the trailing newline. Returns 0 at end of file. The caller frees the result.
+// Calls error() and exits when memory cannot be allocated.
+char *fgetl(FILE *fp)
+{
+    if(feof(fp)) return 0;
+    size_t size = 512;
+    char *line = malloc(size*sizeof(char));
+    if(!line) error("Malloc");
+    if(!fgets(line, (int)size, fp)){
+        free(line);
+        return 0;
+    }
+
+    size_t curr = strlen(line);
+
+    // Keep growing while the line is not complete. curr > 0 guards the
+    // line[curr-1] access when fgets produced an empty string.
+    while(curr > 0 && line[curr-1] != '\n'){
+        size *= 2;
+        char *grown = realloc(line, size*sizeof(char));
+        if(!grown){
+            free(line);
+            error("Malloc");
+        }
+        line = grown;
+        // A last line without a newline ends here: fgets returns NULL at end
+        // of file and the buffer must not be grown any further.
+        if(!fgets(&line[curr], (int)(size-curr), fp)) break;
+        curr = strlen(line);
+    }
+    // Strip the newline only when there is one, so an unterminated last line
+    // keeps its final character.
+    if(curr > 0 && line[curr-1] == '\n') line[curr-1] = '\0';
+
+    return line;
+}
+
+// Matrix stuff
+
+// Return the number of comma-separated fields in `line`: the number of commas
+// plus one, so an empty string counts as one field.
+int count_fields(char *line)
+{
+    int fields = 1;
+    char *p;
+    for(p = line; *p != '\0'; ++p){
+        if(*p == ',') ++fields;
+    }
+    return fields;
+}
+
+// Parse the first n comma-separated fields of `l` with atof() into a newly
+// allocated array of n doubles. Fields missing from the line stay 0.
+// The caller frees the result.
+double *parse_fields(char *l, int n)
+{
+    int i;
+    double *field = calloc(n, sizeof(double));
+    for(i = 0; i < n && l; ++i){
+        field[i] = atof(l);
+        // strchr returns NULL after the last field; stepping past a NULL
+        // result is undefined, so stop instead.
+        char *comma = strchr(l, ',');
+        l = comma ? comma+1 : 0;
+    }
+    return field;
+}
+
+// Allocate a rows x cols matrix with every value zero (calloc).
+matrix make_matrix(int rows, int cols)
+{
+    matrix m;
+    int r;
+    m.rows = rows;
+    m.cols = cols;
+    m.vals = calloc(rows, sizeof(double *));
+    for(r = 0; r < rows; ++r){
+        m.vals[r] = calloc(cols, sizeof(double));
+    }
+    return m;
+}
+
+// Set every value of m to 0.
+void zero_matrix(matrix m)
+{
+    int r;
+    for(r = 0; r < m.rows; ++r){
+        double *row = m.vals[r];
+        int c = 0;
+        while(c < m.cols){
+            row[c] = 0;
+            ++c;
+        }
+    }
+}
+
+// Load a comma-separated file into a matrix, one row per line. The number of
+// columns is taken from the first data line. When `header` is non-zero the
+// first line of the file is skipped. Calls error() (which exits) when the
+// file cannot be opened or memory runs out. For a file without data lines
+// rows is 0 and cols stays -1.
+matrix csv_to_matrix(char *filename, int header)
+{
+    FILE *fp = fopen(filename, "r");
+    if(!fp) error(filename);
+
+    matrix m;
+    m.cols = -1;
+
+    char *line;
+
+    int n = 0;
+    int size = 1024;
+    m.vals = calloc(size, sizeof(double*));
+    if(!m.vals) error("Malloc");
+    // fgetl returns an allocated line; release the discarded header
+    if(header) free(fgetl(fp));
+    while((line = fgetl(fp))){
+        if(m.cols == -1) m.cols = count_fields(line);
+        if(n == size){
+            size *= 2;
+            double **grown = realloc(m.vals, size*sizeof(double*));
+            if(!grown) error("Malloc");
+            m.vals = grown;
+        }
+        m.vals[n] = parse_fields(line, m.cols);
+        free(line);
+        ++n;
+    }
+    fclose(fp);
+    // shrink to the number of rows actually read; skipped for n == 0 because
+    // a zero-size realloc may free the block
+    if(n > 0){
+        double **fitted = realloc(m.vals, n*sizeof(double*));
+        if(fitted) m.vals = fitted;
+    }
+    m.rows = n;
+    return m;
+}
+
+// Argument parsing
+
+// Remove argv[index] by shifting the following entries down one slot; the
+// slot freed at the end (argv[argc-1]) is set to NULL.
+void del_arg(int argc, char **argv, int index)
+{
+    int pos = index;
+    while(pos < argc-1){
+        argv[pos] = argv[pos+1];
+        ++pos;
+    }
+    argv[pos] = 0;
+}
+
+// Look for the flag `arg` in argv. When found, remove its first occurrence
+// and return 1; otherwise return 0. NULL entries are skipped.
+int find_arg(int argc, char* argv[], char *arg)
+{
+    int pos;
+    for(pos = 0; pos < argc; ++pos){
+        if(argv[pos] && strcmp(argv[pos], arg) == 0){
+            del_arg(argc, argv, pos);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Look for "<arg> <value>" in argv. When found, remove both entries and
+// return atoi(value); otherwise return `def`. The last argv entry is never
+// treated as the flag because it has no value after it.
+int find_int_arg(int argc, char **argv, char *arg, int def)
+{
+    int value = def;
+    int pos = 0;
+    while(pos < argc-1){
+        if(argv[pos] && strcmp(argv[pos], arg) == 0){
+            value = atoi(argv[pos+1]);
+            del_arg(argc, argv, pos);   // drops the flag
+            del_arg(argc, argv, pos);   // drops the value that moved into its slot
+            break;
+        }
+        ++pos;
+    }
+    return value;
+}
+
+// Look for "<arg> <value>" in argv. When found, remove both entries and
+// return atof(value) as a float; otherwise return `def`.
+float find_float_arg(int argc, char **argv, char *arg, float def)
+{
+    float value = def;
+    int pos = 0;
+    while(pos < argc-1){
+        if(argv[pos] && strcmp(argv[pos], arg) == 0){
+            value = atof(argv[pos+1]);
+            del_arg(argc, argv, pos);   // drops the flag
+            del_arg(argc, argv, pos);   // drops the value that moved into its slot
+            break;
+        }
+        ++pos;
+    }
+    return value;
+}
+
+// Look for "<arg> <value>" in argv. When found, remove both entries and
+// return the value string itself (not a copy); otherwise return `def`.
+char *find_char_arg(int argc, char **argv, char *arg, char *def)
+{
+    char *value = def;
+    int pos = 0;
+    while(pos < argc-1){
+        if(argv[pos] && strcmp(argv[pos], arg) == 0){
+            value = argv[pos+1];
+            del_arg(argc, argv, pos);   // drops the flag
+            del_arg(argc, argv, pos);   // drops the value that moved into its slot
+            break;
+        }
+        ++pos;
+    }
+    return value;
+}
diff --git a/darknet-master/scripts/log_parser/log_parser.py b/darknet-master/scripts/log_parser/log_parser.py
new file mode 100644
index 0000000..507c5da
--- /dev/null
+++ b/darknet-master/scripts/log_parser/log_parser.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2018/4/25 20:28
+# @Author  : Adesun
+# @Site    : https://github.com/Adesun
+# @File    : log_parser.py
+
+import argparse
+import logging
+import os
+import platform
+import re
+import sys
+
+# set non-interactive backend default when os is not windows
+if sys.platform != 'win32':
+    import matplotlib
+    matplotlib.use('Agg')
+
+import matplotlib.pyplot as plt
+from matplotlib.ticker import MultipleLocator, FormatStrFormatter
+
+
+def get_file_name_and_ext(filename):
+    (file_path, temp_filename) = os.path.split(filename)
+    (file_name, file_ext) = os.path.splitext(temp_filename)
+    return file_name, file_ext
+
+
+def show_message(message, stop=False):
+    print(message)
+    if stop:
+        sys.exit(0)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="training log parser by DeepKeeper ")
+    parser.add_argument('--source-dir', dest='source_dir', type=str, default='./',
+                        help='the log source directory')
+    parser.add_argument('--save-dir', dest='save_dir', type=str, default='./',
+                        help='the directory to be saved')
+    parser.add_argument('--csv-file', dest='csv_file', type=str, default="",
+                        help='training log file')
+    parser.add_argument('--log-file', dest='log_file', type=str, default="",
+                        help='training log file')
+    parser.add_argument('--show', dest='show_plot', type=bool, default=False,
+                        help='whether to show')
+    return parser.parse_args()
+
+
+def log_parser(args):
+    if not args.log_file:
+        show_message('log file must be specified.', True)
+
+    log_path = os.path.join(args.source_dir, args.log_file)
+    if not os.path.exists(log_path):
+        show_message('log file does not exist.', True)
+
+    file_name, _ = get_file_name_and_ext(log_path)
+    log_content = open(log_path).read()
+
+    iterations = []
+    losses = []
+    fig, ax = plt.subplots()
+    # set area we focus on
+    ax.set_ylim(0, 8)
+
+    major_locator = MultipleLocator()
+    minor_locator = MultipleLocator(0.5)
+    ax.yaxis.set_major_locator(major_locator)
+    ax.yaxis.set_minor_locator(minor_locator)
+    ax.yaxis.grid(True, which='minor')
+
+    pattern = re.compile(r"([\d].*): .*?, (.*?) avg")
+    # print(pattern.findall(log_content))
+    matches = pattern.findall(log_content)
+    # print(type(matches[0]))
+    counter = 0
+    log_count = len(matches)
+
+    if args.csv_file != '':
+        csv_path = os.path.join(args.save_dir, args.csv_file)
+        out_file = open(csv_path, 'w')
+    else:
+        csv_path = os.path.join(args.save_dir, file_name + '.csv')
+        out_file = open(csv_path, 'w')
+
+    for match in matches:
+        counter += 1
+        if log_count > 200:
+            if counter % 200 == 0:
+                print('parsing {}/{}'.format(counter, log_count))
+        else:
+            print('parsing {}/{}'.format(counter, log_count))
+        iteration, loss = match
+        iterations.append(int(iteration))
+        losses.append(float(loss))
+        out_file.write(iteration + ',' + loss + '\n')
+
+    ax.plot(iterations, losses)
+    plt.xlabel('Iteration')
+    plt.ylabel('Loss')
+    plt.tight_layout()
+
+    # saved as svg
+    save_path = os.path.join(args.save_dir, file_name + '.svg')
+    plt.savefig(save_path, dpi=300, format="svg")
+    if args.show_plot:
+        plt.show()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    log_parser(args)
diff --git a/darknet-master/scripts/log_parser/readme.md b/darknet-master/scripts/log_parser/readme.md
new file mode 100644
index 0000000..1670552
--- /dev/null
+++ b/darknet-master/scripts/log_parser/readme.md
@@ -0,0 +1,17 @@
+# Parsing a training log and plotting the loss curve
+
+## Requirements
+
+1. matplotlib
+
+## Usage
+
+1. `--source-dir`: the directory containing the training log files
+2. `--save-dir`: the directory in which to save the loss curve image and the CSV file
+3. `--log-file`: the name of the log file to parse
+4. `--csv-file`: the name of the CSV file for the loss data; by default it is the log file's name with a `.csv` extension
+5. `--show`: whether to display the plot after parsing (default: False); only works on Windows or on Linux with a GUI desktop
+
+`python log_parser.py --source-dir ./ --save-dir ./ --log-file test.log --show true`
+
+![plot](https://github.com/AlexeyAB/darknet/blob/master/scripts/log_parser/test_new.svg)
diff --git a/darknet-master/scripts/log_parser/test_new.svg b/darknet-master/scripts/log_parser/test_new.svg
new file mode 100644
index 0000000..a749a78
--- /dev/null
+++ b/darknet-master/scripts/log_parser/test_new.svg
@@ -0,0 +1,2953 @@
+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Created with matplotlib (http://matplotlib.org/) -->
+<svg height="345.6pt" version="1.1" viewBox="0 0 460.8 345.6" width="460.8pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <defs>
+  <style type="text/css">
+*{stroke-linecap:butt;stroke-linejoin:round;}
+  </style>
+ </defs>
+ <g id="figure_1">
+  <g id="patch_1">
+   <path d="M 0 345.6 
+L 460.8 345.6 
+L 460.8 0 
+L 0 0 
+z
+" style="fill:#ffffff;"/>
+  </g>
+  <g id="axes_1">
+   <g id="patch_2">
+    <path d="M 38.27 303.64 
+L 447.48 303.64 
+L 447.48 14.76 
+L 38.27 14.76 
+z
+" style="fill:#ffffff;"/>
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="line2d_1">
+      <defs>
+       <path d="M 0 0 
+L 0 3.5 
+" id="mcf83c14d4c" style="stroke:#000000;stroke-width:0.8;"/>
+      </defs>
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="56.776227" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_1">
+      <!-- 0 -->
+      <defs>
+       <path d="M 31.78125 66.40625 
+Q 24.171875 66.40625 20.328125 58.90625 
+Q 16.5 51.421875 16.5 36.375 
+Q 16.5 21.390625 20.328125 13.890625 
+Q 24.171875 6.390625 31.78125 6.390625 
+Q 39.453125 6.390625 43.28125 13.890625 
+Q 47.125 21.390625 47.125 36.375 
+Q 47.125 51.421875 43.28125 58.90625 
+Q 39.453125 66.40625 31.78125 66.40625 
+z
+M 31.78125 74.21875 
+Q 44.046875 74.21875 50.515625 64.515625 
+Q 56.984375 54.828125 56.984375 36.375 
+Q 56.984375 17.96875 50.515625 8.265625 
+Q 44.046875 -1.421875 31.78125 -1.421875 
+Q 19.53125 -1.421875 13.0625 8.265625 
+Q 6.59375 17.96875 6.59375 36.375 
+Q 6.59375 54.828125 13.0625 64.515625 
+Q 19.53125 74.21875 31.78125 74.21875 
+z
+" id="DejaVuSans-30"/>
+      </defs>
+      <g transform="translate(53.594977 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="line2d_2">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="103.889841" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_2">
+      <!-- 500 -->
+      <defs>
+       <path d="M 10.796875 72.90625 
+L 49.515625 72.90625 
+L 49.515625 64.59375 
+L 19.828125 64.59375 
+L 19.828125 46.734375 
+Q 21.96875 47.46875 24.109375 47.828125 
+Q 26.265625 48.1875 28.421875 48.1875 
+Q 40.625 48.1875 47.75 41.5 
+Q 54.890625 34.8125 54.890625 23.390625 
+Q 54.890625 11.625 47.5625 5.09375 
+Q 40.234375 -1.421875 26.90625 -1.421875 
+Q 22.3125 -1.421875 17.546875 -0.640625 
+Q 12.796875 0.140625 7.71875 1.703125 
+L 7.71875 11.625 
+Q 12.109375 9.234375 16.796875 8.0625 
+Q 21.484375 6.890625 26.703125 6.890625 
+Q 35.15625 6.890625 40.078125 11.328125 
+Q 45.015625 15.765625 45.015625 23.390625 
+Q 45.015625 31 40.078125 35.4375 
+Q 35.15625 39.890625 26.703125 39.890625 
+Q 22.75 39.890625 18.8125 39.015625 
+Q 14.890625 38.140625 10.796875 36.28125 
+z
+" id="DejaVuSans-35"/>
+      </defs>
+      <g transform="translate(94.346091 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-35"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-30"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="line2d_3">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="151.003454" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_3">
+      <!-- 1000 -->
+      <defs>
+       <path d="M 12.40625 8.296875 
+L 28.515625 8.296875 
+L 28.515625 63.921875 
+L 10.984375 60.40625 
+L 10.984375 69.390625 
+L 28.421875 72.90625 
+L 38.28125 72.90625 
+L 38.28125 8.296875 
+L 54.390625 8.296875 
+L 54.390625 0 
+L 12.40625 0 
+z
+" id="DejaVuSans-31"/>
+      </defs>
+      <g transform="translate(138.278454 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-30"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="line2d_4">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="198.117067" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_4">
+      <!-- 1500 -->
+      <g transform="translate(185.392067 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-35"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="line2d_5">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="245.230681" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_5">
+      <!-- 2000 -->
+      <defs>
+       <path d="M 19.1875 8.296875 
+L 53.609375 8.296875 
+L 53.609375 0 
+L 7.328125 0 
+L 7.328125 8.296875 
+Q 12.9375 14.109375 22.625 23.890625 
+Q 32.328125 33.6875 34.8125 36.53125 
+Q 39.546875 41.84375 41.421875 45.53125 
+Q 43.3125 49.21875 43.3125 52.78125 
+Q 43.3125 58.59375 39.234375 62.25 
+Q 35.15625 65.921875 28.609375 65.921875 
+Q 23.96875 65.921875 18.8125 64.3125 
+Q 13.671875 62.703125 7.8125 59.421875 
+L 7.8125 69.390625 
+Q 13.765625 71.78125 18.9375 73 
+Q 24.125 74.21875 28.421875 74.21875 
+Q 39.75 74.21875 46.484375 68.546875 
+Q 53.21875 62.890625 53.21875 53.421875 
+Q 53.21875 48.921875 51.53125 44.890625 
+Q 49.859375 40.875 45.40625 35.40625 
+Q 44.1875 33.984375 37.640625 27.21875 
+Q 31.109375 20.453125 19.1875 8.296875 
+z
+" id="DejaVuSans-32"/>
+      </defs>
+      <g transform="translate(232.505681 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-32"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-30"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="line2d_6">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="292.344294" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_6">
+      <!-- 2500 -->
+      <g transform="translate(279.619294 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-32"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-35"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="line2d_7">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="339.457907" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_7">
+      <!-- 3000 -->
+      <defs>
+       <path d="M 40.578125 39.3125 
+Q 47.65625 37.796875 51.625 33 
+Q 55.609375 28.21875 55.609375 21.1875 
+Q 55.609375 10.40625 48.1875 4.484375 
+Q 40.765625 -1.421875 27.09375 -1.421875 
+Q 22.515625 -1.421875 17.65625 -0.515625 
+Q 12.796875 0.390625 7.625 2.203125 
+L 7.625 11.71875 
+Q 11.71875 9.328125 16.59375 8.109375 
+Q 21.484375 6.890625 26.8125 6.890625 
+Q 36.078125 6.890625 40.9375 10.546875 
+Q 45.796875 14.203125 45.796875 21.1875 
+Q 45.796875 27.640625 41.28125 31.265625 
+Q 36.765625 34.90625 28.71875 34.90625 
+L 20.21875 34.90625 
+L 20.21875 43.015625 
+L 29.109375 43.015625 
+Q 36.375 43.015625 40.234375 45.921875 
+Q 44.09375 48.828125 44.09375 54.296875 
+Q 44.09375 59.90625 40.109375 62.90625 
+Q 36.140625 65.921875 28.71875 65.921875 
+Q 24.65625 65.921875 20.015625 65.03125 
+Q 15.375 64.15625 9.8125 62.3125 
+L 9.8125 71.09375 
+Q 15.4375 72.65625 20.34375 73.4375 
+Q 25.25 74.21875 29.59375 74.21875 
+Q 40.828125 74.21875 47.359375 69.109375 
+Q 53.90625 64.015625 53.90625 55.328125 
+Q 53.90625 49.265625 50.4375 45.09375 
+Q 46.96875 40.921875 40.578125 39.3125 
+z
+" id="DejaVuSans-33"/>
+      </defs>
+      <g transform="translate(326.732907 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-33"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-30"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="line2d_8">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="386.571521" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_8">
+      <!-- 3500 -->
+      <g transform="translate(373.846521 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-33"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-35"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_9">
+     <g id="line2d_9">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="433.685134" xlink:href="#mcf83c14d4c" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_9">
+      <!-- 4000 -->
+      <defs>
+       <path d="M 37.796875 64.3125 
+L 12.890625 25.390625 
+L 37.796875 25.390625 
+z
+M 35.203125 72.90625 
+L 47.609375 72.90625 
+L 47.609375 25.390625 
+L 58.015625 25.390625 
+L 58.015625 17.1875 
+L 47.609375 17.1875 
+L 47.609375 0 
+L 37.796875 0 
+L 37.796875 17.1875 
+L 4.890625 17.1875 
+L 4.890625 26.703125 
+z
+" id="DejaVuSans-34"/>
+      </defs>
+      <g transform="translate(420.960134 318.238437)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-34"/>
+       <use x="63.623047" xlink:href="#DejaVuSans-30"/>
+       <use x="127.246094" xlink:href="#DejaVuSans-30"/>
+       <use x="190.869141" xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_10">
+     <!-- Iteration -->
+     <defs>
+      <path d="M 9.8125 72.90625 
+L 19.671875 72.90625 
+L 19.671875 0 
+L 9.8125 0 
+z
+" id="DejaVuSans-49"/>
+      <path d="M 18.3125 70.21875 
+L 18.3125 54.6875 
+L 36.8125 54.6875 
+L 36.8125 47.703125 
+L 18.3125 47.703125 
+L 18.3125 18.015625 
+Q 18.3125 11.328125 20.140625 9.421875 
+Q 21.96875 7.515625 27.59375 7.515625 
+L 36.8125 7.515625 
+L 36.8125 0 
+L 27.59375 0 
+Q 17.1875 0 13.234375 3.875 
+Q 9.28125 7.765625 9.28125 18.015625 
+L 9.28125 47.703125 
+L 2.6875 47.703125 
+L 2.6875 54.6875 
+L 9.28125 54.6875 
+L 9.28125 70.21875 
+z
+" id="DejaVuSans-74"/>
+      <path d="M 56.203125 29.59375 
+L 56.203125 25.203125 
+L 14.890625 25.203125 
+Q 15.484375 15.921875 20.484375 11.0625 
+Q 25.484375 6.203125 34.421875 6.203125 
+Q 39.59375 6.203125 44.453125 7.46875 
+Q 49.3125 8.734375 54.109375 11.28125 
+L 54.109375 2.78125 
+Q 49.265625 0.734375 44.1875 -0.34375 
+Q 39.109375 -1.421875 33.890625 -1.421875 
+Q 20.796875 -1.421875 13.15625 6.1875 
+Q 5.515625 13.8125 5.515625 26.8125 
+Q 5.515625 40.234375 12.765625 48.109375 
+Q 20.015625 56 32.328125 56 
+Q 43.359375 56 49.78125 48.890625 
+Q 56.203125 41.796875 56.203125 29.59375 
+z
+M 47.21875 32.234375 
+Q 47.125 39.59375 43.09375 43.984375 
+Q 39.0625 48.390625 32.421875 48.390625 
+Q 24.90625 48.390625 20.390625 44.140625 
+Q 15.875 39.890625 15.1875 32.171875 
+z
+" id="DejaVuSans-65"/>
+      <path d="M 41.109375 46.296875 
+Q 39.59375 47.171875 37.8125 47.578125 
+Q 36.03125 48 33.890625 48 
+Q 26.265625 48 22.1875 43.046875 
+Q 18.109375 38.09375 18.109375 28.8125 
+L 18.109375 0 
+L 9.078125 0 
+L 9.078125 54.6875 
+L 18.109375 54.6875 
+L 18.109375 46.1875 
+Q 20.953125 51.171875 25.484375 53.578125 
+Q 30.03125 56 36.53125 56 
+Q 37.453125 56 38.578125 55.875 
+Q 39.703125 55.765625 41.0625 55.515625 
+z
+" id="DejaVuSans-72"/>
+      <path d="M 34.28125 27.484375 
+Q 23.390625 27.484375 19.1875 25 
+Q 14.984375 22.515625 14.984375 16.5 
+Q 14.984375 11.71875 18.140625 8.90625 
+Q 21.296875 6.109375 26.703125 6.109375 
+Q 34.1875 6.109375 38.703125 11.40625 
+Q 43.21875 16.703125 43.21875 25.484375 
+L 43.21875 27.484375 
+z
+M 52.203125 31.203125 
+L 52.203125 0 
+L 43.21875 0 
+L 43.21875 8.296875 
+Q 40.140625 3.328125 35.546875 0.953125 
+Q 30.953125 -1.421875 24.3125 -1.421875 
+Q 15.921875 -1.421875 10.953125 3.296875 
+Q 6 8.015625 6 15.921875 
+Q 6 25.140625 12.171875 29.828125 
+Q 18.359375 34.515625 30.609375 34.515625 
+L 43.21875 34.515625 
+L 43.21875 35.40625 
+Q 43.21875 41.609375 39.140625 45 
+Q 35.0625 48.390625 27.6875 48.390625 
+Q 23 48.390625 18.546875 47.265625 
+Q 14.109375 46.140625 10.015625 43.890625 
+L 10.015625 52.203125 
+Q 14.9375 54.109375 19.578125 55.046875 
+Q 24.21875 56 28.609375 56 
+Q 40.484375 56 46.34375 49.84375 
+Q 52.203125 43.703125 52.203125 31.203125 
+z
+" id="DejaVuSans-61"/>
+      <path d="M 9.421875 54.6875 
+L 18.40625 54.6875 
+L 18.40625 0 
+L 9.421875 0 
+z
+M 9.421875 75.984375 
+L 18.40625 75.984375 
+L 18.40625 64.59375 
+L 9.421875 64.59375 
+z
+" id="DejaVuSans-69"/>
+      <path d="M 30.609375 48.390625 
+Q 23.390625 48.390625 19.1875 42.75 
+Q 14.984375 37.109375 14.984375 27.296875 
+Q 14.984375 17.484375 19.15625 11.84375 
+Q 23.34375 6.203125 30.609375 6.203125 
+Q 37.796875 6.203125 41.984375 11.859375 
+Q 46.1875 17.53125 46.1875 27.296875 
+Q 46.1875 37.015625 41.984375 42.703125 
+Q 37.796875 48.390625 30.609375 48.390625 
+z
+M 30.609375 56 
+Q 42.328125 56 49.015625 48.375 
+Q 55.71875 40.765625 55.71875 27.296875 
+Q 55.71875 13.875 49.015625 6.21875 
+Q 42.328125 -1.421875 30.609375 -1.421875 
+Q 18.84375 -1.421875 12.171875 6.21875 
+Q 5.515625 13.875 5.515625 27.296875 
+Q 5.515625 40.765625 12.171875 48.375 
+Q 18.84375 56 30.609375 56 
+z
+" id="DejaVuSans-6f"/>
+      <path d="M 54.890625 33.015625 
+L 54.890625 0 
+L 45.90625 0 
+L 45.90625 32.71875 
+Q 45.90625 40.484375 42.875 44.328125 
+Q 39.84375 48.1875 33.796875 48.1875 
+Q 26.515625 48.1875 22.3125 43.546875 
+Q 18.109375 38.921875 18.109375 30.90625 
+L 18.109375 0 
+L 9.078125 0 
+L 9.078125 54.6875 
+L 18.109375 54.6875 
+L 18.109375 46.1875 
+Q 21.34375 51.125 25.703125 53.5625 
+Q 30.078125 56 35.796875 56 
+Q 45.21875 56 50.046875 50.171875 
+Q 54.890625 44.34375 54.890625 33.015625 
+z
+" id="DejaVuSans-6e"/>
+     </defs>
+     <g transform="translate(221.666406 331.916562)scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-49"/>
+      <use x="29.492188" xlink:href="#DejaVuSans-74"/>
+      <use x="68.701172" xlink:href="#DejaVuSans-65"/>
+      <use x="130.224609" xlink:href="#DejaVuSans-72"/>
+      <use x="171.337891" xlink:href="#DejaVuSans-61"/>
+      <use x="232.617188" xlink:href="#DejaVuSans-74"/>
+      <use x="271.826172" xlink:href="#DejaVuSans-69"/>
+      <use x="299.609375" xlink:href="#DejaVuSans-6f"/>
+      <use x="360.791016" xlink:href="#DejaVuSans-6e"/>
+     </g>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="line2d_10">
+      <defs>
+       <path d="M 0 0 
+L -3.5 0 
+" id="mdb5b43aecd" style="stroke:#000000;stroke-width:0.8;"/>
+      </defs>
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="303.64"/>
+      </g>
+     </g>
+     <g id="text_11">
+      <!-- 0 -->
+      <g transform="translate(24.9075 307.439219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-30"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="line2d_11">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="267.53"/>
+      </g>
+     </g>
+     <g id="text_12">
+      <!-- 1 -->
+      <g transform="translate(24.9075 271.329219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="line2d_12">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="231.42"/>
+      </g>
+     </g>
+     <g id="text_13">
+      <!-- 2 -->
+      <g transform="translate(24.9075 235.219219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-32"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="line2d_13">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="195.31"/>
+      </g>
+     </g>
+     <g id="text_14">
+      <!-- 3 -->
+      <g transform="translate(24.9075 199.109219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-33"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="line2d_14">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="159.2"/>
+      </g>
+     </g>
+     <g id="text_15">
+      <!-- 4 -->
+      <g transform="translate(24.9075 162.999219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-34"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="line2d_15">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="123.09"/>
+      </g>
+     </g>
+     <g id="text_16">
+      <!-- 5 -->
+      <g transform="translate(24.9075 126.889219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-35"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="line2d_16">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="86.98"/>
+      </g>
+     </g>
+     <g id="text_17">
+      <!-- 6 -->
+      <defs>
+       <path d="M 33.015625 40.375 
+Q 26.375 40.375 22.484375 35.828125 
+Q 18.609375 31.296875 18.609375 23.390625 
+Q 18.609375 15.53125 22.484375 10.953125 
+Q 26.375 6.390625 33.015625 6.390625 
+Q 39.65625 6.390625 43.53125 10.953125 
+Q 47.40625 15.53125 47.40625 23.390625 
+Q 47.40625 31.296875 43.53125 35.828125 
+Q 39.65625 40.375 33.015625 40.375 
+z
+M 52.59375 71.296875 
+L 52.59375 62.3125 
+Q 48.875 64.0625 45.09375 64.984375 
+Q 41.3125 65.921875 37.59375 65.921875 
+Q 27.828125 65.921875 22.671875 59.328125 
+Q 17.53125 52.734375 16.796875 39.40625 
+Q 19.671875 43.65625 24.015625 45.921875 
+Q 28.375 48.1875 33.59375 48.1875 
+Q 44.578125 48.1875 50.953125 41.515625 
+Q 57.328125 34.859375 57.328125 23.390625 
+Q 57.328125 12.15625 50.6875 5.359375 
+Q 44.046875 -1.421875 33.015625 -1.421875 
+Q 20.359375 -1.421875 13.671875 8.265625 
+Q 6.984375 17.96875 6.984375 36.375 
+Q 6.984375 53.65625 15.1875 63.9375 
+Q 23.390625 74.21875 37.203125 74.21875 
+Q 40.921875 74.21875 44.703125 73.484375 
+Q 48.484375 72.75 52.59375 71.296875 
+z
+" id="DejaVuSans-36"/>
+      </defs>
+      <g transform="translate(24.9075 90.779219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-36"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="line2d_17">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="50.87"/>
+      </g>
+     </g>
+     <g id="text_18">
+      <!-- 7 -->
+      <defs>
+       <path d="M 8.203125 72.90625 
+L 55.078125 72.90625 
+L 55.078125 68.703125 
+L 28.609375 0 
+L 18.3125 0 
+L 43.21875 64.59375 
+L 8.203125 64.59375 
+z
+" id="DejaVuSans-37"/>
+      </defs>
+      <g transform="translate(24.9075 54.669219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-37"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="line2d_18">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.8;" x="38.27" xlink:href="#mdb5b43aecd" y="14.76"/>
+      </g>
+     </g>
+     <g id="text_19">
+      <!-- 8 -->
+      <defs>
+       <path d="M 31.78125 34.625 
+Q 24.75 34.625 20.71875 30.859375 
+Q 16.703125 27.09375 16.703125 20.515625 
+Q 16.703125 13.921875 20.71875 10.15625 
+Q 24.75 6.390625 31.78125 6.390625 
+Q 38.8125 6.390625 42.859375 10.171875 
+Q 46.921875 13.96875 46.921875 20.515625 
+Q 46.921875 27.09375 42.890625 30.859375 
+Q 38.875 34.625 31.78125 34.625 
+z
+M 21.921875 38.8125 
+Q 15.578125 40.375 12.03125 44.71875 
+Q 8.5 49.078125 8.5 55.328125 
+Q 8.5 64.0625 14.71875 69.140625 
+Q 20.953125 74.21875 31.78125 74.21875 
+Q 42.671875 74.21875 48.875 69.140625 
+Q 55.078125 64.0625 55.078125 55.328125 
+Q 55.078125 49.078125 51.53125 44.71875 
+Q 48 40.375 41.703125 38.8125 
+Q 48.828125 37.15625 52.796875 32.3125 
+Q 56.78125 27.484375 56.78125 20.515625 
+Q 56.78125 9.90625 50.3125 4.234375 
+Q 43.84375 -1.421875 31.78125 -1.421875 
+Q 19.734375 -1.421875 13.25 4.234375 
+Q 6.78125 9.90625 6.78125 20.515625 
+Q 6.78125 27.484375 10.78125 32.3125 
+Q 14.796875 37.15625 21.921875 38.8125 
+z
+M 18.3125 54.390625 
+Q 18.3125 48.734375 21.84375 45.5625 
+Q 25.390625 42.390625 31.78125 42.390625 
+Q 38.140625 42.390625 41.71875 45.5625 
+Q 45.3125 48.734375 45.3125 54.390625 
+Q 45.3125 60.0625 41.71875 63.234375 
+Q 38.140625 66.40625 31.78125 66.40625 
+Q 25.390625 66.40625 21.84375 63.234375 
+Q 18.3125 60.0625 18.3125 54.390625 
+z
+" id="DejaVuSans-38"/>
+      </defs>
+      <g transform="translate(24.9075 18.559219)scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-38"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_10">
+     <g id="line2d_19">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 303.64 
+L 447.48 303.64 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_20">
+      <defs>
+       <path d="M 0 0 
+L -2 0 
+" id="m0a4be72e18" style="stroke:#000000;stroke-width:0.6;"/>
+      </defs>
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="303.64"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_11">
+     <g id="line2d_21">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 285.585 
+L 447.48 285.585 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_22">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="285.585"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_12">
+     <g id="line2d_23">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 267.53 
+L 447.48 267.53 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_24">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="267.53"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_13">
+     <g id="line2d_25">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 249.475 
+L 447.48 249.475 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_26">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="249.475"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_14">
+     <g id="line2d_27">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 231.42 
+L 447.48 231.42 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_28">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="231.42"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_15">
+     <g id="line2d_29">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 213.365 
+L 447.48 213.365 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_30">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="213.365"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_16">
+     <g id="line2d_31">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 195.31 
+L 447.48 195.31 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_32">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="195.31"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_17">
+     <g id="line2d_33">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 177.255 
+L 447.48 177.255 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_34">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="177.255"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_18">
+     <g id="line2d_35">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 159.2 
+L 447.48 159.2 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_36">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="159.2"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_19">
+     <g id="line2d_37">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 141.145 
+L 447.48 141.145 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_38">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="141.145"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_20">
+     <g id="line2d_39">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 123.09 
+L 447.48 123.09 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_40">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="123.09"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_21">
+     <g id="line2d_41">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 105.035 
+L 447.48 105.035 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_42">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="105.035"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_22">
+     <g id="line2d_43">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 86.98 
+L 447.48 86.98 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_44">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="86.98"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_23">
+     <g id="line2d_45">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 68.925 
+L 447.48 68.925 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_46">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="68.925"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_24">
+     <g id="line2d_47">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 50.87 
+L 447.48 50.87 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_48">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="50.87"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_25">
+     <g id="line2d_49">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 32.815 
+L 447.48 32.815 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_50">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="32.815"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_26">
+     <g id="line2d_51">
+      <path clip-path="url(#p27d2dc2848)" d="M 38.27 14.76 
+L 447.48 14.76 
+" style="fill:none;stroke:#b0b0b0;stroke-linecap:square;stroke-width:0.8;"/>
+     </g>
+     <g id="line2d_52">
+      <g>
+       <use style="stroke:#000000;stroke-width:0.6;" x="38.27" xlink:href="#m0a4be72e18" y="14.76"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_20">
+     <!-- Loss -->
+     <defs>
+      <path d="M 9.8125 72.90625 
+L 19.671875 72.90625 
+L 19.671875 8.296875 
+L 55.171875 8.296875 
+L 55.171875 0 
+L 9.8125 0 
+z
+" id="DejaVuSans-4c"/>
+      <path d="M 44.28125 53.078125 
+L 44.28125 44.578125 
+Q 40.484375 46.53125 36.375 47.5 
+Q 32.28125 48.484375 27.875 48.484375 
+Q 21.1875 48.484375 17.84375 46.4375 
+Q 14.5 44.390625 14.5 40.28125 
+Q 14.5 37.15625 16.890625 35.375 
+Q 19.28125 33.59375 26.515625 31.984375 
+L 29.59375 31.296875 
+Q 39.15625 29.25 43.1875 25.515625 
+Q 47.21875 21.78125 47.21875 15.09375 
+Q 47.21875 7.46875 41.1875 3.015625 
+Q 35.15625 -1.421875 24.609375 -1.421875 
+Q 20.21875 -1.421875 15.453125 -0.5625 
+Q 10.6875 0.296875 5.421875 2 
+L 5.421875 11.28125 
+Q 10.40625 8.6875 15.234375 7.390625 
+Q 20.0625 6.109375 24.8125 6.109375 
+Q 31.15625 6.109375 34.5625 8.28125 
+Q 37.984375 10.453125 37.984375 14.40625 
+Q 37.984375 18.0625 35.515625 20.015625 
+Q 33.0625 21.96875 24.703125 23.78125 
+L 21.578125 24.515625 
+Q 13.234375 26.265625 9.515625 29.90625 
+Q 5.8125 33.546875 5.8125 39.890625 
+Q 5.8125 47.609375 11.28125 51.796875 
+Q 16.75 56 26.8125 56 
+Q 31.78125 56 36.171875 55.265625 
+Q 40.578125 54.546875 44.28125 53.078125 
+z
+" id="DejaVuSans-73"/>
+     </defs>
+     <g transform="translate(18.827813 170.253906)rotate(-90)scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-4c"/>
+      <use x="55.697266" xlink:href="#DejaVuSans-6f"/>
+      <use x="116.878906" xlink:href="#DejaVuSans-73"/>
+      <use x="168.978516" xlink:href="#DejaVuSans-73"/>
+     </g>
+    </g>
+   </g>
+   <g id="line2d_53">
+    <path clip-path="url(#p27d2dc2848)" d="M 61.399493 -1 
+L 61.676043 47.318509 
+L 62.712543 141.788769 
+L 62.80677 144.627087 
+L 62.995224 156.869497 
+L 63.089452 158.397022 
+L 63.372133 170.228644 
+L 63.654815 175.684937 
+L 63.749042 175.630881 
+L 63.843269 176.588446 
+L 63.937497 174.370172 
+L 64.031724 174.53375 
+L 64.220178 178.170533 
+L 64.408633 180.585967 
+L 64.597087 176.832152 
+L 65.350905 184.953471 
+L 65.539359 183.44151 
+L 65.633587 184.318008 
+L 65.916268 183.168554 
+L 66.104723 184.261568 
+L 66.19895 183.928381 
+L 66.481632 177.042276 
+L 66.575859 176.225432 
+L 67.046995 167.052372 
+L 67.141222 166.331147 
+L 67.518131 176.101105 
+L 67.89504 180.684222 
+L 67.989267 180.435208 
+L 68.083495 184.500941 
+L 68.271949 180.333702 
+L 68.366176 182.284112 
+L 68.554631 181.707941 
+L 68.648858 181.708699 
+L 68.837312 184.221016 
+L 69.025767 184.721934 
+L 69.496903 200.077712 
+L 69.968039 206.13372 
+L 70.344948 199.720512 
+L 70.439175 198.379856 
+L 70.533402 193.909004 
+L 70.62763 194.639112 
+L 70.721857 193.685158 
+L 70.816084 191.321831 
+L 70.910311 191.873736 
+L 71.004539 190.353361 
+L 71.098766 190.659465 
+L 71.28722 188.528542 
+L 71.475675 188.951932 
+L 71.664129 191.044578 
+L 71.852584 189.972003 
+L 72.041038 194.683961 
+L 72.135265 194.522874 
+L 72.32372 195.666622 
+L 72.512174 200.864151 
+L 72.606401 200.344023 
+L 72.700629 200.632433 
+L 72.794856 200.176906 
+L 73.360219 189.028846 
+L 73.548674 186.922478 
+L 73.642901 186.828122 
+L 73.925583 183.321336 
+L 74.01981 181.07338 
+L 74.208264 182.428769 
+L 74.396719 182.139636 
+L 74.6794 175.058356 
+L 74.773628 174.563324 
+L 74.867855 176.904841 
+L 74.962082 175.297296 
+L 75.056309 175.434225 
+L 75.150537 175.661177 
+L 75.244764 174.324962 
+L 75.338991 175.166723 
+L 75.433218 174.941613 
+L 75.527445 174.986028 
+L 75.7159 177.241567 
+L 75.810127 176.900797 
+L 75.904354 177.154903 
+L 75.998582 175.143793 
+L 76.187036 177.834421 
+L 76.469718 179.390726 
+L 76.752399 181.042722 
+L 76.846627 178.516611 
+L 76.940854 178.856226 
+L 77.035081 179.117482 
+L 77.41199 173.743916 
+L 77.506217 172.005328 
+L 77.694672 173.455795 
+L 77.788899 173.795806 
+L 77.883126 173.067468 
+L 78.071581 178.051767 
+L 78.260035 179.624935 
+L 78.354262 177.997205 
+L 78.448489 179.585142 
+L 78.636944 174.482185 
+L 78.731171 173.761033 
+L 78.919626 175.686129 
+L 79.10808 175.376413 
+L 79.390762 176.114718 
+L 79.673443 180.635185 
+L 79.767671 180.690975 
+L 79.956125 184.89779 
+L 80.238807 189.070734 
+L 80.333034 190.55063 
+L 80.427261 189.434036 
+L 80.615716 191.631294 
+L 81.086852 182.758778 
+L 81.181079 181.940922 
+L 81.275306 180.098121 
+L 81.84067 189.88682 
+L 82.123351 193.981116 
+L 82.217579 195.881874 
+L 82.406033 192.637679 
+L 82.50026 192.690725 
+L 82.594487 192.518047 
+L 82.688715 191.526069 
+L 82.782942 191.894861 
+L 82.971396 194.845914 
+L 83.065624 193.730404 
+L 83.159851 194.064638 
+L 83.348305 189.248864 
+L 83.442532 190.791736 
+L 83.819441 184.904001 
+L 83.913669 183.295625 
+L 84.007896 184.03523 
+L 84.290578 180.266141 
+L 84.384805 180.034964 
+L 84.573259 179.306265 
+L 84.667486 180.80761 
+L 84.855941 176.737977 
+L 84.950168 175.235223 
+L 85.89244 199.373169 
+L 85.986668 199.602901 
+L 86.269349 189.582737 
+L 86.646258 180.805227 
+L 86.740485 180.136036 
+L 86.834713 178.487145 
+L 86.92894 178.903097 
+L 87.117394 183.475597 
+L 87.400076 188.775932 
+L 87.682758 194.911887 
+L 88.153894 201.473796 
+L 88.342348 204.341653 
+L 88.436575 204.866656 
+L 88.530803 204.569723 
+L 88.907712 208.46462 
+L 89.096166 210.808231 
+L 89.190393 210.518629 
+L 89.284621 209.429191 
+L 89.755757 213.842555 
+L 89.849984 213.795431 
+L 90.038438 213.981795 
+L 90.32112 214.143026 
+L 90.415347 214.994139 
+L 90.509574 213.20478 
+L 90.603802 213.498679 
+L 90.698029 213.576677 
+L 90.886483 210.119361 
+L 90.980711 210.192881 
+L 91.169165 207.59083 
+L 91.263392 207.459137 
+L 91.451847 204.985241 
+L 91.546074 205.801219 
+L 91.734528 204.674154 
+L 91.828756 203.957262 
+L 91.922983 204.488548 
+L 92.205665 210.308 
+L 92.299892 210.029447 
+L 92.394119 208.945208 
+L 92.582573 210.993476 
+L 92.959482 196.663042 
+L 93.05371 195.38742 
+L 93.430618 185.148538 
+L 93.524846 184.838858 
+L 94.184436 196.512391 
+L 94.372891 197.642381 
+L 94.844027 204.291857 
+L 94.938254 204.430303 
+L 95.032481 206.562273 
+L 95.126709 206.31817 
+L 95.40939 208.332133 
+L 95.974754 187.886217 
+L 96.163208 185.855608 
+L 96.351663 184.268212 
+L 96.634344 190.203902 
+L 96.917026 193.590695 
+L 97.011253 193.065691 
+L 97.10548 193.238189 
+L 97.199708 195.660881 
+L 97.293935 195.554898 
+L 97.388162 195.665359 
+L 97.670844 188.956518 
+L 97.765071 189.106483 
+L 97.859298 189.035707 
+L 98.14198 185.198803 
+L 98.518889 191.895077 
+L 98.613116 192.522236 
+L 99.084252 200.255337 
+L 99.178479 199.783415 
+L 99.272707 202.079036 
+L 99.366934 200.532734 
+L 99.555388 201.593501 
+L 99.649615 200.357817 
+L 99.932297 202.96077 
+L 100.026524 203.228129 
+L 100.120752 201.118005 
+L 100.214979 202.259803 
+L 100.309206 201.731044 
+L 100.403433 202.267386 
+L 100.686115 205.261085 
+L 100.968797 208.007251 
+L 101.063024 207.07276 
+L 101.722614 192.688378 
+L 102.193751 184.739447 
+L 102.382205 187.02243 
+L 102.476432 185.904212 
+L 102.570659 187.658941 
+L 102.664887 187.55953 
+L 102.759114 186.873223 
+L 103.041796 189.273383 
+L 103.418705 199.482258 
+L 103.512932 199.095159 
+L 103.607159 200.339329 
+L 103.701386 199.216885 
+L 103.889841 201.618525 
+L 103.984068 201.278802 
+L 104.078295 200.984217 
+L 104.26675 202.982581 
+L 104.455204 203.236037 
+L 104.643658 200.344998 
+L 104.737886 200.322357 
+L 104.92634 200.726067 
+L 105.020567 202.035451 
+L 105.114795 201.355608 
+L 105.209022 201.419343 
+L 105.397476 203.914255 
+L 105.491704 203.75306 
+L 105.680158 202.967992 
+L 105.774385 203.86482 
+L 105.868612 202.149703 
+L 105.96284 202.386802 
+L 106.151294 200.955221 
+L 106.245521 201.347484 
+L 106.339749 203.979108 
+L 106.433976 203.581682 
+L 106.528203 204.202087 
+L 106.716657 202.014652 
+L 106.905112 206.219734 
+L 106.999339 206.538657 
+L 107.187794 209.619274 
+L 107.282021 209.615554 
+L 107.65893 211.728025 
+L 108.224293 200.550247 
+L 108.31852 200.783012 
+L 108.601202 202.886167 
+L 108.883884 199.271267 
+L 108.978111 200.526487 
+L 109.166565 195.447976 
+L 109.260793 195.87541 
+L 109.35502 197.558172 
+L 109.449247 197.306847 
+L 109.543474 197.331691 
+L 109.731929 190.847562 
+L 110.108838 188.452169 
+L 110.203065 188.296644 
+L 110.485747 185.688924 
+L 110.579974 186.296258 
+L 110.768428 189.567138 
+L 111.05111 190.57504 
+L 111.333792 193.329114 
+L 111.616473 196.191987 
+L 111.804928 200.346189 
+L 112.087609 207.172099 
+L 112.370291 211.954616 
+L 112.7472 206.199079 
+L 112.841427 206.533999 
+L 113.029882 203.089177 
+L 113.218336 200.61752 
+L 113.312563 200.676343 
+L 113.406791 200.496371 
+L 113.501018 201.909825 
+L 113.595245 200.246273 
+L 113.877927 202.197188 
+L 114.066381 199.666166 
+L 114.537517 203.828602 
+L 114.820199 200.85256 
+L 114.914426 201.172314 
+L 115.008653 201.915458 
+L 115.102881 201.526553 
+L 115.197108 198.925694 
+L 115.291335 199.74308 
+L 115.574017 205.484173 
+L 115.668244 205.427878 
+L 115.762471 206.05045 
+L 115.856698 207.539301 
+L 115.950926 205.852134 
+L 116.045153 206.237319 
+L 116.13938 205.174638 
+L 116.798971 212.128088 
+L 116.987425 214.32809 
+L 117.081652 214.550852 
+L 117.647016 211.433043 
+L 117.741243 208.439957 
+L 117.929697 209.715723 
+L 118.118152 207.51388 
+L 118.306606 204.598611 
+L 118.495061 202.588404 
+L 118.589288 203.021724 
+L 118.777742 200.983964 
+L 118.87197 201.57689 
+L 118.966197 199.07129 
+L 119.060424 199.381727 
+L 119.154651 198.864632 
+L 119.625788 187.426898 
+L 119.720015 187.469183 
+L 119.908469 186.163698 
+L 120.002696 188.073845 
+L 120.096924 187.823097 
+L 120.285378 190.086869 
+L 120.473833 195.188309 
+L 120.56806 194.875308 
+L 120.850741 193.519738 
+L 121.039196 194.904882 
+L 121.133423 194.87863 
+L 121.604559 198.824622 
+L 121.698786 199.020411 
+L 121.887241 200.81172 
+L 121.981468 200.369733 
+L 122.26415 201.551722 
+L 122.358377 200.192469 
+L 122.546832 200.957893 
+L 122.735286 198.603124 
+L 122.829513 198.580844 
+L 123.017968 203.685353 
+L 123.112195 204.741751 
+L 123.206422 203.70655 
+L 123.300649 205.288637 
+L 123.489104 204.822168 
+L 123.583331 205.78663 
+L 123.771785 202.945243 
+L 123.96024 199.786304 
+L 124.242922 196.610321 
+L 124.619831 189.806547 
+L 124.902512 194.93785 
+L 124.996739 195.806224 
+L 125.279421 200.353809 
+L 125.467876 199.339443 
+L 125.562103 201.898089 
+L 125.844784 194.373632 
+L 125.939012 194.184451 
+L 126.033239 190.755771 
+L 126.127466 191.110624 
+L 126.315921 187.64558 
+L 126.410148 187.818006 
+L 126.598602 188.253528 
+L 126.787057 183.774552 
+L 126.881284 183.937083 
+L 126.975511 182.85743 
+L 127.069738 183.318194 
+L 127.258193 184.87681 
+L 127.446647 184.767252 
+L 127.635102 188.049543 
+L 128.200465 199.549097 
+L 128.294692 200.052362 
+L 128.38892 198.729978 
+L 128.954283 205.036806 
+L 129.04851 205.25928 
+L 129.142737 204.938659 
+L 129.519646 207.00621 
+L 129.708101 206.108912 
+L 129.896555 208.151438 
+L 129.990782 208.087812 
+L 130.08501 207.424111 
+L 130.179237 207.53746 
+L 130.273464 206.909615 
+L 130.650373 210.46497 
+L 130.7446 210.475514 
+L 131.121509 214.149779 
+L 131.215736 213.543564 
+L 131.498418 205.931287 
+L 131.875327 198.808228 
+L 131.969554 198.131635 
+L 132.158009 193.676492 
+L 132.346463 198.641906 
+L 132.723372 204.063678 
+L 132.817599 204.729907 
+L 132.911826 204.229567 
+L 133.006054 204.833363 
+L 133.100281 206.635613 
+L 133.194508 206.384215 
+L 133.382963 204.585142 
+L 133.47719 204.087402 
+L 133.854099 206.858484 
+L 134.042553 209.237085 
+L 134.231008 207.308631 
+L 134.325235 207.378756 
+L 134.513689 205.495439 
+L 134.702144 207.258005 
+L 134.796371 207.375795 
+L 134.890598 208.910507 
+L 135.079053 207.660089 
+L 135.267507 208.517991 
+L 135.455962 206.651176 
+L 135.644416 207.463001 
+L 135.738643 207.685078 
+L 135.83287 205.142536 
+L 135.927098 206.530424 
+L 136.115552 200.80717 
+L 136.304007 198.960757 
+L 136.680916 192.55571 
+L 136.775143 192.265494 
+L 136.86937 193.909618 
+L 137.152052 202.601403 
+L 137.340506 205.066272 
+L 137.623188 210.073321 
+L 137.811642 213.499257 
+L 138.094324 211.45655 
+L 138.188551 211.367539 
+L 138.377006 212.185575 
+L 138.471233 211.999428 
+L 138.659687 213.264975 
+L 138.753915 213.436064 
+L 138.942369 208.789971 
+L 139.413505 194.852306 
+L 139.790414 191.221048 
+L 139.978868 193.992563 
+L 140.073096 194.114615 
+L 140.167323 195.249913 
+L 140.450005 192.181032 
+L 140.544232 192.397079 
+L 141.29805 205.622438 
+L 141.392277 205.704588 
+L 141.580731 209.033642 
+L 141.95764 197.538131 
+L 142.523004 191.297854 
+L 142.617231 187.813781 
+L 142.711458 189.613395 
+L 142.805685 189.539586 
+L 142.99414 187.209299 
+L 143.088367 187.451272 
+L 143.182594 185.821592 
+L 143.371049 188.314121 
+L 143.465276 186.301061 
+L 143.842185 195.074996 
+L 144.313321 202.810047 
+L 144.407548 202.922096 
+L 144.501775 201.869562 
+L 145.067139 206.585997 
+L 145.161366 207.338385 
+L 145.34982 209.415722 
+L 145.444048 208.493292 
+L 145.538275 208.859844 
+L 145.726729 210.332302 
+L 145.820957 210.7571 
+L 145.915184 211.724053 
+L 146.103638 210.691307 
+L 146.197865 211.225663 
+L 146.292093 212.89102 
+L 146.951683 202.160284 
+L 147.140138 200.780159 
+L 147.234365 200.88134 
+L 147.611274 205.093282 
+L 147.705501 205.080788 
+L 147.799728 205.499736 
+L 147.893956 206.593003 
+L 147.988183 205.999679 
+L 148.176637 207.111904 
+L 148.270864 206.986602 
+L 148.459319 209.948886 
+L 148.553546 208.879524 
+L 148.647773 210.123694 
+L 148.742001 209.23333 
+L 149.118909 210.098742 
+L 149.401591 215.563521 
+L 149.7785 213.310004 
+L 149.966954 215.222968 
+L 150.061182 214.944343 
+L 150.155409 214.314187 
+L 150.438091 205.258811 
+L 150.626545 202.501631 
+L 150.720772 202.643544 
+L 150.815 203.194943 
+L 150.909227 202.499537 
+L 151.097681 199.579935 
+L 151.191908 201.256884 
+L 151.286136 200.434153 
+L 151.568817 202.442194 
+L 151.663045 201.597582 
+L 151.757272 201.806261 
+L 151.945726 199.526348 
+L 152.322635 209.414819 
+L 152.605317 213.504385 
+L 152.887999 219.596142 
+L 153.736044 196.787332 
+L 153.830271 197.374734 
+L 154.395634 207.027478 
+L 154.489861 209.680805 
+L 154.584089 209.640181 
+L 154.678316 209.555034 
+L 154.772543 209.783285 
+L 155.149452 204.011066 
+L 155.243679 203.324253 
+L 155.526361 198.930064 
+L 155.620588 198.969748 
+L 155.714815 198.683468 
+L 155.90327 194.50778 
+L 155.997497 195.8666 
+L 156.091724 195.471592 
+L 156.280179 197.613637 
+L 156.374406 196.965066 
+L 156.56286 197.840625 
+L 156.751315 195.288262 
+L 156.939769 197.99709 
+L 157.033996 195.10457 
+L 157.128224 195.392331 
+L 157.505133 198.582541 
+L 157.59936 196.941305 
+L 157.787814 200.090531 
+L 157.882042 199.192078 
+L 158.070496 201.959837 
+L 158.164723 201.667129 
+L 158.353178 204.419289 
+L 158.447405 204.083213 
+L 158.541632 204.34288 
+L 158.635859 202.990344 
+L 158.824314 197.414057 
+L 159.106995 194.596105 
+L 159.201223 195.133242 
+L 159.483904 191.646352 
+L 159.860813 196.913898 
+L 159.955041 196.803221 
+L 160.143495 198.742328 
+L 160.426177 202.918521 
+L 160.520404 202.703522 
+L 160.803086 204.304062 
+L 160.897313 204.154314 
+L 160.99154 204.590631 
+L 161.556903 209.728001 
+L 161.651131 210.025294 
+L 162.310721 218.59965 
+L 162.876085 204.114882 
+L 162.970312 203.716589 
+L 163.252993 196.342349 
+L 163.535675 203.40207 
+L 163.72413 205.704841 
+L 164.101039 211.174026 
+L 164.195266 211.649233 
+L 164.854856 195.940119 
+L 164.949084 195.542873 
+L 165.043311 193.952589 
+L 165.137538 194.067635 
+L 165.231765 192.820649 
+L 165.325992 189.73783 
+L 165.702901 191.986436 
+L 165.797129 191.784364 
+L 166.07981 187.270686 
+L 166.174037 188.04214 
+L 166.456719 192.808733 
+L 166.645174 195.837964 
+L 166.739401 196.502099 
+L 167.022083 205.002466 
+L 167.304764 198.931977 
+L 167.398991 199.027705 
+L 167.493219 198.202989 
+L 167.587446 199.615251 
+L 167.681673 199.384725 
+L 167.7759 197.209025 
+L 167.870128 197.407485 
+L 167.964355 196.362498 
+L 168.247036 199.375119 
+L 168.341264 199.732644 
+L 168.435491 200.605568 
+L 168.529718 202.911841 
+L 168.623945 202.243445 
+L 168.718173 202.541714 
+L 168.906627 205.284485 
+L 169.000854 205.279213 
+L 169.095082 205.865747 
+L 169.47199 212.222624 
+L 169.566218 211.879001 
+L 169.754672 214.19163 
+L 169.848899 216.13507 
+L 170.225808 206.260791 
+L 170.320035 204.340858 
+L 170.414263 205.330344 
+L 170.791172 200.24472 
+L 171.073853 204.511334 
+L 171.168081 204.379749 
+L 171.356535 206.482326 
+L 171.450762 205.280332 
+L 172.393034 213.395116 
+L 172.581489 211.518768 
+L 172.675716 212.344676 
+L 172.769943 211.202661 
+L 172.864171 211.595357 
+L 173.052625 206.612683 
+L 173.146852 206.958725 
+L 173.241079 205.532633 
+L 173.429534 205.952303 
+L 173.523761 205.670573 
+L 173.617988 205.971044 
+L 174.371806 218.961689 
+L 174.466033 219.46286 
+L 174.748715 222.55875 
+L 174.842942 221.866775 
+L 174.93717 222.151791 
+L 175.031397 222.10037 
+L 175.219851 221.661092 
+L 175.314078 221.741003 
+L 175.502533 224.400072 
+L 175.879442 214.97417 
+L 175.973669 215.388641 
+L 176.162124 212.31781 
+L 176.256351 212.313188 
+L 177.104396 202.171478 
+L 177.29285 198.35754 
+L 177.387077 198.955341 
+L 177.481305 197.434424 
+L 177.575532 197.779093 
+L 177.669759 196.387811 
+L 177.858214 199.210494 
+L 177.952441 198.092095 
+L 178.235123 199.960968 
+L 178.32935 198.109536 
+L 178.423577 198.200353 
+L 178.517804 197.342451 
+L 178.706259 201.510593 
+L 178.894713 199.992853 
+L 179.177395 203.939243 
+L 179.648531 198.282828 
+L 179.742758 199.801831 
+L 179.836985 199.452178 
+L 179.931213 199.674868 
+L 180.119667 197.800904 
+L 180.308121 200.423212 
+L 180.496576 199.284122 
+L 180.590803 199.478936 
+L 180.68503 200.574116 
+L 180.967712 198.644975 
+L 181.061939 198.35559 
+L 181.344621 199.904059 
+L 181.438848 200.353159 
+L 181.909984 207.34763 
+L 182.098439 208.942933 
+L 182.192666 208.611949 
+L 182.475348 203.727602 
+L 182.663802 202.154759 
+L 182.758029 202.112799 
+L 182.852257 202.385502 
+L 182.946484 202.120599 
+L 183.040711 202.223476 
+L 183.41762 207.472137 
+L 183.982983 214.161262 
+L 184.265665 209.263807 
+L 184.359892 209.47487 
+L 184.454119 209.032631 
+L 184.548347 209.192381 
+L 184.642574 207.739748 
+L 184.831028 208.513224 
+L 184.925256 207.756575 
+L 185.207937 203.404165 
+L 185.679073 198.221152 
+L 185.773301 199.536134 
+L 185.867528 198.994123 
+L 186.055982 195.185204 
+L 186.527118 198.034283 
+L 186.715573 197.161974 
+L 186.904027 195.039536 
+L 186.998255 194.097462 
+L 187.092482 194.128084 
+L 187.186709 194.231683 
+L 187.563618 196.389689 
+L 187.657845 194.902499 
+L 187.752072 195.547495 
+L 187.8463 197.052416 
+L 187.940527 196.65777 
+L 188.034754 196.959433 
+L 188.128981 198.080937 
+L 188.223209 196.681169 
+L 188.317436 197.04263 
+L 188.411663 196.120633 
+L 188.50589 198.418782 
+L 188.600117 196.360584 
+L 188.694345 196.462703 
+L 188.977026 198.222091 
+L 189.071254 198.571672 
+L 189.165481 197.936316 
+L 189.259708 196.158152 
+L 189.353935 197.025442 
+L 189.448162 194.664606 
+L 189.54239 195.336649 
+L 189.636617 192.698453 
+L 189.825071 196.032489 
+L 190.013526 200.368542 
+L 190.296208 201.347123 
+L 190.484662 202.451655 
+L 190.578889 201.716311 
+L 190.673116 199.212841 
+L 190.767344 200.04579 
+L 190.955798 196.173101 
+L 191.144253 194.962911 
+L 191.426934 195.864902 
+L 191.521161 194.487667 
+L 192.180752 204.701922 
+L 192.274979 204.936348 
+L 192.463434 206.942656 
+L 193.217252 218.274985 
+L 193.405706 219.32705 
+L 193.59416 222.13565 
+L 193.688388 222.153632 
+L 193.971069 224.04103 
+L 194.347978 227.243156 
+L 194.442205 227.07492 
+L 194.63066 228.53398 
+L 194.724887 227.796361 
+L 194.913342 230.187313 
+L 195.007569 230.005355 
+L 195.290251 231.940815 
+L 195.761387 226.892239 
+L 196.138296 222.6604 
+L 196.232523 222.261276 
+L 196.515204 215.683768 
+L 196.703659 216.516247 
+L 197.269022 209.932311 
+L 197.36325 210.343026 
+L 197.834386 208.164077 
+L 197.928613 208.199067 
+L 198.02284 208.351813 
+L 198.117067 209.077515 
+L 198.305522 208.313247 
+L 198.588203 203.043968 
+L 198.776658 204.055589 
+L 198.965112 202.344589 
+L 199.05934 202.615234 
+L 199.530476 210.827298 
+L 199.71893 212.516776 
+L 199.813157 212.799734 
+L 200.001612 214.895595 
+L 200.378521 209.578397 
+L 200.472748 210.295939 
+L 200.566975 209.898512 
+L 200.75543 211.498582 
+L 200.849657 213.730325 
+L 201.038111 211.79602 
+L 201.132339 211.643889 
+L 201.886156 198.725248 
+L 202.074611 200.921783 
+L 202.357293 203.366213 
+L 202.45152 202.565546 
+L 202.639974 205.53173 
+L 202.734201 205.626085 
+L 202.828429 206.741523 
+L 202.922656 206.288812 
+L 203.205338 201.848077 
+L 203.299565 201.663735 
+L 203.393792 200.833602 
+L 203.582246 202.068636 
+L 203.676474 202.085283 
+L 203.770701 201.239804 
+L 203.959155 202.827633 
+L 204.053383 201.383233 
+L 204.14761 201.969803 
+L 204.336064 200.782362 
+L 204.524519 202.326534 
+L 204.618746 202.8788 
+L 204.901428 206.023259 
+L 204.995655 206.41823 
+L 205.089882 206.210092 
+L 205.278337 207.928603 
+L 205.372564 208.188487 
+L 205.466791 207.849884 
+L 205.655245 208.850492 
+L 205.749473 206.96118 
+L 205.937927 209.345921 
+L 206.032154 208.901298 
+L 206.126382 207.437868 
+L 206.220609 207.932503 
+L 206.314836 209.726159 
+L 206.409063 208.436382 
+L 206.503291 210.58518 
+L 206.597518 210.321685 
+L 206.880199 204.684012 
+L 206.974427 204.969894 
+L 207.257108 198.95126 
+L 207.351336 199.056665 
+L 207.445563 200.175281 
+L 207.53979 199.962593 
+L 207.916699 204.721783 
+L 208.010926 204.843582 
+L 208.199381 207.346727 
+L 208.387835 207.604949 
+L 208.482062 208.483181 
+L 208.764744 213.562883 
+L 208.858971 213.767482 
+L 208.953198 214.323396 
+L 209.330107 218.756548 
+L 209.612789 221.374306 
+L 209.801243 222.580019 
+L 209.895471 222.921873 
+L 210.178152 225.403532 
+L 210.27238 224.03045 
+L 210.366607 225.539306 
+L 210.743516 217.960178 
+L 210.93197 212.973893 
+L 211.403106 209.080837 
+L 211.874242 201.998944 
+L 212.062697 201.865518 
+L 212.156924 201.861798 
+L 212.251151 200.116819 
+L 212.439606 202.363402 
+L 212.533833 202.092289 
+L 212.62806 202.265978 
+L 212.816515 205.149253 
+L 212.910742 204.771181 
+L 213.004969 204.912227 
+L 213.381878 210.873085 
+L 213.758787 218.638468 
+L 214.041469 221.510694 
+L 214.701059 216.628225 
+L 214.795286 218.191679 
+L 215.172195 216.082783 
+L 215.266423 216.491657 
+L 215.549104 215.791339 
+L 215.643331 215.865148 
+L 215.737559 214.685795 
+L 215.831786 214.821172 
+L 215.926013 214.652105 
+L 216.02024 213.765821 
+L 216.208695 215.546369 
+L 216.585604 220.164694 
+L 216.679831 220.32838 
+L 216.774058 221.778702 
+L 216.868285 221.435007 
+L 216.962513 223.001459 
+L 217.245194 216.458724 
+L 217.433649 215.978425 
+L 217.527876 215.835935 
+L 217.71633 214.000103 
+L 217.810558 214.00115 
+L 218.093239 212.609037 
+L 218.375921 215.073184 
+L 218.470148 215.023135 
+L 218.564376 217.726907 
+L 218.658603 217.690436 
+L 218.847057 219.504314 
+L 219.318193 212.310407 
+L 219.412421 211.887198 
+L 219.506648 210.774685 
+L 219.695102 204.732471 
+L 219.789329 205.401517 
+L 219.883557 204.352955 
+L 219.977784 204.450597 
+L 220.072011 205.006402 
+L 220.166238 204.420372 
+L 220.260466 205.18168 
+L 220.354693 204.31977 
+L 220.44892 202.347947 
+L 220.543147 202.466424 
+L 220.637375 202.438547 
+L 220.920056 205.309834 
+L 221.108511 204.50635 
+L 221.296965 202.061198 
+L 221.391192 201.439817 
+L 221.673874 205.947537 
+L 221.768101 205.831262 
+L 222.239237 198.225413 
+L 222.333465 198.213208 
+L 222.521919 198.894351 
+L 222.616146 197.386614 
+L 223.087282 209.173062 
+L 223.464191 214.876781 
+L 223.558419 217.885972 
+L 223.652646 216.989505 
+L 223.746873 218.120543 
+L 223.8411 217.790569 
+L 223.935327 216.242173 
+L 224.029555 216.587854 
+L 224.218009 218.68032 
+L 224.312236 218.226453 
+L 224.406464 216.813144 
+L 224.594918 217.609333 
+L 224.689145 217.59702 
+L 224.783372 217.861562 
+L 224.8776 218.797533 
+L 225.160281 217.067972 
+L 225.254509 218.185468 
+L 225.348736 217.896191 
+L 225.53719 219.74784 
+L 226.291008 229.662382 
+L 226.385235 230.013335 
+L 226.667917 226.455019 
+L 226.856371 224.451962 
+L 226.950599 224.566611 
+L 227.327508 217.381732 
+L 227.515962 217.098702 
+L 227.610189 217.333092 
+L 227.798644 218.590045 
+L 227.987098 220.300431 
+L 228.081325 220.302056 
+L 228.26978 219.108115 
+L 228.364007 219.294515 
+L 228.740916 214.665357 
+L 228.92937 215.776245 
+L 229.212052 214.200549 
+L 229.494734 217.103468 
+L 229.588961 217.52953 
+L 229.683188 217.453663 
+L 229.777415 217.428711 
+L 229.96587 218.194604 
+L 230.154324 217.669204 
+L 230.813915 225.09147 
+L 230.908142 224.311241 
+L 231.002369 224.404441 
+L 231.096597 224.329729 
+L 231.285051 222.892768 
+L 231.473506 223.771108 
+L 231.567733 225.006358 
+L 231.66196 223.818809 
+L 231.850414 225.317699 
+L 231.944642 225.072331 
+L 232.133096 225.811792 
+L 232.227323 224.976532 
+L 232.321551 225.779871 
+L 232.415778 225.255481 
+L 232.510005 223.647756 
+L 232.604232 224.114297 
+L 233.169596 216.79202 
+L 233.263823 217.084547 
+L 233.452277 217.255528 
+L 233.546505 217.011243 
+L 233.640732 214.903142 
+L 233.734959 215.857782 
+L 233.829186 214.586746 
+L 233.923413 214.674276 
+L 234.206095 219.130323 
+L 234.677231 224.215947 
+L 234.865686 225.657566 
+L 235.05414 223.202953 
+L 235.148367 223.057249 
+L 235.525276 219.168419 
+L 235.619504 219.576787 
+L 235.713731 218.774459 
+L 235.807958 218.946703 
+L 235.902185 219.623513 
+L 235.996412 219.343913 
+L 236.09064 218.460157 
+L 236.184867 218.937423 
+L 236.279094 218.820246 
+L 236.467549 219.751992 
+L 236.561776 219.552521 
+L 237.221366 222.88764 
+L 237.409821 226.080992 
+L 237.504048 226.285808 
+L 237.598275 228.138504 
+L 237.692503 227.781737 
+L 237.880957 221.601185 
+L 238.634775 215.792531 
+L 238.729002 216.411095 
+L 238.823229 216.448325 
+L 238.917456 215.274713 
+L 239.200138 218.333339 
+L 239.294365 217.416434 
+L 239.388593 217.479735 
+L 239.48282 217.359235 
+L 239.577047 218.523819 
+L 240.048183 213.216191 
+L 240.14241 213.744444 
+L 240.425092 209.854566 
+L 240.519319 208.320072 
+L 240.707774 210.05801 
+L 240.896228 209.880349 
+L 241.367364 214.401249 
+L 241.461592 214.296349 
+L 241.650046 215.989511 
+L 241.744273 217.918146 
+L 241.838501 217.669637 
+L 242.026955 220.141655 
+L 242.215409 218.767778 
+L 242.403864 217.820505 
+L 242.686546 212.517318 
+L 243.063454 205.974547 
+L 243.157682 206.097176 
+L 243.346136 203.687267 
+L 243.534591 207.752062 
+L 243.628818 208.563923 
+L 243.723045 208.330183 
+L 244.099954 215.3452 
+L 244.194181 217.064722 
+L 244.288408 217.043742 
+L 244.476863 215.86717 
+L 244.665317 214.669763 
+L 244.759545 214.456353 
+L 244.947999 212.524612 
+L 245.042226 212.705776 
+L 245.513362 209.01407 
+L 245.60759 209.826834 
+L 245.796044 208.406266 
+L 245.984498 211.209594 
+L 246.738316 201.197194 
+L 246.832544 201.953084 
+L 246.926771 201.846741 
+L 247.115225 199.753444 
+L 247.209452 200.040121 
+L 247.30368 199.742178 
+L 247.492134 203.249253 
+L 247.586361 202.292555 
+L 247.680589 202.521925 
+L 247.96327 204.804836 
+L 248.057497 204.832965 
+L 248.245952 203.404418 
+L 248.340179 203.990375 
+L 248.434406 203.212132 
+L 248.622861 205.811763 
+L 248.717088 205.080572 
+L 248.811315 205.560871 
+L 248.905543 204.637646 
+L 248.99977 205.46944 
+L 249.942042 226.493765 
+L 250.224724 222.065741 
+L 250.318951 220.191993 
+L 250.413178 220.499975 
+L 250.69586 218.846245 
+L 250.884314 214.200874 
+L 250.978541 214.620761 
+L 251.166996 212.416896 
+L 251.261223 212.405918 
+L 251.449678 212.173587 
+L 251.543905 210.290053 
+L 251.638132 210.666752 
+L 251.826587 208.963877 
+L 252.203495 216.796028 
+L 252.674632 225.454303 
+L 252.768859 225.854438 
+L 253.145768 218.015137 
+L 253.239995 218.652551 
+L 253.334222 217.310379 
+L 253.428449 217.796564 
+L 253.711131 212.809375 
+L 253.993813 216.259794 
+L 254.182267 217.649163 
+L 254.276494 215.332417 
+L 254.370722 215.844421 
+L 254.464949 214.732377 
+L 254.653403 216.967081 
+L 254.841858 215.330576 
+L 255.030312 211.565097 
+L 255.407221 207.90069 
+L 255.501448 209.69756 
+L 255.595676 208.040761 
+L 256.161039 222.852469 
+L 256.255266 222.96636 
+L 256.537948 227.28053 
+L 257.009084 221.903426 
+L 257.103311 221.562656 
+L 257.197538 222.153741 
+L 257.668675 218.196987 
+L 257.762902 218.529127 
+L 257.857129 217.589112 
+L 257.951356 217.708708 
+L 258.234038 219.635429 
+L 258.422492 218.881236 
+L 258.799401 213.471127 
+L 259.082083 210.856041 
+L 259.270537 211.635367 
+L 259.364765 210.743414 
+L 259.647446 217.044537 
+L 259.930128 222.617754 
+L 260.024355 222.687916 
+L 260.118582 222.928445 
+L 260.21281 223.980365 
+L 260.307037 223.54459 
+L 260.401264 221.884577 
+L 260.683946 213.900403 
+L 260.966628 211.32316 
+L 261.060855 210.657472 
+L 261.155082 211.106356 
+L 261.437764 213.590038 
+L 261.626218 212.742608 
+L 261.814673 214.413057 
+L 262.003127 212.16817 
+L 262.097354 212.065618 
+L 262.474263 209.759344 
+L 262.56849 209.741867 
+L 262.662718 209.127853 
+L 262.756945 210.583447 
+L 262.851172 210.206964 
+L 263.039627 210.543581 
+L 263.133854 209.457537 
+L 263.228081 210.48288 
+L 263.322308 210.277451 
+L 263.416535 211.701665 
+L 263.60499 210.761288 
+L 263.699217 211.86203 
+L 263.793444 211.668408 
+L 263.887672 211.566614 
+L 263.981899 211.259787 
+L 264.170353 213.57451 
+L 264.547262 219.578303 
+L 264.641489 219.845842 
+L 264.924171 223.864235 
+L 265.018398 224.614096 
+L 265.206853 227.974095 
+L 265.30108 227.583854 
+L 265.583762 224.843286 
+L 265.677989 225.137113 
+L 265.772216 226.263384 
+L 265.866443 226.025383 
+L 266.054898 224.395233 
+L 266.526034 216.058698 
+L 266.620261 215.960515 
+L 266.714488 216.457821 
+L 266.902943 213.031488 
+L 267.185624 217.118959 
+L 267.468306 222.094556 
+L 267.750988 224.220027 
+L 267.845215 224.111336 
+L 267.939442 224.821836 
+L 268.03367 223.699754 
+L 268.127897 224.356848 
+L 268.410578 221.522935 
+L 268.599033 220.551215 
+L 268.69326 221.704135 
+L 268.787487 221.36972 
+L 268.881715 222.154102 
+L 268.975942 221.634118 
+L 269.070169 222.468692 
+L 269.258623 225.374608 
+L 269.352851 225.233418 
+L 269.447078 225.447695 
+L 269.635532 222.652925 
+L 269.72976 223.160957 
+L 269.918214 219.894735 
+L 270.012441 219.762898 
+L 270.200896 221.249582 
+L 270.577805 219.208934 
+L 270.672032 219.705194 
+L 270.954714 222.884427 
+L 271.048941 222.803937 
+L 271.42585 226.447509 
+L 271.520077 228.25077 
+L 271.991213 225.947205 
+L 272.179667 226.911991 
+L 272.368122 227.665354 
+L 272.462349 227.383769 
+L 272.556576 227.485093 
+L 273.12194 216.426767 
+L 273.216167 216.021143 
+L 273.310394 216.40929 
+L 273.404621 215.836332 
+L 273.593076 216.331761 
+L 273.78153 218.819343 
+L 274.158439 219.507672 
+L 274.346894 216.749085 
+L 274.441121 216.475624 
+L 274.629575 220.901915 
+L 274.912257 225.108116 
+L 275.006484 225.030588 
+L 275.100712 225.174126 
+L 275.383393 227.230915 
+L 275.47762 227.079867 
+L 275.571848 227.921194 
+L 275.666075 227.825177 
+L 275.760302 228.05314 
+L 275.854529 227.384346 
+L 275.948757 229.064545 
+L 276.042984 228.604287 
+L 276.137211 228.712544 
+L 276.231438 228.252575 
+L 276.985256 213.162567 
+L 277.079483 212.933269 
+L 277.267938 210.07296 
+L 277.550619 213.924019 
+L 277.644847 213.963487 
+L 277.927528 219.477773 
+L 278.021756 217.792664 
+L 278.21021 218.463949 
+L 278.492892 213.381863 
+L 278.681346 211.882612 
+L 278.964028 209.14356 
+L 279.152482 209.725509 
+L 279.435164 216.47566 
+L 279.9063 222.180209 
+L 280.000527 221.940836 
+L 280.094755 222.259399 
+L 280.565891 214.606462 
+L 280.660118 214.444292 
+L 280.9428 213.064854 
+L 281.037027 211.171173 
+L 281.225481 213.561908 
+L 281.319708 213.550317 
+L 281.413936 213.287688 
+L 281.696617 216.518306 
+L 281.790845 216.564779 
+L 281.885072 217.110365 
+L 281.979299 216.639383 
+L 282.921571 229.595976 
+L 283.204253 226.476505 
+L 283.29848 223.897637 
+L 283.392707 223.995134 
+L 283.863844 219.579025 
+L 284.052298 214.184372 
+L 284.146525 214.020144 
+L 284.523434 209.176529 
+L 284.806116 207.50153 
+L 284.99457 211.183089 
+L 285.371479 219.541327 
+L 285.748388 223.763199 
+L 286.03107 220.809618 
+L 286.407979 214.447072 
+L 286.502206 213.931205 
+L 286.69066 210.184323 
+L 287.256024 225.493988 
+L 287.350251 226.005486 
+L 287.538705 229.651152 
+L 287.632933 230.067753 
+L 287.72716 229.291821 
+L 287.821387 230.073747 
+L 288.009842 228.335484 
+L 288.104069 227.912022 
+L 288.38675 224.572966 
+L 288.575205 225.173728 
+L 289.517477 212.224249 
+L 289.988613 221.751475 
+L 290.271295 226.466033 
+L 290.365522 226.293174 
+L 290.459749 228.206029 
+L 290.648204 223.044502 
+L 291.025113 216.458941 
+L 291.213567 210.484722 
+L 291.307795 211.356742 
+L 291.402022 211.193056 
+L 292.061612 220.776325 
+L 292.344294 223.34046 
+L 293.098112 212.524468 
+L 293.286566 211.045041 
+L 293.569248 214.561685 
+L 293.663475 214.258867 
+L 293.85193 215.720744 
+L 293.946157 215.776101 
+L 294.040384 215.158692 
+L 294.228839 217.767892 
+L 294.417293 213.82338 
+L 294.51152 213.054779 
+L 294.699975 209.214011 
+L 294.888429 207.838509 
+L 295.076884 205.693936 
+L 295.171111 204.618075 
+L 295.359565 207.943914 
+L 295.54802 208.441257 
+L 295.924929 213.172967 
+L 296.019156 212.085948 
+L 296.113383 212.785832 
+L 296.584519 205.965303 
+L 296.678746 206.847506 
+L 296.867201 204.425753 
+L 296.961428 204.835962 
+L 297.055655 205.340708 
+L 297.526791 218.501864 
+L 297.997928 227.464763 
+L 298.280609 220.317511 
+L 298.374837 220.872125 
+L 298.657518 217.37386 
+L 298.751745 216.749229 
+L 299.034427 217.287449 
+L 299.128654 218.029906 
+L 299.222882 217.069814 
+L 299.411336 218.142822 
+L 299.505563 217.838054 
+L 299.694018 219.391362 
+L 299.976699 217.080286 
+L 300.165154 215.055742 
+L 300.259381 215.414929 
+L 300.353608 214.853707 
+L 300.447835 214.945932 
+L 300.542063 212.962085 
+L 300.63629 213.079189 
+L 300.730517 212.772507 
+L 300.918972 211.066815 
+L 301.107426 211.046666 
+L 301.201653 211.329299 
+L 301.295881 211.233716 
+L 301.390108 211.891856 
+L 301.484335 211.278203 
+L 301.578562 212.293219 
+L 301.672789 210.129977 
+L 302.143926 219.348499 
+L 302.238153 219.210378 
+L 302.520834 222.208917 
+L 302.615062 222.046025 
+L 302.709289 222.469848 
+L 303.086198 220.312167 
+L 303.274652 217.79613 
+L 303.36888 217.089458 
+L 303.463107 217.170525 
+L 303.557334 218.227753 
+L 303.651561 217.546321 
+L 304.02847 221.895301 
+L 304.122697 222.069027 
+L 304.216925 224.50656 
+L 304.311152 224.499771 
+L 304.688061 227.782929 
+L 304.782288 228.256872 
+L 304.876515 227.994136 
+L 305.06497 229.434672 
+L 305.347651 230.063167 
+L 305.536106 231.683567 
+L 305.630333 231.359335 
+L 305.72456 231.420506 
+L 306.007242 229.904138 
+L 306.101469 230.970575 
+L 306.384151 229.590234 
+L 306.478378 228.688892 
+L 306.666832 229.460527 
+L 306.855287 228.260989 
+L 306.949514 228.74916 
+L 307.043741 227.703667 
+L 307.137969 227.874792 
+L 307.42065 230.663495 
+L 307.514877 228.511773 
+L 307.609105 229.4372 
+L 307.703332 229.3401 
+L 307.891786 227.50907 
+L 307.986014 227.016277 
+L 308.080241 227.146634 
+L 308.551377 223.411524 
+L 308.739831 223.817545 
+L 308.834059 223.023269 
+L 308.928286 223.271417 
+L 309.022513 222.15273 
+L 309.210968 222.730453 
+L 309.682104 217.626522 
+L 309.870558 216.842899 
+L 310.341694 212.813853 
+L 310.624376 217.037893 
+L 310.718603 218.586073 
+L 310.81283 217.258814 
+L 310.907058 218.261769 
+L 311.001285 217.890269 
+L 311.095512 217.916774 
+L 311.189739 219.146139 
+L 311.566648 211.834333 
+L 311.660875 212.531509 
+L 312.037784 207.610366 
+L 312.132012 207.287831 
+L 312.791602 213.643805 
+L 312.980057 215.094019 
+L 313.074284 216.207254 
+L 313.262738 210.335696 
+L 313.356966 211.355334 
+L 313.451193 210.563045 
+L 313.54542 208.792391 
+L 313.639647 209.183895 
+L 313.733874 207.931673 
+L 313.828102 208.862047 
+L 314.016556 207.683525 
+L 314.487692 217.471465 
+L 314.676147 219.754231 
+L 314.958828 220.590467 
+L 315.335737 214.103666 
+L 315.429965 214.086622 
+L 315.712646 211.951727 
+L 315.806873 213.040588 
+L 315.901101 210.532351 
+L 316.183782 217.21642 
+L 316.27801 217.26358 
+L 316.654918 222.262468 
+L 316.749146 223.820037 
+L 316.843373 223.744386 
+L 317.031827 221.867497 
+L 317.126055 222.109867 
+L 317.314509 224.248771 
+L 317.597191 226.929974 
+L 317.691418 226.149637 
+L 317.785645 226.382186 
+L 318.162554 218.757017 
+L 318.351009 217.142287 
+L 318.727917 209.68438 
+L 318.822145 208.75787 
+L 318.916372 209.117525 
+L 319.293281 204.81018 
+L 319.387508 205.634102 
+L 319.67019 201.773509 
+L 320.141326 215.771009 
+L 320.800916 227.57981 
+L 320.989371 230.076925 
+L 321.083598 229.847373 
+L 321.177825 231.393748 
+L 321.272053 231.069914 
+L 321.36628 231.351716 
+L 321.554734 230.47251 
+L 322.214325 216.141534 
+L 322.402779 214.622639 
+L 322.497007 214.825686 
+L 322.873915 209.57735 
+L 323.250824 206.42845 
+L 323.345052 206.606905 
+L 323.439279 206.081107 
+L 324.098869 219.847106 
+L 324.193097 220.287504 
+L 324.287324 219.907049 
+L 324.381551 220.428874 
+L 324.475778 219.967244 
+L 324.75846 222.311108 
+L 324.946914 222.271279 
+L 325.041142 222.596919 
+L 325.229596 224.350565 
+L 325.323823 225.057093 
+L 325.606505 218.630741 
+L 325.983414 212.117905 
+L 326.077641 211.062229 
+L 326.171868 208.022526 
+L 326.360323 208.615813 
+L 326.45455 208.328594 
+L 326.737232 210.925228 
+L 326.831459 211.062771 
+L 327.019913 213.86209 
+L 327.114141 214.388141 
+L 327.396822 212.557328 
+L 327.585277 213.258259 
+L 327.679504 211.877701 
+L 327.773731 212.391583 
+L 327.867958 211.467636 
+L 327.962186 212.236238 
+L 328.056413 211.889401 
+L 328.244867 210.56958 
+L 328.339095 210.646711 
+L 328.433322 210.242532 
+L 328.621776 207.124578 
+L 328.716004 206.527752 
+L 328.810231 207.049505 
+L 328.904458 204.986686 
+L 328.998685 205.30774 
+L 329.092912 205.761606 
+L 329.564049 218.025645 
+L 330.035185 225.14921 
+L 330.223639 222.223361 
+L 330.694775 218.294232 
+L 330.789002 218.532666 
+L 330.977457 220.857392 
+L 331.260139 217.379638 
+L 331.354366 218.189043 
+L 331.637048 215.542939 
+L 331.731275 215.348667 
+L 331.825502 214.666043 
+L 331.919729 213.100855 
+L 332.013956 213.192791 
+L 332.108184 212.838227 
+L 332.202411 214.681679 
+L 332.296638 212.357892 
+L 332.390865 212.411263 
+L 332.485093 211.832022 
+L 332.673547 213.584693 
+L 332.767774 213.173111 
+L 332.862001 214.972798 
+L 332.956229 214.736999 
+L 333.521592 223.231408 
+L 333.615819 223.604821 
+L 333.710047 223.502305 
+L 333.804274 223.998059 
+L 333.898501 222.905081 
+L 334.181183 225.144082 
+L 334.463864 220.603899 
+L 334.652319 223.155215 
+L 334.840773 222.7575 
+L 334.935 220.737615 
+L 335.123455 222.666828 
+L 335.217682 222.318294 
+L 335.311909 220.551973 
+L 335.500364 221.323608 
+L 335.594591 221.179601 
+L 336.442636 232.175963 
+L 336.536863 232.263999 
+L 336.631091 232.504817 
+L 336.819545 230.418634 
+L 337.007999 228.636027 
+L 337.102227 228.981781 
+L 337.196454 229.460816 
+L 337.384908 229.015652 
+L 337.479136 228.926568 
+L 337.573363 228.551241 
+L 337.66759 228.754324 
+L 337.761817 229.619447 
+L 337.856044 229.283119 
+L 338.138726 231.358252 
+L 338.232953 231.100174 
+L 338.327181 231.906763 
+L 338.421408 231.451235 
+L 338.515635 231.499514 
+L 338.798317 226.766757 
+L 338.892544 226.791854 
+L 339.080998 227.28703 
+L 339.269453 228.220871 
+L 339.36368 228.486315 
+L 339.834816 221.961997 
+L 339.929043 222.427527 
+L 340.023271 222.04718 
+L 340.211725 218.529452 
+L 340.305952 218.148817 
+L 340.588634 215.59472 
+L 340.682861 216.30959 
+L 340.777089 218.279318 
+L 341.153997 215.233656 
+L 341.248225 215.725186 
+L 341.530906 220.482931 
+L 341.907815 222.499313 
+L 342.002042 223.546431 
+L 342.09627 223.129469 
+L 342.190497 223.516532 
+L 342.284724 223.226352 
+L 342.378951 223.505519 
+L 342.567406 222.278392 
+L 342.661633 220.959475 
+L 342.75586 221.333574 
+L 343.038542 218.329547 
+L 343.132769 218.075225 
+L 343.226996 218.460807 
+L 343.509678 216.690875 
+L 343.603905 216.45338 
+L 343.698133 215.725113 
+L 343.886587 218.12047 
+L 343.980814 217.839137 
+L 344.169269 216.38961 
+L 344.546178 220.875663 
+L 344.734632 224.114658 
+L 344.923086 223.459912 
+L 345.111541 226.007003 
+L 345.205768 225.184092 
+L 345.394223 222.114959 
+L 345.582677 221.319852 
+L 345.865359 218.667645 
+L 345.959586 218.466801 
+L 346.14804 214.158734 
+L 346.713404 210.112139 
+L 346.807631 210.081662 
+L 346.901858 210.447854 
+L 346.996085 208.413813 
+L 347.18454 209.73822 
+L 347.372994 207.835078 
+L 347.467222 209.881143 
+L 347.561449 208.587828 
+L 347.749903 209.853375 
+L 347.844131 209.766747 
+L 347.938358 210.105206 
+L 348.221039 218.37194 
+L 348.88063 226.442381 
+L 349.163312 221.923251 
+L 349.257539 221.267637 
+L 349.634448 216.394232 
+L 349.728675 215.869409 
+L 349.91713 212.457122 
+L 350.105584 213.703351 
+L 350.199811 214.505245 
+L 350.57672 211.735247 
+L 350.670947 212.399852 
+L 350.765175 212.222155 
+L 350.859402 212.968621 
+L 350.953629 212.953671 
+L 351.047856 212.852888 
+L 351.142083 212.428812 
+L 351.236311 211.234113 
+L 351.330538 211.244549 
+L 351.424765 210.917103 
+L 351.518992 211.82935 
+L 351.707447 211.124302 
+L 351.801674 211.31406 
+L 352.178583 219.425811 
+L 352.367037 220.303103 
+L 352.555492 220.768092 
+L 352.932401 217.436475 
+L 353.026628 217.31139 
+L 353.120855 217.501762 
+L 353.30931 218.834076 
+L 353.403537 218.379451 
+L 353.780446 213.152312 
+L 353.9689 212.275272 
+L 354.063127 213.907083 
+L 354.157355 212.238801 
+L 354.251582 213.388399 
+L 354.345809 212.952624 
+L 354.722718 217.978522 
+L 355.193854 225.72065 
+L 355.288081 225.651789 
+L 355.382309 225.890042 
+L 355.476536 226.803264 
+L 356.230354 210.626273 
+L 356.418808 209.326458 
+L 356.70149 206.637165 
+L 356.795717 207.940339 
+L 356.889944 204.876333 
+L 356.984172 205.193776 
+L 357.078399 204.864237 
+L 357.266853 202.296924 
+L 357.455308 204.732868 
+L 358.020671 211.786668 
+L 358.39758 213.343478 
+L 358.491807 212.272817 
+L 358.680262 213.15874 
+L 358.774489 213.130213 
+L 358.868716 212.820389 
+L 358.962943 212.032035 
+L 359.05717 209.332777 
+L 359.151398 210.585036 
+L 359.245625 209.704493 
+L 359.622534 214.7318 
+L 359.810988 215.941015 
+L 359.905216 215.182922 
+L 359.999443 215.360511 
+L 360.09367 215.884828 
+L 360.187897 217.545346 
+L 360.376352 215.043718 
+L 360.470579 214.425334 
+L 360.659033 216.01013 
+L 360.847488 214.858762 
+L 361.035942 214.405257 
+L 361.224397 213.625353 
+L 361.412851 210.87944 
+L 361.507078 211.106319 
+L 361.601306 210.743631 
+L 361.695533 210.779993 
+L 361.978215 205.769478 
+L 362.072442 206.280399 
+L 362.449351 216.133879 
+L 362.82626 223.465148 
+L 362.920487 223.471178 
+L 363.203168 227.902958 
+L 363.391623 228.735149 
+L 363.580077 231.029543 
+L 363.674305 230.431019 
+L 363.956986 232.688653 
+L 364.051214 232.671681 
+L 364.333895 229.893486 
+L 364.428122 230.367827 
+L 364.616577 229.967222 
+L 364.710804 230.596078 
+L 364.805031 229.780064 
+L 364.899259 230.177455 
+L 365.087713 226.558836 
+L 365.18194 226.309749 
+L 365.464622 220.627118 
+L 365.558849 220.009384 
+L 365.653076 220.072794 
+L 365.747304 220.409375 
+L 365.841531 219.240205 
+L 366.124212 224.077718 
+L 366.312667 228.090297 
+L 366.595349 231.575634 
+L 366.783803 233.973988 
+L 367.443394 226.301949 
+L 367.631848 225.579713 
+L 367.820303 224.741058 
+L 368.197211 229.873373 
+L 368.479893 228.334798 
+L 368.668348 226.029427 
+L 368.762575 225.958399 
+L 369.327938 218.965228 
+L 369.422165 219.060739 
+L 369.61062 219.532696 
+L 370.081756 226.011047 
+L 370.458665 228.881648 
+L 370.552892 229.048548 
+L 371.024028 235.087512 
+L 371.118256 235.002581 
+L 371.495164 237.741525 
+L 371.872073 222.033747 
+L 372.343209 214.216149 
+L 372.437437 214.531895 
+L 372.531664 214.420604 
+L 372.625891 214.723819 
+L 372.720118 213.636258 
+L 372.814346 214.594293 
+L 372.908573 214.433025 
+L 373.097027 212.34724 
+L 373.191254 213.108872 
+L 373.285482 213.150109 
+L 373.473936 216.375743 
+L 373.945072 211.988884 
+L 374.0393 212.228907 
+L 374.133527 210.583952 
+L 374.227754 210.928514 
+L 374.321981 212.074934 
+L 374.510436 210.899156 
+L 374.604663 210.729439 
+L 374.69889 209.459703 
+L 374.793117 209.567095 
+L 375.075799 207.179249 
+L 375.170026 205.214973 
+L 375.358481 209.503288 
+L 375.641162 216.745546 
+L 375.829617 219.193551 
+L 376.018071 222.434568 
+L 376.489207 227.953584 
+L 376.583435 228.033207 
+L 376.677662 227.331156 
+L 376.866116 228.294752 
+L 377.054571 227.505134 
+L 377.148798 227.44577 
+L 377.619934 218.562023 
+L 377.714161 218.76442 
+L 377.996843 215.20531 
+L 378.185298 212.702706 
+L 378.279525 214.135335 
+L 378.656434 211.375845 
+L 378.750661 210.850372 
+L 378.939115 207.712954 
+L 379.033343 207.271943 
+L 379.12757 207.82829 
+L 379.410251 210.869835 
+L 379.504479 209.712149 
+L 379.692933 212.372806 
+L 379.78716 212.279245 
+L 379.881388 211.398233 
+L 380.069842 213.954207 
+L 380.164069 214.655427 
+L 380.258296 213.028166 
+L 380.352524 213.449389 
+L 380.446751 213.190444 
+L 380.635205 214.683448 
+L 380.82366 214.447614 
+L 381.200569 212.283108 
+L 381.389023 213.205827 
+L 381.577478 211.08079 
+L 381.860159 208.472239 
+L 382.142841 214.903828 
+L 382.51975 224.238877 
+L 382.613977 224.164707 
+L 382.990886 228.777831 
+L 383.179341 230.529419 
+L 383.273568 230.56116 
+L 383.367795 230.427986 
+L 383.462022 230.684367 
+L 383.556249 230.625688 
+L 383.650477 230.541805 
+L 383.744704 230.782622 
+L 383.933158 230.35916 
+L 384.498522 222.78516 
+L 384.592749 222.757139 
+L 384.686976 223.101664 
+L 385.063885 214.205785 
+L 385.158112 214.816803 
+L 385.25234 214.728044 
+L 385.629248 212.230207 
+L 385.723476 214.160106 
+L 385.817703 214.028449 
+L 385.91193 214.09435 
+L 386.100385 215.832107 
+L 386.194612 214.691681 
+L 386.383066 216.005399 
+L 386.477293 215.973298 
+L 386.571521 216.631041 
+L 386.665748 215.28208 
+L 386.759975 215.509067 
+L 386.854202 215.67882 
+L 386.94843 216.146698 
+L 387.136884 214.245651 
+L 387.231111 214.467294 
+L 387.325338 214.014113 
+L 387.419566 214.304943 
+L 387.60802 212.675371 
+L 387.796475 209.425255 
+L 387.890702 210.301716 
+L 388.079156 207.381067 
+L 388.173384 207.73462 
+L 388.267611 208.269662 
+L 388.456065 206.022573 
+L 388.550292 206.782653 
+L 388.64452 206.603764 
+L 388.832974 207.974319 
+L 388.927201 206.85787 
+L 389.398337 213.628386 
+L 389.586792 217.628616 
+L 389.681019 217.062989 
+L 389.963701 221.292842 
+L 390.057928 220.820343 
+L 390.152155 220.799435 
+L 390.246383 220.973955 
+L 390.34061 221.944086 
+L 390.434837 221.549043 
+L 390.717519 217.606986 
+L 390.905973 215.841929 
+L 391.0002 215.865545 
+L 391.377109 213.555191 
+L 391.471336 214.451478 
+L 391.565564 214.086658 
+L 391.754018 216.56301 
+L 391.942473 214.927516 
+L 392.0367 215.201627 
+L 392.130927 216.807764 
+L 392.225154 215.88212 
+L 392.507836 220.020109 
+L 392.602063 219.52923 
+L 392.69629 219.844759 
+L 392.790518 219.670203 
+L 392.978972 220.766033 
+L 393.167427 220.362035 
+L 393.355881 218.524613 
+L 393.450108 218.810605 
+L 393.73279 221.304758 
+L 393.827017 220.626432 
+L 394.109699 223.548598 
+L 394.203926 221.939247 
+L 394.298153 222.977482 
+L 394.486608 222.552034 
+L 394.675062 222.094159 
+L 394.769289 222.253368 
+L 394.863517 221.898732 
+L 395.146198 223.530037 
+L 395.523107 220.599927 
+L 395.617334 218.462432 
+L 395.711562 219.174196 
+L 395.900016 218.191174 
+L 395.994243 216.077908 
+L 396.276925 220.721907 
+L 396.842288 228.79502 
+L 396.936516 228.682356 
+L 397.030743 227.695434 
+L 397.313425 221.309164 
+L 397.596106 217.374943 
+L 397.878788 213.43852 
+L 397.973015 213.077348 
+L 398.16147 214.871762 
+L 398.349924 212.692812 
+L 398.632606 209.133052 
+L 399.009515 211.670069 
+L 399.103742 210.142435 
+L 399.386424 211.16832 
+L 399.480651 210.089245 
+L 399.669105 212.101656 
+L 399.763332 213.409524 
+L 400.046014 219.581878 
+L 400.422923 225.140038 
+L 400.705605 228.398062 
+L 400.894059 224.092595 
+L 400.988286 223.683794 
+L 401.270968 218.997149 
+L 401.55365 216.86164 
+L 401.742104 219.337377 
+L 401.836331 218.192582 
+L 401.930559 218.381401 
+L 402.119013 220.09056 
+L 402.307468 216.419184 
+L 402.495922 216.14937 
+L 402.590149 216.541344 
+L 402.684376 216.166197 
+L 402.778604 214.719269 
+L 402.872831 215.091744 
+L 403.061285 212.907522 
+L 403.155513 212.986603 
+L 403.438194 216.081483 
+L 403.532421 214.933438 
+L 403.815103 219.102698 
+L 403.90933 218.958042 
+L 404.097785 220.34091 
+L 404.192012 219.430722 
+L 404.286239 220.023215 
+L 404.380467 219.175027 
+L 404.663148 214.155484 
+L 404.757375 214.233048 
+L 404.851603 213.387858 
+L 404.94583 213.491566 
+L 405.134284 214.159059 
+L 405.228512 214.099044 
+L 405.416966 212.485577 
+L 406.076557 224.687254 
+L 406.359238 227.567099 
+L 406.547693 230.538483 
+L 406.64192 230.181644 
+L 406.736147 230.456477 
+L 406.830374 231.910013 
+L 406.924602 231.686708 
+L 407.113056 233.566667 
+L 407.207283 235.284276 
+L 407.301511 234.363037 
+L 407.584192 227.749491 
+L 407.678419 227.590282 
+L 408.432237 221.404133 
+L 408.620692 221.833951 
+L 408.809146 221.195959 
+L 408.997601 219.712885 
+L 409.091828 220.377779 
+L 409.186055 219.6732 
+L 409.657191 210.37502 
+L 409.751418 210.965852 
+L 410.128327 207.079621 
+L 410.599463 219.259055 
+L 410.882145 222.524446 
+L 411.164827 227.672974 
+L 411.259054 227.734361 
+L 411.824417 231.710541 
+L 412.107099 233.908629 
+L 412.201326 234.4926 
+L 412.578235 231.57289 
+L 412.955144 233.668245 
+L 413.049371 233.525502 
+L 413.143599 233.548793 
+L 413.42628 230.619333 
+L 413.708962 228.191008 
+L 413.803189 227.937479 
+L 413.897416 227.327365 
+L 414.085871 227.624189 
+L 414.46278 233.113451 
+L 415.028143 236.528807 
+L 415.12237 236.218875 
+L 415.405052 234.440782 
+L 415.499279 234.558537 
+L 415.593506 234.110159 
+L 415.687734 234.258318 
+L 415.781961 234.095209 
+L 416.064643 227.282408 
+L 416.15887 223.42976 
+L 416.253097 223.684588 
+L 416.81846 215.273341 
+L 416.912688 215.724897 
+L 417.101142 214.033396 
+L 417.289597 214.711831 
+L 417.383824 215.32794 
+L 417.478051 214.924591 
+L 417.666505 216.436372 
+L 417.760733 215.19668 
+L 417.85496 215.394021 
+L 417.949187 214.44635 
+L 418.137642 211.497391 
+L 418.231869 212.656702 
+L 418.326096 211.082956 
+L 418.420323 211.482513 
+L 418.608778 209.097051 
+L 419.079914 212.503813 
+L 419.174141 211.630528 
+L 419.456823 214.959148 
+L 419.645277 215.556769 
+L 419.739504 213.999019 
+L 419.833732 214.222901 
+L 420.304868 212.413682 
+L 420.399095 212.490632 
+L 420.58755 211.475833 
+L 420.870231 209.624798 
+L 420.964458 209.819648 
+L 421.058686 209.504913 
+L 421.152913 209.719515 
+L 421.24714 209.294139 
+L 421.341367 209.376001 
+L 421.529822 211.976643 
+L 421.624049 212.012031 
+L 421.718276 211.605432 
+L 421.906731 212.337454 
+L 422.189412 210.560517 
+L 422.28364 210.9136 
+L 422.377867 210.111633 
+L 423.320139 222.882332 
+L 423.508594 220.89296 
+L 423.97973 215.318695 
+L 424.073957 215.460138 
+L 424.262411 215.237123 
+L 424.545093 222.773822 
+L 424.827775 226.178778 
+L 425.110456 231.071214 
+L 425.204684 231.896291 
+L 425.487365 229.597059 
+L 425.581593 230.988991 
+L 425.67582 230.923885 
+L 425.864274 231.233961 
+L 426.146956 229.902947 
+L 426.618092 235.861061 
+L 426.712319 236.180634 
+L 426.806546 237.378331 
+L 426.900774 237.254798 
+L 427.183455 233.133058 
+L 427.37191 228.711353 
+L 427.748819 225.594699 
+L 427.937273 224.939194 
+L 428.0315 224.492044 
+L 428.219955 226.900003 
+L 428.879545 235.541162 
+L 428.879545 235.541162 
+" style="fill:none;stroke:#1f77b4;stroke-linecap:square;stroke-width:1.5;"/>
+   </g>
+   <g id="patch_3">
+    <path d="M 38.27 303.64 
+L 38.27 14.76 
+" style="fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
+   </g>
+   <g id="patch_4">
+    <path d="M 447.48 303.64 
+L 447.48 14.76 
+" style="fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
+   </g>
+   <g id="patch_5">
+    <path d="M 38.27 303.64 
+L 447.48 303.64 
+" style="fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
+   </g>
+   <g id="patch_6">
+    <path d="M 38.27 14.76 
+L 447.48 14.76 
+" style="fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;"/>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="p27d2dc2848">
+   <rect height="288.88" width="409.21" x="38.27" y="14.76"/>
+  </clipPath>
+ </defs>
+</svg>
diff --git a/darknet-master/scripts/requested_cuda_version.sh b/darknet-master/scripts/requested_cuda_version.sh
new file mode 100644
index 0000000..ae9ea2a
--- /dev/null
+++ b/darknet-master/scripts/requested_cuda_version.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Pins the requested CUDA version and exports it in dotted and dashed form.
+export CUDA_VERSION="12.2"
+export CUDA_VERSION_DASHED="${CUDA_VERSION//./-}"  # every '.' replaced by '-'
diff --git a/darknet-master/scripts/reval_voc.py b/darknet-master/scripts/reval_voc.py
new file mode 100644
index 0000000..1164f88
--- /dev/null
+++ b/darknet-master/scripts/reval_voc.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+# Adapt from ->
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+# <- Written by Yaping Sun
+
+"""Reval = re-eval. Re-evaluate saved detections."""
+
+import os, sys, argparse
+import numpy as np
+import cPickle
+
+from voc_eval import voc_eval
+
+def parse_args():
+    """
+    Parse input arguments
+    """
+    parser = argparse.ArgumentParser(description='Re-evaluate results')
+    parser.add_argument('output_dir', nargs=1, help='results directory',
+                        type=str)
+    parser.add_argument('--voc_dir', dest='voc_dir', default='data/VOCdevkit', type=str)
+    parser.add_argument('--year', dest='year', default='2017', type=str)
+    parser.add_argument('--image_set', dest='image_set', default='test', type=str)
+
+    parser.add_argument('--classes', dest='class_file', default='data/voc.names', type=str)
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    args = parser.parse_args()
+    return args
+
+def get_voc_results_file_template(image_set, out_dir = 'results'):
+    filename = 'comp4_det_' + image_set + '_{:s}.txt'
+    path = os.path.join(out_dir, filename)
+    return path
+
+def do_python_eval(devkit_path, year, image_set, classes, output_dir = 'results'):
+    annopath = os.path.join(
+        devkit_path,
+        'VOC' + year,
+        'Annotations',
+        '{:s}.xml')
+    imagesetfile = os.path.join(
+        devkit_path,
+        'VOC' + year,
+        'ImageSets',
+        'Main',
+        image_set + '.txt')
+    cachedir = os.path.join(devkit_path, 'annotations_cache')
+    aps = []
+    # The PASCAL VOC metric changed in 2010
+    use_07_metric = True if int(year) < 2010 else False
+    print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
+    if not os.path.isdir(output_dir):
+        os.mkdir(output_dir)
+    for i, cls in enumerate(classes):
+        if cls == '__background__':
+            continue
+        filename = get_voc_results_file_template(image_set).format(cls)
+        rec, prec, ap = voc_eval(
+            filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
+            use_07_metric=use_07_metric)
+        aps += [ap]
+        print('AP for {} = {:.4f}'.format(cls, ap))
+        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
+            cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
+    print('Mean AP = {:.4f}'.format(np.mean(aps)))
+    print('~~~~~~~~')
+    print('Results:')
+    for ap in aps:
+        print('{:.3f}'.format(ap))
+    print('{:.3f}'.format(np.mean(aps)))
+    print('~~~~~~~~')
+    print('')
+    print('--------------------------------------------------------------')
+    print('Results computed with the **unofficial** Python eval code.')
+    print('Results should be very close to the official MATLAB eval code.')
+    print('-- Thanks, The Management')
+    print('--------------------------------------------------------------')
+
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    output_dir = os.path.abspath(args.output_dir[0])
+    with open(args.class_file, 'r') as f:
+        lines = f.readlines()
+
+    classes = [t.strip('\n') for t in lines]
+
+    print 'Evaluating detections'
+    do_python_eval(args.voc_dir, args.year, args.image_set, classes, output_dir)
diff --git a/darknet-master/scripts/reval_voc_py3.py b/darknet-master/scripts/reval_voc_py3.py
new file mode 100644
index 0000000..23f9ce3
--- /dev/null
+++ b/darknet-master/scripts/reval_voc_py3.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+# Adapt from ->
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+# <- Written by Yaping Sun
+
+"""Reval = re-eval. Re-evaluate saved detections."""
+
+import os, sys, argparse
+import numpy as np
+import _pickle as cPickle
+#import cPickle
+
+from voc_eval_py3 import voc_eval
+
+def parse_args():
+    """
+    Parse input arguments
+    """
+    parser = argparse.ArgumentParser(description='Re-evaluate results')
+    parser.add_argument('output_dir', nargs=1, help='results directory',
+                        type=str)
+    parser.add_argument('--voc_dir', dest='voc_dir', default='data/VOCdevkit', type=str)
+    parser.add_argument('--year', dest='year', default='2017', type=str)
+    parser.add_argument('--image_set', dest='image_set', default='test', type=str)
+
+    parser.add_argument('--classes', dest='class_file', default='data/voc.names', type=str)
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    args = parser.parse_args()
+    return args
+
+def get_voc_results_file_template(image_set, out_dir = 'results'):
+    filename = 'comp4_det_' + image_set + '_{:s}.txt'
+    path = os.path.join(out_dir, filename)
+    return path
+
+def do_python_eval(devkit_path, year, image_set, classes, output_dir = 'results'):
+    annopath = os.path.join(
+        devkit_path,
+        'VOC' + year,
+        'Annotations',
+        '{}.xml')
+    imagesetfile = os.path.join(
+        devkit_path,
+        'VOC' + year,
+        'ImageSets',
+        'Main',
+        image_set + '.txt')
+    cachedir = os.path.join(devkit_path, 'annotations_cache')
+    aps = []
+    # The PASCAL VOC metric changed in 2010
+    use_07_metric = True if int(year) < 2010 else False
+    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
+    print('devkit_path=',devkit_path,', year = ',year)
+
+    if not os.path.isdir(output_dir):
+        os.mkdir(output_dir)
+    for i, cls in enumerate(classes):
+        if cls == '__background__':
+            continue
+        filename = get_voc_results_file_template(image_set).format(cls)
+        rec, prec, ap = voc_eval(
+            filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
+            use_07_metric=use_07_metric)
+        aps += [ap]
+        print('AP for {} = {:.4f}'.format(cls, ap))
+        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
+            cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
+    print('Mean AP = {:.4f}'.format(np.mean(aps)))
+    print('~~~~~~~~')
+    print('Results:')
+    for ap in aps:
+        print('{:.3f}'.format(ap))
+    print('{:.3f}'.format(np.mean(aps)))
+    print('~~~~~~~~')
+    print('')
+    print('--------------------------------------------------------------')
+    print('Results computed with the **unofficial** Python eval code.')
+    print('Results should be very close to the official MATLAB eval code.')
+    print('-- Thanks, The Management')
+    print('--------------------------------------------------------------')
+
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    output_dir = os.path.abspath(args.output_dir[0])
+    with open(args.class_file, 'r') as f:
+        lines = f.readlines()
+
+    classes = [t.strip('\n') for t in lines]
+
+    print('Evaluating detections')
+    do_python_eval(args.voc_dir, args.year, args.image_set, classes, output_dir)
diff --git a/darknet-master/scripts/setup.ps1 b/darknet-master/scripts/setup.ps1
new file mode 100644
index 0000000..125cf6f
--- /dev/null
+++ b/darknet-master/scripts/setup.ps1
@@ -0,0 +1,41 @@
+#!/usr/bin/env pwsh
+# Installs build tools through Chocolatey, optionally deploys CUDA, then runs ../build.ps1.
+param (
+  [switch]$InstallCUDA = $false
+)
+# utils.psm1 sits next to this script; -Force re-imports it even if already loaded.
+Import-Module -Name $PSScriptRoot/utils.psm1 -Force
+# Bootstrap Chocolatey when choco.exe cannot be found, then abort so the user restarts the shell.
+if ($null -eq (Get-Command "choco.exe" -ErrorAction SilentlyContinue)) {
+  # Download and install Chocolatey
+  Invoke-Expression ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))
+  Throw "Please close and re-open powershell and then re-run setup.ps1 script"
+}
+# Both installs request elevation (-Verb runAs). NOTE(review): no -Wait is passed, so the script presumably continues before they finish - confirm.
+Start-Process -FilePath "choco" -Verb runAs -ArgumentList " install -y cmake ninja powershell git vscode"
+Start-Process -FilePath "choco" -Verb runAs -ArgumentList " install -y visualstudio2022buildtools --package-parameters `"--add Microsoft.VisualStudio.Component.VC.CoreBuildTools --includeRecommended --includeOptional --passive --locale en-US --lang en-US`""
+Push-Location $PSScriptRoot
+# NOTE(review): $cuda_version_short is not assigned in this script - presumably provided by utils.psm1; confirm.
+if ($InstallCUDA) {
+  & $PSScriptRoot/deploy-cuda.ps1
+  $env:CUDA_PATH = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v${cuda_version_short}"
+  $env:CUDA_TOOLKIT_ROOT_DIR = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v${cuda_version_short}"
+  $env:CUDACXX = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v${cuda_version_short}\\bin\\nvcc.exe"
+  $CUDAisAvailable = $true
+}
+else {
+  if (-not $null -eq $env:CUDA_PATH) {  # NOTE(review): -not likely binds before -eq, i.e. ($true -eq $env:CUDA_PATH) - confirm intent
+    $CUDAisAvailable = $true
+  }
+  else{
+    $CUDAisAvailable = $false
+  }
+}
+# Build with CUDA (and a forced local vcpkg) when CUDA is available, otherwise without CUDA.
+if ($CUDAisAvailable) {
+  & $PSScriptRoot/../build.ps1 -UseVCPKG -ForceLocalVCPKG -EnableOPENCV -EnableCUDA -DisableInteractive -DoNotUpdateTOOL
+  #& $PSScriptRoot/../build.ps1 -UseVCPKG -EnableOPENCV -EnableCUDA -EnableOPENCV_CUDA  -DisableInteractive -DoNotUpdateTOOL
+}
+else {
+  & $PSScriptRoot/../build.ps1 -UseVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+}
diff --git a/darknet-master/scripts/setup.sh b/darknet-master/scripts/setup.sh
new file mode 100644
index 0000000..e583e97
--- /dev/null
+++ b/darknet-master/scripts/setup.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+# Prepares a machine for building darknet, then launches build.ps1.
+#
+# Flags (one or two leading dashes are accepted):
+#   -InstallCUDA    run deploy-cuda.sh and export the CUDA environment (Ubuntu only)
+#   -InstallTOOLS   install the build toolchain with apt (Ubuntu only)
+#   -BypassDRIVER   with -InstallCUDA, symlink the libcuda stub where a driver library is expected
+# Every other argument is kept as a positional parameter.
+#
+# Exit codes: 1 = requested_cuda_version.sh missing or working directory unavailable,
+#             2/3 = CUDA/tools requested on macOS, 4/5 = CUDA/tools requested on another OS.
+
+install_tools=false
+install_cuda=false
+bypass_driver_installation=false
+
+POSITIONAL=()
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -InstallCUDA|--InstallCUDA)
+      install_cuda=true
+      ;;
+    -InstallTOOLS|--InstallTOOLS)
+      install_tools=true
+      ;;
+    -BypassDRIVER|--BypassDRIVER)
+      bypass_driver_installation=true
+      ;;
+    *)
+      POSITIONAL+=("$1") # unknown option: save it in an array for later
+      ;;
+  esac
+  shift
+done
+set -- "${POSITIONAL[@]}" # restore positional parameters
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "This script is located in $script_dir"
+# Paths are quoted so a checkout located under a directory with spaces works,
+# and a failed cd aborts instead of letting the later rm -rf run elsewhere.
+cd "$script_dir/.." || exit 1
+temp_folder="./temp"
+mkdir -p "$temp_folder"
+cd "$temp_folder" || exit 1
+
+if [ -f "$script_dir/requested_cuda_version.sh" ]; then
+  echo "Loading $script_dir/requested_cuda_version.sh"
+  source "$script_dir/requested_cuda_version.sh"
+else
+  echo "Unable to find requested_cuda_version.sh script"
+  exit 1
+fi
+
+# Distributor id ("Ubuntu", ...); left empty when lsb_release is not installed
+# so that such systems fall through to the generic branch without an error.
+distro_id=""
+if command -v lsb_release > /dev/null 2>&1; then
+  distro_id="$(lsb_release -is)"
+fi
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  if [ "$install_cuda" = true ] ; then
+    echo "Unable to install CUDA on macOS, please wait for a future script update or do not put -InstallCUDA command line flag to continue"
+    exit 2
+  fi
+  if [ "$install_tools" = true ] ; then
+    echo "Unable to provide tools on macOS, please wait for a future script update or do not put -InstallTOOLS command line flag to continue"
+    exit 3
+  fi
+elif [[ "$distro_id" == "Ubuntu" ]]; then
+  echo "Running in $distro_id"
+  echo "InstallCUDA: $install_cuda"
+  echo "InstallTOOLS: $install_tools"
+  if [ "$install_cuda" = true ] ; then
+    echo "Running $script_dir/deploy-cuda.sh"
+    "$script_dir/deploy-cuda.sh"
+    if [ "$bypass_driver_installation" = true ] ; then
+      # CUDA_VERSION is presumably defined by requested_cuda_version.sh - TODO confirm.
+      cuda_root="/usr/local/cuda-${CUDA_VERSION}"
+      cuda_stub="$cuda_root/lib64/stubs/libcuda.so"
+      # -f keeps the script re-runnable: plain "ln -s" fails once the links exist.
+      sudo ln -sf "$cuda_stub" "$cuda_root/lib64/stubs/libcuda.so.1"
+      sudo ln -sf "$cuda_stub" "$cuda_root/lib64/libcuda.so.1"
+      sudo ln -sf "$cuda_stub" "$cuda_root/lib64/libcuda.so"
+    fi
+    export PATH=/usr/local/cuda/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+    export CUDA_PATH=/usr/local/cuda
+    export CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
+  fi
+  if [ "$install_tools" = true ] ; then
+    echo "Installing tools"
+    sudo apt-get update
+    sudo apt-get install -y  --no-install-recommends git ninja-build build-essential g++ nasm yasm gperf
+    sudo apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg software-properties-common wget
+    sudo apt-get install -y --no-install-recommends libgles2-mesa-dev libx11-dev libxft-dev libxext-dev libxrandr-dev libxi-dev libxcursor-dev libxdamage-dev libxinerama-dev libdbus-1-dev libxtst-dev
+    # Kitware repository (cmake) and Microsoft repository (powershell).
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+    sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+    wget -q "https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/packages-microsoft-prod.deb"
+    sudo dpkg -i packages-microsoft-prod.deb
+    sudo add-apt-repository universe
+    sudo apt-get update
+    sudo apt-get dist-upgrade -y
+    sudo apt-get install -y --no-install-recommends cmake
+    sudo apt-get install -y --no-install-recommends powershell
+    sudo apt-get install -y --no-install-recommends curl zip unzip tar
+    sudo apt-get install -y --no-install-recommends pkg-config autoconf libtool bison
+    sudo apt-get clean
+  fi
+else
+  if [ "$install_cuda" = true ] ; then
+    echo "Unable to install CUDA on this OS, please wait for a future script update or do not put -InstallCUDA command line flag to continue"
+    exit 4
+  fi
+  if [ "$install_tools" = true ] ; then
+    echo "Unable to install tools on this OS, please wait for a future script update or do not put -InstallTOOLS command line flag to continue"
+    exit 5
+  fi
+fi
+
+cd .. || exit 1
+rm -rf "$temp_folder"
+echo "Building darknet"
+# "[[ -v VAR ]]" needs bash >= 4.2 (macOS ships 3.2); a non-empty test works everywhere.
+if [[ -n "${CUDA_PATH:-}" ]]; then
+  ./build.ps1 -UseVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -DisableInteractive -DoNotUpdateTOOL
+  #./build.ps1 -UseVCPKG -EnableOPENCV -EnableCUDA -EnableCUDNN -EnableOPENCV_CUDA -DisableInteractive -DoNotUpdateTOOL
+else
+  ./build.ps1 -UseVCPKG -EnableOPENCV -DisableInteractive -DoNotUpdateTOOL
+fi
diff --git a/darknet-master/scripts/utils.psm1 b/darknet-master/scripts/utils.psm1
new file mode 100644
index 0000000..928d933
--- /dev/null
+++ b/darknet-master/scripts/utils.psm1
@@ -0,0 +1,342 @@
+<#
+Copyright (c) Stefano Sinigardi
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+#>
+
+# Version of this helper module.
+$utils_psm1_version = "0.3.0"
+
+# True when the running PowerShell reports a major version of 2, 3, 4 or 5.
+$IsWindowsPowerShell = (2..5) -contains $PSVersionTable.PSVersion.Major
+
+# Suffix appended to the names of downloaded executables.
+$ExecutableSuffix = if ($IsWindowsPowerShell -or $IsWindows) { ".exe" } else { "" }
+
+$64bitPwsh = [Environment]::Is64BitProcess
+$64bitOS = [Environment]::Is64BitOperatingSystem
+
+# The checkout counts as a git submodule when git is available and reports a
+# superproject working tree for the directory holding this module.
+$IsInGitSubmodule = $false
+Push-Location $PSScriptRoot
+$GIT_EXE = Get-Command "git" -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Definition
+if ($GIT_EXE) {
+  $IsInGitSubmoduleString = $(git rev-parse --show-superproject-working-tree 2> $null)
+  $IsInGitSubmodule = -not ($IsInGitSubmoduleString.Length -eq 0)
+}
+Pop-Location
+
+# CUDA version used by the scripts, in dotted and dashed spellings.
+$cuda_version_full = "12.2.0"
+$cuda_version_short = "12.2"
+$cuda_version_full_dashed = $cuda_version_full -replace '\.', '-'
+$cuda_version_short_dashed = $cuda_version_short -replace '\.', '-'
+
+# Returns the value of PROGRAMFILES(X86), or of PROGRAMFILES when the former is
+# not defined; reports the failure through MyThrow when neither is defined.
+function getProgramFiles32bit() {
+  $programFiles = ${env:PROGRAMFILES(X86)}
+  if ($null -ne $programFiles) {
+    return $programFiles
+  }
+
+  $programFiles = ${env:PROGRAMFILES}
+  if ($null -ne $programFiles) {
+    return $programFiles
+  }
+
+  MyThrow("Could not find [Program Files 32-bit]")
+  return $programFiles
+}
+
+# Returns the installation path (without trailing backslash) of the latest
+# Visual Studio instance reported by vswhere.
+#
+# The search is widened step by step until an instance is found:
+#   1. instances providing the Native Desktop workload,
+#   2. any instance,
+#   3. any instance, pre-releases included.
+#
+# $required: when $true (default) a failure is reported through MyThrow;
+#            when $false the message is printed in red and $null is returned.
+function getLatestVisualStudioWithDesktopWorkloadPath([bool]$required = $true) {
+  $programFiles = getProgramFiles32bit
+  $vswhereExe = "$programFiles\Microsoft Visual Studio\Installer\vswhere.exe"
+  # Initialised explicitly: an unset local would otherwise resolve to a
+  # same-named variable of an outer scope and short-circuit the search.
+  $installationPath = $null
+  $failure = $null
+
+  if (Test-Path $vswhereExe) {
+    $queries = @(
+      @('-products', '*', '-latest', '-requires', 'Microsoft.VisualStudio.Workload.NativeDesktop', '-format', 'xml'),
+      @('-products', '*', '-latest', '-format', 'xml'),
+      @('-prerelease', '-products', '*', '-latest', '-format', 'xml')
+    )
+    foreach ($query in $queries) {
+      $output = & $vswhereExe @query
+      [xml]$asXml = $output
+      foreach ($instance in $asXml.instances.instance) {
+        $installationPath = $instance.InstallationPath -replace "\\$" # Remove potential trailing backslash
+      }
+      if ($installationPath) {
+        break
+      }
+    }
+    if (!$installationPath) {
+      $failure = "Could not locate any installation of Visual Studio"
+    }
+  }
+  else {
+    $failure = "Could not locate vswhere at $vswhereExe"
+  }
+
+  if ($failure) {
+    if ($required) {
+      MyThrow($failure)
+    }
+    else {
+      Write-Host $failure -ForegroundColor Red
+      return $null
+    }
+  }
+  return $installationPath
+}
+
+# Returns the installation version of the latest Visual Studio instance
+# reported by vswhere.
+#
+# The search is widened step by step until an instance is found:
+#   1. instances providing the Native Desktop workload,
+#   2. any instance,
+#   3. any instance, pre-releases included.
+#
+# $required: when $true (default) a failure is reported through MyThrow;
+#            when $false the message is printed in red and $null is returned.
+function getLatestVisualStudioWithDesktopWorkloadVersion([bool]$required = $true) {
+  $programFiles = getProgramFiles32bit
+  $vswhereExe = "$programFiles\Microsoft Visual Studio\Installer\vswhere.exe"
+  # Initialised explicitly: an unset local would otherwise resolve to a
+  # same-named variable of an outer scope and short-circuit the search.
+  $installationVersion = $null
+  $failure = $null
+
+  if (Test-Path $vswhereExe) {
+    $queries = @(
+      @('-products', '*', '-latest', '-requires', 'Microsoft.VisualStudio.Workload.NativeDesktop', '-format', 'xml'),
+      @('-products', '*', '-latest', '-format', 'xml'),
+      @('-prerelease', '-products', '*', '-latest', '-format', 'xml')
+    )
+    foreach ($query in $queries) {
+      $output = & $vswhereExe @query
+      [xml]$asXml = $output
+      foreach ($instance in $asXml.instances.instance) {
+        $installationVersion = $instance.InstallationVersion
+      }
+      if ($installationVersion) {
+        break
+      }
+    }
+    if (!$installationVersion) {
+      $failure = "Could not locate any installation of Visual Studio"
+    }
+  }
+  else {
+    $failure = "Could not locate vswhere at $vswhereExe"
+  }
+
+  if ($failure) {
+    if ($required) {
+      MyThrow($failure)
+    }
+    else {
+      Write-Host $failure -ForegroundColor Red
+      return $null
+    }
+  }
+  return $installationVersion
+}
+
+# Downloads the Ninja 1.10.2 release archive matching the current OS into the
+# current directory, expands it with Expand-Archive and deletes the archive.
+# Any previous "ninja" item and "ninja.zip" are removed first.
+# Returns the string "./ninja" followed by the executable suffix.
+function DownloadNinja() {
+  $archive = "ninja.zip"
+  Write-Host "Downloading a portable version of Ninja" -ForegroundColor Yellow
+  Remove-Item -Force -Recurse -ErrorAction SilentlyContinue ninja
+  Remove-Item -Force -ErrorAction SilentlyContinue $archive
+
+  $onWindows = $IsWindows -or $IsWindowsPowerShell
+  if ($onWindows) {
+    $downloadUrl = "https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-win.zip"
+  }
+  elseif ($IsLinux) {
+    $downloadUrl = "https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-linux.zip"
+  }
+  elseif ($IsMacOS) {
+    $downloadUrl = "https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-mac.zip"
+  }
+  else {
+    MyThrow("Unknown OS, unsupported")
+  }
+
+  Invoke-RestMethod -Uri $downloadUrl -Method Get -ContentType application/zip -OutFile $archive
+  Expand-Archive -Path $archive
+  Remove-Item -Force -ErrorAction SilentlyContinue $archive
+  return "./ninja${ExecutableSuffix}"
+}
+
+# Downloads and unpacks the aria2 build matching the current OS into the
+# current directory (zip via Expand-Archive on Windows, tar elsewhere), removes
+# the archive and returns the relative path of the aria2c executable.
+function DownloadAria2() {
+  Write-Host "Downloading a portable version of Aria2" -ForegroundColor Yellow
+
+  # Per-OS description of what to fetch and where the binary ends up.
+  $unpackWithTar = $true
+  if ($IsWindows -or $IsWindowsPowerShell) {
+    $basename = "aria2-1.35.0-win-32bit-build1"
+    $zipName = "${basename}.zip"
+    $outFolder = "$basename/$basename"
+    $url = "https://github.com/aria2/aria2/releases/download/release-1.35.0/$zipName"
+    $unpackWithTar = $false
+  }
+  elseif ($IsLinux) {
+    $basename = "aria2-1.36.0-linux-gnu-64bit-build1"
+    $zipName = "${basename}.tar.bz2"
+    $outFolder = $basename
+    $url = "https://github.com/q3aql/aria2-static-builds/releases/download/v1.36.0/$zipName"
+  }
+  elseif ($IsMacOS) {
+    $basename = "aria2-1.35.0-osx-darwin"
+    $zipName = "${basename}.tar.bz2"
+    $outFolder = "aria2-1.35.0/bin"
+    $url = "https://github.com/aria2/aria2/releases/download/release-1.35.0/$zipName"
+  }
+  else {
+    MyThrow("Unknown OS, unsupported")
+  }
+
+  # Start from a clean slate, then download and unpack.
+  Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $outFolder
+  Remove-Item -Force -ErrorAction SilentlyContinue $zipName
+  Invoke-RestMethod -Uri $url -Method Get -ContentType application/zip -OutFile $zipName
+  if ($unpackWithTar) {
+    tar xf $zipName
+  }
+  else {
+    Expand-Archive -Path $zipName
+  }
+
+  Remove-Item -Force -ErrorAction SilentlyContinue $zipName
+  return "./$outFolder/aria2c${ExecutableSuffix}"
+}
+
+# Downloads and unpacks the 7-Zip build matching the current OS into the
+# current directory (zip via Expand-Archive on Windows, tar elsewhere), removes
+# the archive and returns the relative path of the executable ("7za" on
+# Windows, "7zz" on Linux and macOS, plus the executable suffix).
+function Download7Zip() {
+  Write-Host "Downloading a portable version of 7-Zip" -ForegroundColor Yellow
+
+  # Only the archive name and the binary suffix differ between platforms.
+  $unpackWithTar = $true
+  if ($IsWindows -or $IsWindowsPowerShell) {
+    $basename = "7za920"
+    $zipName = "${basename}.zip"
+    $outSuffix = "a"
+    $unpackWithTar = $false
+  }
+  elseif ($IsLinux) {
+    $basename = "7z2201-linux-x64"
+    $zipName = "${basename}.tar.xz"
+    $outSuffix = "z"
+  }
+  elseif ($IsMacOS) {
+    $basename = "7z2107-mac"
+    $zipName = "${basename}.tar.xz"
+    $outSuffix = "z"
+  }
+  else {
+    MyThrow("Unknown OS, unsupported")
+  }
+  $outFolder = "$basename"
+  $url = "https://www.7-zip.org/a/$zipName"
+
+  # Start from a clean slate, then download and unpack.
+  Remove-Item -Force -Recurse -ErrorAction SilentlyContinue $outFolder
+  Remove-Item -Force -ErrorAction SilentlyContinue $zipName
+  Invoke-RestMethod -Uri $url -Method Get -ContentType application/zip -OutFile $zipName
+  if ($unpackWithTar) {
+    tar xf $zipName
+  }
+  else {
+    Expand-Archive -Path $zipName
+  }
+
+  Remove-Item -Force -ErrorAction SilentlyContinue $zipName
+  return "./$outFolder/7z${outSuffix}${ExecutableSuffix}"
+}
+
+# Shows $Message and then throws.
+#   - $global:DisableInteractive set: the message is printed in red, no prompt.
+#   - PowerShell ISE: the message is shown in a WScript.Shell popup, because
+#     "ReadKey" is not supported there.
+#   - otherwise: the message is printed in red and the function waits for a key
+#     press (modifier, lock and media keys are ignored) before throwing.
+Function MyThrow ($Message) {
+  if ($global:DisableInteractive) {
+    Write-Host $Message -ForegroundColor Red
+    throw
+  }
+
+  if ($psISE) {
+    $Shell = New-Object -ComObject "WScript.Shell"
+    $Shell.Popup($Message, 0, "OK", 0)
+    throw
+  }
+
+  # Virtual key codes that do not count as "any key":
+  #   16-18 Shift/Ctrl/Alt, 20 Caps lock, 91-93 Windows and Menu keys,
+  #   144-145 Num/Scroll lock, 166-183 browser, volume, media and
+  #   application-launch keys.
+  $Ignore = @(16, 17, 18, 20, 91, 92, 93, 144, 145) + (166..183)
+
+  Write-Host $Message -ForegroundColor Red
+  Write-Host -NoNewline "Press any key to continue..."
+  while (($null -eq $KeyInfo.VirtualKeyCode) -or ($Ignore -contains $KeyInfo.VirtualKeyCode)) {
+    $KeyInfo = $Host.UI.RawUI.ReadKey("NoEcho, IncludeKeyDown")
+  }
+  Write-Host ""
+  throw
+}
+
+# Public surface of the module: variables first, then functions.
+$exportedVariables = @(
+  'utils_psm1_version',
+  'IsWindowsPowerShell',
+  'IsInGitSubmodule',
+  '64bitPwsh',
+  '64bitOS',
+  'cuda_version_full',
+  'cuda_version_short',
+  'cuda_version_full_dashed',
+  'cuda_version_short_dashed'
+)
+$exportedFunctions = @(
+  'getProgramFiles32bit',
+  'getLatestVisualStudioWithDesktopWorkloadPath',
+  'getLatestVisualStudioWithDesktopWorkloadVersion',
+  'DownloadNinja',
+  'DownloadAria2',
+  'Download7Zip',
+  'MyThrow'
+)
+Export-ModuleMember -Variable $exportedVariables -Function $exportedFunctions
diff --git a/darknet-master/scripts/voc_eval.py b/darknet-master/scripts/voc_eval.py
new file mode 100644
index 0000000..3b69331
--- /dev/null
+++ b/darknet-master/scripts/voc_eval.py
@@ -0,0 +1,200 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+import xml.etree.ElementTree as ET
+import os
+import cPickle
+import numpy as np
+
+def parse_rec(filename):
+    """Read a PASCAL VOC annotation file and return its objects.
+
+    Every <object> element yields a dict with the keys 'name' (text of
+    <name>), 'difficult' (int) and 'bbox' ([xmin, ymin, xmax, ymax] as ints
+    taken from <bndbox>). The dicts are returned in document order.
+    """
+    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
+    objects = []
+    for node in ET.parse(filename).findall('object'):
+        name = node.find('name').text
+        difficult = int(node.find('difficult').text)
+        box = node.find('bndbox')
+        objects.append({
+            'name': name,
+            'difficult': difficult,
+            'bbox': [int(box.find(tag).text) for tag in corner_tags],
+        })
+    return objects
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """Return the VOC average precision for a precision/recall curve.
+
+    rec, prec: numpy arrays of recall and precision values.
+    use_07_metric: when true, use the VOC 07 11-point method (mean of the
+        best precision reached at recall >= 0.0, 0.1, ..., 1.0); otherwise
+        integrate the monotone precision envelope over recall.
+    """
+    if use_07_metric:
+        # 11 point metric
+        ap = 0.
+        for t in np.arange(0., 1.1, 0.1):
+            reached = rec >= t
+            p = np.max(prec[reached]) if np.sum(reached) != 0 else 0
+            ap = ap + p / 11.
+        return ap
+
+    # Sentinel values at both ends of the curve.
+    mrec = np.concatenate(([0.], rec, [1.]))
+    mpre = np.concatenate(([0.], prec, [0.]))
+
+    # Precision envelope: running maximum taken from the right.
+    mpre = np.maximum.accumulate(mpre[::-1])[::-1]
+
+    # Indices where recall changes value, then sum (\Delta recall) * prec.
+    i = np.where(mrec[1:] != mrec[:-1])[0]
+    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+
+def voc_eval(detpath,
+             annopath,
+             imagesetfile,
+             classname,
+             cachedir,
+             ovthresh=0.5,
+             use_07_metric=False):
+    """rec, prec, ap = voc_eval(detpath,
+                                annopath,
+                                imagesetfile,
+                                classname,
+                                cachedir,
+                                [ovthresh],
+                                [use_07_metric])
+
+    Top level function that does the PASCAL VOC evaluation for one class.
+
+    detpath: Path to detections
+        detpath.format(classname) should produce the detection results file.
+        Each line is read as "image_id confidence xmin ymin xmax ymax",
+        separated by single spaces.
+    annopath: Path to annotations
+        annopath.format(imagename) should be the xml annotations file.
+    imagesetfile: Text file containing the list of images, one image per line.
+    classname: Category name
+    cachedir: Directory for caching the annotations; it is created when
+        missing and holds the parsed annotations in 'annots.pkl'.
+    [ovthresh]: Overlap threshold (default = 0.5); a detection must exceed it
+        to match a ground-truth box.
+    [use_07_metric]: Whether to use VOC07's 11 point AP computation
+        (default False)
+
+    Returns (rec, prec, ap): cumulative recall and precision arrays, one
+    entry per detection in decreasing confidence order, and the value
+    computed by voc_ap from them.
+    """
+    # NOTE(review): this function uses the Python 2 print statement and the
+    # cPickle module, so it only runs under Python 2.
+
+    # first load gt
+    if not os.path.isdir(cachedir):
+        os.mkdir(cachedir)
+    cachefile = os.path.join(cachedir, 'annots.pkl')
+    # read list of images
+    with open(imagesetfile, 'r') as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    # The cache is reused whenever the file exists; it is not checked against
+    # imagesetfile, so a cache built from another image list makes the
+    # recs[imagename] lookup below fail with KeyError.
+    if not os.path.isfile(cachefile):
+        # load annots
+        recs = {}
+        for i, imagename in enumerate(imagenames):
+            recs[imagename] = parse_rec(annopath.format(imagename))
+            if i % 100 == 0:
+                print 'Reading annotation for {:d}/{:d}'.format(
+                    i + 1, len(imagenames))
+        # save
+        print 'Saving cached annotations to {:s}'.format(cachefile)
+        with open(cachefile, 'w') as f:
+            cPickle.dump(recs, f)
+    else:
+        # load
+        # NOTE(review): unpickling can execute code stored in the file; only
+        # point cachedir at directories you trust.
+        with open(cachefile, 'r') as f:
+            recs = cPickle.load(f)
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0  # number of non-difficult ground-truth boxes of this class
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj['name'] == classname]
+        bbox = np.array([x['bbox'] for x in R])
+        # NOTE(review): the np.bool alias was removed in NumPy 1.24 - confirm
+        # the NumPy version this script is run with.
+        difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
+        det = [False] * len(R)  # per box: already matched by a detection?
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {'bbox': bbox,
+                                 'difficult': difficult,
+                                 'det': det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, 'r') as f:
+        lines = f.readlines()
+
+    splitlines = [x.strip().split(' ') for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+    # sort by confidence (descending, hence the negation)
+    sorted_ind = np.argsort(-confidence)
+    sorted_scores = np.sort(-confidence)  # not used afterwards
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf  # stays -inf when the image has no box of this class
+        BBGT = R['bbox'].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection (the "+ 1." treats coordinates as inclusive pixels)
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1., 0.)
+            ih = np.maximum(iymax - iymin + 1., 0.)
+            inters = iw * ih
+
+            # union
+            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            # A match with a difficult box is counted neither as TP nor as FP.
+            if not R['difficult'][jmax]:
+                if not R['det'][jmax]:
+                    tp[d] = 1.
+                    R['det'][jmax] = 1
+                else:
+                    # the box was already claimed by a more confident detection
+                    fp[d] = 1.
+        else:
+            fp[d] = 1.
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/darknet-master/scripts/voc_eval_py3.py b/darknet-master/scripts/voc_eval_py3.py
new file mode 100644
index 0000000..13d07a9
--- /dev/null
+++ b/darknet-master/scripts/voc_eval_py3.py
@@ -0,0 +1,201 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+import xml.etree.ElementTree as ET
+import os
+#import cPickle
+import _pickle as cPickle
+import numpy as np
+
+def parse_rec(filename):
+    """Parse a PASCAL VOC xml file into a list of object dicts.
+
+    Each dict holds 'name' (str), 'difficult' (int) and 'bbox'
+    ([xmin, ymin, xmax, ymax] as ints) for one <object> element.
+    """
+    def _text(node, tag):
+        # Text content of the first child of *node* named *tag*.
+        return node.find(tag).text
+
+    parsed = []
+    for obj in ET.parse(filename).findall('object'):
+        record = {}
+        record['name'] = _text(obj, 'name')
+        record['difficult'] = int(_text(obj, 'difficult'))
+        bndbox = obj.find('bndbox')
+        record['bbox'] = [int(_text(bndbox, 'xmin')),
+                          int(_text(bndbox, 'ymin')),
+                          int(_text(bndbox, 'xmax')),
+                          int(_text(bndbox, 'ymax'))]
+        parsed.append(record)
+    return parsed
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """ ap = voc_ap(rec, prec, [use_07_metric])
+    Compute the VOC average precision from recall and precision arrays.
+    With use_07_metric the VOC 07 11 point method is applied, otherwise the
+    area under the monotone precision envelope is returned (default).
+    """
+    if not use_07_metric:
+        # Pad the curve with sentinel values at both ends.
+        recall = np.concatenate(([0.], rec, [1.]))
+        envelope = np.concatenate(([0.], prec, [0.]))
+
+        # Make precision non-increasing by sweeping from right to left.
+        for k in reversed(range(envelope.size - 1)):
+            envelope[k] = np.maximum(envelope[k], envelope[k + 1])
+
+        # Sum (\Delta recall) * prec over the points where recall changes.
+        steps = np.where(recall[1:] != recall[:-1])[0]
+        widths = recall[steps + 1] - recall[steps]
+        return np.sum(widths * envelope[steps + 1])
+
+    # 11 point metric: mean of the best precision at recall >= t.
+    ap = 0.
+    for t in np.arange(0., 1.1, 0.1):
+        at_least_t = rec >= t
+        if np.any(at_least_t):
+            p = np.max(prec[at_least_t])
+        else:
+            p = 0
+        ap = ap + p / 11.
+    return ap
+
+def voc_eval(detpath,
+             annopath,
+             imagesetfile,
+             classname,
+             cachedir,
+             ovthresh=0.5,
+             use_07_metric=False):
+    """rec, prec, ap = voc_eval(detpath,
+                                annopath,
+                                imagesetfile,
+                                classname,
+                                cachedir,
+                                [ovthresh],
+                                [use_07_metric])
+
+    Top level function that does the PASCAL VOC evaluation for one class.
+
+    detpath: Path to detections
+        detpath.format(classname) should produce the detection results file.
+        Each non-blank line is read as
+        "image_id confidence xmin ymin xmax ymax", separated by single spaces.
+    annopath: Path to annotations
+        annopath.format(imagename) should be the xml annotations file.
+    imagesetfile: Text file containing the list of images, one image per
+        line; blank lines are ignored.
+    classname: Category name
+    cachedir: Directory for caching the annotations; it is created when
+        missing and holds the parsed annotations in 'annots.pkl'.
+    [ovthresh]: Overlap threshold (default = 0.5); a detection must exceed it
+        to match a ground-truth box.
+    [use_07_metric]: Whether to use VOC07's 11 point AP computation
+        (default False)
+
+    Returns (rec, prec, ap): cumulative recall and precision arrays, one
+    entry per detection in decreasing confidence order, and the value
+    computed by voc_ap from them. With an empty detection file both arrays
+    are empty.
+    """
+    # first load gt
+    if not os.path.isdir(cachedir):
+        os.mkdir(cachedir)
+    cachefile = os.path.join(cachedir, 'annots.pkl')
+    # read list of images, skipping blank lines (e.g. a trailing newline)
+    with open(imagesetfile, 'r') as f:
+        imagenames = [x.strip() for x in f if x.strip()]
+
+    # The cache is reused whenever the file exists; it is not checked against
+    # imagesetfile, so a cache built from another image list makes the
+    # recs[imagename] lookup below fail with KeyError.
+    if not os.path.isfile(cachefile):
+        # load annots and save them for the next run
+        recs = {imagename: parse_rec(annopath.format(imagename))
+                for imagename in imagenames}
+        with open(cachefile, 'wb') as f:
+            cPickle.dump(recs, f)
+    else:
+        # load
+        print('!!! cachefile = ',cachefile)
+        # NOTE: unpickling can execute code stored in the file; only point
+        # cachedir at directories you trust.
+        with open(cachefile, 'rb') as f:
+            recs = cPickle.load(f)
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0  # number of non-difficult ground-truth boxes of this class
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj['name'] == classname]
+        bbox = np.array([x['bbox'] for x in R])
+        # builtin bool: the np.bool alias was removed in NumPy 1.24
+        difficult = np.array([x['difficult'] for x in R]).astype(bool)
+        det = [False] * len(R)  # per box: already matched by a detection?
+        npos = npos + int(np.sum(~difficult))
+        class_recs[imagename] = {'bbox': bbox,
+                                 'difficult': difficult,
+                                 'det': det}
+
+    # read dets, skipping blank lines
+    detfile = detpath.format(classname)
+    with open(detfile, 'r') as f:
+        splitlines = [x.strip().split(' ') for x in f if x.strip()]
+
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+    if BB.size == 0:
+        # keep BB two-dimensional so the column indexing below also works
+        # when there is no detection at all
+        BB = BB.reshape(0, 4)
+
+    # sort by confidence (descending, hence the negation)
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf  # stays -inf when the image has no box of this class
+        BBGT = R['bbox'].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection (the "+ 1." treats coordinates as inclusive pixels)
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1., 0.)
+            ih = np.maximum(iymax - iymin + 1., 0.)
+            inters = iw * ih
+
+            # union
+            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            # A match with a difficult box is counted neither as TP nor as FP.
+            if not R['difficult'][jmax]:
+                if not R['det'][jmax]:
+                    tp[d] = 1.
+                    R['det'][jmax] = True
+                else:
+                    # the box was already claimed by a more confident detection
+                    fp[d] = 1.
+        else:
+            fp[d] = 1.
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/darknet-master/scripts/voc_label.py b/darknet-master/scripts/voc_label.py
new file mode 100644
index 0000000..679fc36
--- /dev/null
+++ b/darknet-master/scripts/voc_label.py
@@ -0,0 +1,59 @@
+import xml.etree.ElementTree as ET
+import pickle
+import os
+from os import listdir, getcwd
+from os.path import join
+
+# (year, image_set) pairs to process; each one selects the list file
+# VOCdevkit/VOC<year>/ImageSets/Main/<image_set>.txt read by the driver loop.
+sets=[('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
+
+# Object class names; a name's position in this list is the class id written
+# to the label files.
+classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+
+
+def convert(size, box):
+    dw = 1./(size[0])
+    dh = 1./(size[1])
+    x = (box[0] + box[1])/2.0 - 1
+    y = (box[2] + box[3])/2.0 - 1
+    w = box[1] - box[0]
+    h = box[3] - box[2]
+    x = x*dw
+    w = w*dw
+    y = y*dh
+    h = h*dh
+    return (x,y,w,h)
+
+def convert_annotation(year, image_id):
+    in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id))
+    out_file = open('VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id), 'w')
+    tree=ET.parse(in_file)
+    root = tree.getroot()
+    size = root.find('size')
+    w = int(size.find('width').text)
+    h = int(size.find('height').text)
+
+    for obj in root.iter('object'):
+        difficult = obj.find('difficult').text
+        cls = obj.find('name').text
+        if cls not in classes or int(difficult)==1:
+            continue
+        cls_id = classes.index(cls)
+        xmlbox = obj.find('bndbox')
+        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
+        bb = convert((w,h), b)
+        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
+
+wd = getcwd()
+
+for year, image_set in sets:
+    if not os.path.exists('VOCdevkit/VOC%s/labels/'%(year)):
+        os.makedirs('VOCdevkit/VOC%s/labels/'%(year))
+    image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split()
+    list_file = open('%s_%s.txt'%(year, image_set), 'w')
+    for image_id in image_ids:
+        list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n'%(wd, year, image_id))
+        convert_annotation(year, image_id)
+    list_file.close()
+
+os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt")
+os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt")
+
diff --git a/darknet-master/scripts/voc_label_difficult.py b/darknet-master/scripts/voc_label_difficult.py
new file mode 100644
index 0000000..cfffce3
--- /dev/null
+++ b/darknet-master/scripts/voc_label_difficult.py
@@ -0,0 +1,56 @@
+import xml.etree.ElementTree as ET
+import pickle
+import os
+from os import listdir, getcwd
+from os.path import join
+
+# (year, image_set) pairs to process; each one selects the list file
+# VOCdevkit/VOC<year>/ImageSets/Main/<image_set>.txt read by the driver loop.
+sets=[('2012', 'val'),('2007', 'test')]
+
+# Object class names; a name's position in this list is the class id written
+# to the label files.
+classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+
+
+def convert(size, box):
+    dw = 1./(size[0])
+    dh = 1./(size[1])
+    x = (box[0] + box[1])/2.0 - 1
+    y = (box[2] + box[3])/2.0 - 1
+    w = box[1] - box[0]
+    h = box[3] - box[2]
+    x = x*dw
+    w = w*dw
+    y = y*dh
+    h = h*dh
+    return (x,y,w,h)
+
+def convert_annotation(year, image_id):
+    in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id))
+    out_file = open('VOCdevkit/VOC%s/labels/difficult_%s.txt'%(year, image_id), 'w')
+    tree=ET.parse(in_file)
+    root = tree.getroot()
+    size = root.find('size')
+    w = int(size.find('width').text)
+    h = int(size.find('height').text)
+
+    for obj in root.iter('object'):
+        difficult = obj.find('difficult').text
+        cls = obj.find('name').text
+        if cls not in classes or int(difficult) == 0:
+            continue
+        cls_id = classes.index(cls)
+        xmlbox = obj.find('bndbox')
+        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
+        bb = convert((w,h), b)
+        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
+
+wd = getcwd()
+
+for year, image_set in sets:
+    if not os.path.exists('VOCdevkit/VOC%s/labels/'%(year)):
+        os.makedirs('VOCdevkit/VOC%s/labels/'%(year))
+    image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split()
+    list_file = open('difficult_%s_%s.txt'%(year, image_set), 'w')
+    for image_id in image_ids:
+        list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/difficult_%s.jpg\n'%(wd, year, image_id))
+        convert_annotation(year, image_id)
+    list_file.close()
+
diff --git a/darknet-master/scripts/windows/otb_get_labels.sh b/darknet-master/scripts/windows/otb_get_labels.sh
new file mode 100644
index 0000000..43e53e4
--- /dev/null
+++ b/darknet-master/scripts/windows/otb_get_labels.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+ 
+dataset=$1
+w=$2
+h=$3
+
+# Parameters: Human3, CarScale, Human6, Biker
+#w=480
+#h=640
+IFS=','
+  
+export LC_NUMERIC="en_US.UTF-8"
+
+wd=`pwd`
+dataset_path="data/$dataset"
+
+class_id=0
+num=1
+
+mkdir data
+wget http://cvlab.hanyang.ac.kr/tracker_benchmark/seq/$dataset.zip
+
+unzip -o $dataset.zip -d data
+
+sed -i.bak 's/\o11/,/g' $dataset_path/groundtruth_rect.txt
+sed -i.bak 's/\o11/,/g' $dataset_path/groundtruth_rect.txt
+dos2unix $dataset_path/groundtruth_rect.txt
+
+while read -r left right width height; do
+filename=$(printf "$dataset_path/img/%04d.txt" $num)
+#rm $filename.txt
+echo "$class_id " > $filename
+printf "%.5f " "$(($((left + width/2))  * 100000 / $w))e-5"   >> $filename
+printf "%.5f " "$(($((right + height/2))  * 100000 / $h))e-5"   >> $filename
+printf "%.5f " "$(($((width))  * 100000 / $w))e-5"   >> $filename
+printf "%.5f " "$(($((height))  * 100000 / $h))e-5"   >> $filename
+num=$((num + 1))
+done < $dataset_path/groundtruth_rect.txt
+
+echo "$dataset" > $dataset_path/otb.names
+ 
+
+find $dataset_path/img -name \*.jpg > data/$dataset/train.txt
+
+echo "classes = 1" > data/otb_$dataset.data
+echo "train = data/$dataset/train.txt" >> data/otb_$dataset.data
+echo "valid = data/$dataset/train.txt" >> data/otb_$dataset.data
+echo "names = $dataset_path/otb.names" >> data/otb_$dataset.data
+echo "backup = backup/" >> data/otb_$dataset.data
+echo "results= results/" >> data/otb_$dataset.data
\ No newline at end of file
diff --git a/darknet-master/scripts/windows/win_cifar.cmd b/darknet-master/scripts/windows/win_cifar.cmd
new file mode 100644
index 0000000..a53f95a
--- /dev/null
+++ b/darknet-master/scripts/windows/win_cifar.cmd
@@ -0,0 +1,19 @@
+rem Downloads and unpacks the CIFAR archive with the Cygwin tools, then writes
+rem list files of the train and test PNG images.
+rem All paths are the current directory with backslashes turned into forward slashes.
+echo Run install_cygwin.cmd before:
+
+
+c:\cygwin64\bin\wget https://pjreddie.com/media/files/cifar.tgz
+
+rem gzip -d turns cifar.tgz into cifar.tar, which tar then extracts
+c:\cygwin64\bin\gzip -d "%CD:\=/%/cifar.tgz"
+
+c:\cygwin64\bin\tar --force-local -xvf "%CD:\=/%/cifar.tar"
+
+rem print the labels file
+c:\cygwin64\bin\cat "%CD:\=/%/labels.txt"
+
+
+rem one list file per split, holding every PNG found under that folder
+c:\cygwin64\bin\find "%CD:\=/%/cifar/train" -name \*.png > "%CD:\=/%/cifar/train.list"
+
+c:\cygwin64\bin\find "%CD:\=/%/cifar/test" -name \*.png > "%CD:\=/%/cifar/test.list"
+
+
+
+pause
\ No newline at end of file
diff --git a/darknet-master/scripts/windows/win_get_imagenet_train_48hours.cmd b/darknet-master/scripts/windows/win_get_imagenet_train_48hours.cmd
new file mode 100644
index 0000000..2c56060
--- /dev/null
+++ b/darknet-master/scripts/windows/win_get_imagenet_train_48hours.cmd
@@ -0,0 +1,25 @@
+rem Downloads the ILSVRC2012 training archive, unpacks it with the Cygwin tools
+rem and writes imagenet1k.train.list with every extracted JPEG.
+echo Run install_cygwin.cmd before:
+
+rem http://www.image-net.org/challenges/LSVRC/2012/nonpub-downloads
+rem https://github.com/amd/OpenCL-caffe/wiki/Instructions-to-create-ImageNet-2012-data
+
+
+rem show the working directory as seen from the Cygwin shell
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; echo $PWD"
+
+echo Wait several hours...
+
+c:\cygwin64\bin\wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
+
+c:\cygwin64\bin\mkdir -p "%CD:\=/%/ILSVRC2012_img_train"
+
+rem extract the outer archive into the folder created above
+c:\cygwin64\bin\tar --force-local -xf "%CD:\=/%/ILSVRC2012_img_train.tar" -C "%CD:\=/%/ILSVRC2012_img_train"
+
+
+
+rem windows_imagenet_train.sh is run from the current directory inside the Cygwin shell
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/windows_imagenet_train.sh"
+
+rem list every extracted JPEG
+c:\cygwin64\bin\find "%CD:\=/%/ILSVRC2012_img_train" -name \*.JPEG > imagenet1k.train.list
+
+
+
+pause
\ No newline at end of file
diff --git a/darknet-master/scripts/windows/win_get_imagenet_valid.cmd b/darknet-master/scripts/windows/win_get_imagenet_valid.cmd
new file mode 100644
index 0000000..1eefdf0
--- /dev/null
+++ b/darknet-master/scripts/windows/win_get_imagenet_valid.cmd
@@ -0,0 +1,36 @@
+rem Downloads the ILSVRC2012 validation annotations and images, runs
+rem windows_imagenet_label.sh on them and writes inet.val.list with the
+rem JPEGs found under the labelled folder.
+echo Run install_cygwin.cmd before:
+
+rem http://www.image-net.org/challenges/LSVRC/2012/nonpub-downloads
+rem https://github.com/amd/OpenCL-caffe/wiki/Instructions-to-create-ImageNet-2012-data
+
+
+rem show the working directory as seen from the Cygwin shell
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; echo $PWD"
+
+
+rem annotation archive: gzip -d turns the tgz into a tar, which is then extracted
+c:\cygwin64\bin\wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_bbox_val_v3.tgz
+
+c:\cygwin64\bin\gzip -d "%CD:\=/%/ILSVRC2012_bbox_val_v3.tgz"
+
+c:\cygwin64\bin\tar --force-local -xvf "%CD:\=/%/ILSVRC2012_bbox_val_v3.tar"
+
+
+rem image archive, extracted into the imgs folder
+c:\cygwin64\bin\wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar
+
+c:\cygwin64\bin\mkdir -p "%CD:\=/%/imgs"
+
+c:\cygwin64\bin\tar --force-local -xf "%CD:\=/%/ILSVRC2012_img_val.tar" -C "%CD:\=/%/imgs"
+
+
+echo Wait a few hours...
+
+rem c:\cygwin64\bin\wget https://pjreddie.com/media/files/imagenet_label.sh
+
+rem convert the helper script to Unix line endings before bash runs it
+c:\cygwin64\bin\dos2unix "%CD:\=/%/windows_imagenet_label.sh"
+
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/windows_imagenet_label.sh"
+
+rem list every JPEG under the labelled folder
+c:\cygwin64\bin\find "%CD:\=/%/labelled" -name \*.JPEG > inet.val.list
+
+
+
+pause
\ No newline at end of file
diff --git a/darknet-master/scripts/windows/win_get_otb_datasets.cmd b/darknet-master/scripts/windows/win_get_otb_datasets.cmd
new file mode 100644
index 0000000..8c12249
--- /dev/null
+++ b/darknet-master/scripts/windows/win_get_otb_datasets.cmd
@@ -0,0 +1,18 @@
+rem Runs otb_get_labels.sh inside the Cygwin shell for four OTB sequences.
+rem The three arguments passed each time are: dataset name, image width, image height.
+echo Run install_cygwin.cmd before:
+
+rem http://cvlab.hanyang.ac.kr/tracker_benchmark/datasets.html
+
+rem show the working directory as seen from the Cygwin shell
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; echo $PWD"
+
+rem convert the helper script to Unix line endings before bash runs it
+c:\cygwin64\bin\dos2unix "%CD:\=/%/otb_get_labels.sh"
+
+
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/otb_get_labels.sh Suv 320 240"
+
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/otb_get_labels.sh Liquor 640 480"
+
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/otb_get_labels.sh Freeman4 360 240"
+
+c:\cygwin64\bin\bash -l -c "cd %CD:\=/%/; %CD:\=/%/otb_get_labels.sh Human3 480 640"
+
+pause
\ No newline at end of file
diff --git a/darknet-master/scripts/windows/windows_imagenet_label.sh b/darknet-master/scripts/windows/windows_imagenet_label.sh
new file mode 100644
index 0000000..a371875
--- /dev/null
+++ b/darknet-master/scripts/windows/windows_imagenet_label.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+mkdir -p labelled
+wd=`pwd`
+
+for f in val/*.xml;
+do
+label=`grep -m1 "<name>" $f | grep -oP '<name>\K[^<]*'`
+im=`echo $f | sed 's/val/imgs/; s/xml/JPEG/'`
+out=`echo $im | sed 's/JPEG/'${label}'.JPEG/; s/imgs/labelled/'`
+mv ${wd}/$im ${wd}/$out
+#ln -s ${wd}/$im ${wd}/$out
+done
+
+#find ${wd}/labelled -name \*.JPEG > inet.val.list
+
diff --git a/darknet-master/scripts/windows/windows_imagenet_train.sh b/darknet-master/scripts/windows/windows_imagenet_train.sh
new file mode 100644
index 0000000..11e6430
--- /dev/null
+++ b/darknet-master/scripts/windows/windows_imagenet_train.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+#wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
+#mkdir -p ILSVRC2012_img_train
+#tar --force-local -xf ILSVRC2012_img_train.tar -C ILSVRC2012_img_train
+
+wd=`pwd`
+
+for f in ILSVRC2012_img_train/*.tar;
+do
+name=$(echo "$f" | cut -f 1 -d '.')
+mkdir "${wd}/${name}"
+tar --force-local -xf "${wd}/${f}" -C "${wd}/${name}"
+done
+
+#find "${wd}/ILSVRC2012_img_train" -name \*.JPEG > imagenet1k.train.list
+
diff --git a/darknet-master/src/activation_kernels.cu b/darknet-master/src/activation_kernels.cu
new file mode 100644
index 0000000..d2dc771
--- /dev/null
+++ b/darknet-master/src/activation_kernels.cu
@@ -0,0 +1,745 @@
+#include "darknet.h"
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <float.h>
+
+#include "activations.h"
+#include "dark_cuda.h"
+
+// Piecewise-linear activation: identity on [0, 1], slope .001 outside that range
+// (continuous at both ends).
+__device__ float lhtan_activate_kernel(float x)
+{
+    if (x < 0) {
+        return .001*x;
+    }
+    if (x > 1) {
+        return .001*(x-1) + 1;
+    }
+    return x;
+}
+// Slope used for lhtan in the backward pass: 1 strictly inside (0, 1), .001 elsewhere.
+__device__ float lhtan_gradient_kernel(float x)
+{
+    const bool inside = (x > 0 && x < 1);
+    return inside ? 1 : .001;
+}
+
+// Hard tanh: returns -1 below -1, 1 above 1, and x itself otherwise.
+__device__ float hardtan_activate_kernel(float x)
+{
+    if (x < -1) {
+        return -1;
+    }
+    if (x > 1) {
+        return 1;
+    }
+    return x;
+}
+// Identity: returns x unchanged.
+__device__ float linear_activate_kernel(float x){return x;}
+// Logistic sigmoid 1 / (1 + e^-x).
+__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
+// Sigmoid rescaled to the range (-1, 1): 2 / (1 + e^-x) - 1.
+__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
+// ReLU written as a multiplication by the 0/1 result of (x > 0).
+__device__ float relu_activate_kernel(float x){return x*(x>0);}
+// ReLU6; presumably clamps x to [0, 6] - min_val_cmp/max_val_cmp are defined elsewhere, TODO confirm.
+__device__ float relu6_activate_kernel(float x) { return min_val_cmp(max_val_cmp(x, 0), 6); }
+// ELU: x for x >= 0, e^x - 1 for x < 0 (the two branches are selected by 0/1 multipliers).
+__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
+// SELU with scale 1.0507 and alpha 1.6732.
+__device__ float selu_activate_kernel(float x) { return (x >= 0)*1.0507f*x + (x < 0)*1.0507f*1.6732f*(expf(x) - 1); }
+// Leaky ReLU with negative slope .01.
+__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
+// ReLU plus a .1*x term: slope 1.1 for positive x, .1 otherwise.
+__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
+// Leaky ReLU with negative slope .1.
+__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
+// tanh expressed through the exponential: 2 / (1 + e^-2x) - 1.
+__device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);}
+// GELU, tanh-based approximation: 0.5*x*(1 + tanh(0.797885*x + 0.035677*x^3)).
+__device__ float gelu_activate_kernel(float x){return (0.5*x*(1 + tanhf(0.797885*x + 0.035677*powf(x, 3))));}
+// Softplus log(1 + e^x) with shortcuts outside [-threshold, threshold]:
+// x itself above the threshold, e^x below the negated threshold.
+__device__ float softplus_kernel(float x, float threshold = 20) {
+    if (x > threshold) {
+        return x;           // too large
+    }
+    if (x < -threshold) {
+        return expf(x);     // too small
+    }
+    return log1pf(expf(x));
+}
+// Piecewise-linear activation: .125*x + .5 on [-4, 4] (mapping onto [0, 1]),
+// continued with slope .01 on either side.
+__device__ float plse_activate_kernel(float x)
+{
+    if (x < -4) {
+        return .01f * (x + 4);
+    }
+    if (x > 4) {
+        return .01f * (x - 4) + 1;
+    }
+    return .125f*x + .5f;
+}
+// Staircase activation: flat at floor(x/2) while floor(x) is even, and rising
+// with slope 1 from that level while floor(x) is odd.
+__device__ float stair_activate_kernel(float x)
+{
+    const int whole = floorf(x);
+    const float step = floorf(x/2.f);
+    if (whole%2 == 0) {
+        return step;
+    }
+    return (x - whole) + step;
+}
+
+
+// Slope of hard tanh: 1 strictly inside (-1, 1), 0 elsewhere.
+__device__ float hardtan_gradient_kernel(float x)
+{
+    const bool inside = (x > -1 && x < 1);
+    return inside ? 1 : 0;
+}
+// Slope of the identity: always 1.
+__device__ float linear_gradient_kernel(float x){return 1;}
+// (1 - x) * x; this equals the sigmoid derivative when x is the sigmoid's output
+// (presumably the callers pass the activated value - TODO confirm).
+__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
+// Gradient for loggy: maps x from (-1, 1) back onto (0, 1) and returns
+// 2 * (1 - y) * y for that rescaled value y.
+__device__ float loggy_gradient_kernel(float x)
+{
+    const float rescaled = (x+1.F)/2.F;
+    return 2*(1-rescaled)*rescaled;
+}
+// 1 for x > 0, otherwise 0.
+__device__ float relu_gradient_kernel(float x){return (x>0);}
+// 1 strictly inside (0, 6), otherwise 0.
+__device__ float relu6_gradient_kernel(float x) { return (x > 0 && x < 6); }
+// 1 for x >= 0, x + 1 for x < 0 (matches the ELU derivative when x is the ELU output - TODO confirm callers pass the output).
+__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
+// 1.0507 for x >= 0, x + 1.0507*1.6732 for x < 0.
+__device__ float selu_gradient_kernel(float x) { return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f); }
+// 1 for x > 0, otherwise .01.
+__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
+// 1.1 for x > 0, otherwise .1.
+__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
+// 1 for x > 0, otherwise .1.
+__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
+// 1 - x^2; this equals the tanh derivative when x is the tanh output (TODO confirm callers pass the output).
+__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
+// Hyperbolic secant 2 / (e^x + e^-x); used by gelu_gradient_kernel.
+__device__ float sech_gpu(float x) { return 2 / (expf(x) + expf(-x)); }
+// Derivative of the tanh-based GELU approximation with respect to x:
+// 0.5*tanh(u) + (0.0535161*x^3 + 0.398942*x) * sech(u)^2 + 0.5,
+// where u = 0.0356774*x^3 + 0.797885*x.
+__device__ float gelu_gradient_kernel(float x) {
+    const float x3 = powf(x, 3);
+    return 0.5*tanhf(0.0356774*x3 + 0.797885*x) + (0.0535161*x3 + 0.398942*x) * powf(sech_gpu(0.0356774*x3 + 0.797885*x), 2) + 0.5;
+}
+// .125 while x lies in [0, 1], .01 outside; [0, 1] is the output range of the
+// middle segment of plse_activate_kernel (presumably x is the activated value - TODO confirm).
+__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
+// Gradient for the staircase activation: 0 when x is a whole number, 1 otherwise.
+__device__ float stair_gradient_kernel(float x)
+{
+    const bool is_whole = (floorf(x) == x);
+    return is_whole ? 0 : 1;
+}
+
+// Applies activation `a` to one value by dispatching to the matching
+// *_activate_kernel helper. Activation types without a case below yield 0.
+__device__ float activate_kernel(float x, ACTIVATION a)
+{
+    switch (a) {
+        case LINEAR:   return linear_activate_kernel(x);
+        case LOGISTIC: return logistic_activate_kernel(x);
+        case LOGGY:    return loggy_activate_kernel(x);
+        case RELU:     return relu_activate_kernel(x);
+        case RELU6:    return relu6_activate_kernel(x);
+        case ELU:      return elu_activate_kernel(x);
+        case SELU:     return selu_activate_kernel(x);
+        case GELU:     return gelu_activate_kernel(x);
+        case RELIE:    return relie_activate_kernel(x);
+        case RAMP:     return ramp_activate_kernel(x);
+        case LEAKY:    return leaky_activate_kernel(x);
+        case TANH:     return tanh_activate_kernel(x);
+        case PLSE:     return plse_activate_kernel(x);
+        case STAIR:    return stair_activate_kernel(x);
+        case HARDTAN:  return hardtan_activate_kernel(x);
+        case LHTAN:    return lhtan_activate_kernel(x);
+        default:       break;
+    }
+    return 0;
+}
+
+// Returns the gradient factor of activation `a` at x by dispatching to the
+// matching *_gradient_kernel helper. NORM_CHAN shares the ReLU gradient;
+// activation types without a case below yield 0.
+__device__ float gradient_kernel(float x, ACTIVATION a)
+{
+    switch (a) {
+        case LINEAR:    return linear_gradient_kernel(x);
+        case LOGISTIC:  return logistic_gradient_kernel(x);
+        case LOGGY:     return loggy_gradient_kernel(x);
+        case RELU:      return relu_gradient_kernel(x);
+        case RELU6:     return relu6_gradient_kernel(x);
+        case NORM_CHAN: return relu_gradient_kernel(x);
+        case ELU:       return elu_gradient_kernel(x);
+        case SELU:      return selu_gradient_kernel(x);
+        case GELU:      return gelu_gradient_kernel(x);
+        case RELIE:     return relie_gradient_kernel(x);
+        case RAMP:      return ramp_gradient_kernel(x);
+        case LEAKY:     return leaky_gradient_kernel(x);
+        case TANH:      return tanh_gradient_kernel(x);
+        case PLSE:      return plse_gradient_kernel(x);
+        case STAIR:     return stair_gradient_kernel(x);
+        case HARDTAN:   return hardtan_gradient_kernel(x);
+        case LHTAN:     return lhtan_gradient_kernel(x);
+        default:        break;
+    }
+    return 0;
+}
+
+// Backward pass of the pairwise-product activation (y = x1 * x2): for each of
+// the n outputs, writes x2*dy to the slot of x1 and x1*dy to the slot of x2.
+//   x  - forward input,   dy - gradient of the outputs (n values)
+//   s  - block size used to locate the two operands (x2 sits s/2 after x1)
+//   a  - unused,          dx - gradient written for the inputs
+__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    // The grid is rounded up to whole blocks, so surplus threads must leave
+    // before touching memory; previously x[] was read ahead of this check.
+    if (id >= n) return;
+
+    int i = id % s;
+    int b = id / s;
+    float x1 = x[b*s + i];
+    float x2 = x[b*s + s / 2 + i];
+    float de = dy[id];
+    dx[b*s + i] = x2*de;
+    dx[b*s + s / 2 + i] = x1*de;
+}
+
+// Host launcher for binary_gradient_array_kernel on the current CUDA stream.
+// Launches n / 2 work items; `dx` is forwarded as the kernel's dy argument and
+// `y` as the kernel's dx argument.
+extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y)
+{
+    const int half_n = n / 2;
+    binary_gradient_array_kernel <<<cuda_gridsize(half_n), BLOCK, 0, get_cuda_stream() >>>(x, dx, half_n, size, a, y);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// Pairwise-product activation: y[id] = x1 * x2 for each of the n outputs,
+// where x2 is located s/2 elements after x1. `a` is unused.
+__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    // The grid is rounded up to whole blocks, so surplus threads must leave
+    // before touching memory; previously x[] was read ahead of this check.
+    if (id >= n) return;
+
+    int i = id % s;
+    int b = id / s;
+    float x1 = x[b*s + i];
+    float x2 = x[b*s + s / 2 + i];
+    y[id] = x1*x2;
+}
+
+// Host launcher for binary_activate_array_kernel on the current CUDA stream;
+// n / 2 outputs are produced from the n inputs.
+extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y)
+{
+    const int half_n = n / 2;
+    binary_activate_array_kernel <<<cuda_gridsize(half_n), BLOCK, 0, get_cuda_stream() >>>(x, half_n, size, a, y);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Generic in-place activation: every thread with an index below n replaces
+// x[idx] by activate_kernel(x[idx], a).
+__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+    x[idx] = activate_kernel(x[idx], a);
+}
+
+
+
+// Swish forward pass: output_gpu[i] = x[i] * sigmoid(x[i]).
+// When output_sigmoid_gpu is non-null the sigmoid value is stored there too.
+__global__ void activate_array_swish_kernel(float *x, int n, float *output_sigmoid_gpu, float *output_gpu)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float in = x[idx];
+    const float sig = logistic_activate_kernel(in);
+    if (output_sigmoid_gpu) {
+        output_sigmoid_gpu[idx] = sig;
+    }
+    output_gpu[idx] = in * sig;
+}
+
+// Mish variant built from fused multiply-adds: x + x / (-0.5*e^2 - e - 1), with e = exp(x).
+__device__ float mish_njuffa(float x)
+{
+    const float e = expf(x);
+    const float inv = 1.0f / fmaf(fmaf(-0.5f, e, -1.0f), e, -1.0f);
+    return fmaf(inv, x, x);
+}
+
+// Mish variant using the fast intrinsics __expf/__fdividef, with three input ranges:
+//   x <= -18      : x * e
+//   -18 < x <= -5 : x * n / (n + 2)
+//   otherwise     : x - 2 * x / (n + 2)
+// where e = exp(x) and n = e*e + 2*e.
+__device__ float mish_yashas(float x)
+{
+    const float ex = __expf(x);
+    if (x <= -18.0f) {
+        return x * ex;
+    }
+
+    const float num = ex * ex + 2 * ex;
+    if (x <= -5.0f) {
+        return x * __fdividef(num, num + 2);
+    }
+    return x - 2 * __fdividef(x, num + 2);
+}
+
+// Mish variant using the fast intrinsics __expf/__fdividef, with a single
+// switch-over at x = -0.6:
+//   x <= -0.6 : x * n / (n + 2)
+//   otherwise : x - 2 * x / (n + 2)
+// where n = e*e + 2*e and e = exp(x).
+__device__ float mish_yashas2(float x)
+{
+    const float ex = __expf(x);
+    const float num = ex * ex + 2 * ex;
+    if (x <= -0.6f) {
+        return x * __fdividef(num, num + 2);
+    }
+    return x - 2 * __fdividef(x, num + 2);
+}
+
+// https://github.com/digantamisra98/Mish
+// Mish forward pass: output_gpu[i] = mish_yashas2(x[i]). When activation_input
+// is non-null the untouched input value is saved there first.
+// Reference implementations of the activation:
+// Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20
+// TF: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L40-L49
+__global__ void activate_array_mish_kernel(float *x, int n, float *activation_input, float *output_gpu)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float in = x[idx];
+    if (activation_input) {
+        activation_input[idx] = in;    // store value before activation
+    }
+    output_gpu[idx] = mish_yashas2(in);
+}
+
+// Hard Mish: x for x > 0, x*x/2 + x for -2 < x <= 0, and 0 otherwise.
+__device__ float hard_mish_yashas(float x)
+{
+    if (x > 0) {
+        return x;
+    }
+    if (x > -2) {
+        return x * x / 2 + x;
+    }
+    return 0;
+}
+
+// Hard Mish forward pass: output_gpu[i] = hard_mish_yashas(x[i]). When
+// activation_input is non-null the untouched input value is saved there first.
+__global__ void activate_array_hard_mish_kernel(float *x, int n, float *activation_input, float *output_gpu)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float in = x[idx];
+    if (activation_input) {
+        activation_input[idx] = in;    // store value before activation
+    }
+    output_gpu[idx] = hard_mish_yashas(in);
+}
+// Applies leaky_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_leaky_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = leaky_activate_kernel(x[tid]);
+}
+
+// Applies selu_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_selu_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = selu_activate_kernel(x[tid]);
+}
+
+// Applies gelu_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_gelu_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = gelu_activate_kernel(x[tid]);
+}
+
+// Applies logistic_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_logistic_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = logistic_activate_kernel(x[tid]);
+}
+
+// Applies tanh_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_tanh_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = tanh_activate_kernel(x[tid]);
+}
+
+// Applies hardtan_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_hardtan_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = hardtan_activate_kernel(x[tid]);
+}
+
+// Applies relu_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_relu_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = relu_activate_kernel(x[tid]);
+}
+
+// Applies relu6_activate_kernel in place to the first n elements of x,
+// one element per thread.
+__global__ void activate_array_relu6_kernel(float *x, int n)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    x[tid] = relu6_activate_kernel(x[tid]);
+}
+
+// Generic backward pass: every thread with an index below n multiplies
+// delta[idx] by gradient_kernel(x[idx], a).
+__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+    delta[idx] *= gradient_kernel(x[idx], a);
+}
+
+// https://github.com/BVLC/caffe/blob/04ab089db018a292ae48d51732dd6c66766b36b6/src/caffe/layers/swish_layer.cu#L28-L30
+// Swish backward pass: delta[i] *= x[i] + sigmoid_gpu[i] * (1 - x[i]).
+// The local name suggests x holds the swish output - TODO confirm against callers.
+__global__ void gradient_array_swish_kernel(float *x, int n, float *sigmoid_gpu, float *delta)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float swish_val = x[idx];
+    delta[idx] *= swish_val + sigmoid_gpu[idx] * (1 - swish_val);
+}
+
+// https://github.com/digantamisra98/Mish
+// Mish backward pass: multiplies delta[i] by the Mish derivative evaluated at
+// the value stored in activation_input_gpu[i].
+// Reference implementations of the derivative:
+// TensorFlow: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L66-L80
+// Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
+__global__ void gradient_array_mish_kernel(int n, float *activation_input_gpu, float *delta)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float MISH_THRESHOLD = 20.0f;
+
+    const float inp = activation_input_gpu[idx];
+    const float sp = softplus_kernel(inp, MISH_THRESHOLD);
+    const float grad_sp = -expm1f(-sp);             // same as 1 - exp(-sp)
+    const float tsp = tanh(sp);
+    const float grad_tsp = (1 - tsp*tsp) * grad_sp;
+    const float grad = inp * grad_tsp + tsp;
+    delta[idx] *= grad;
+}
+
+// Derivative of hard_mish_yashas: 1 for x > 0, x + 1 for -2 < x <= 0, and 0 otherwise.
+__device__ float hard_mish_yashas_grad(float x)
+{
+    if (x > 0) {
+        return 1;
+    }
+    if (x > -2) {
+        return x + 1;
+    }
+    return 0;
+}
+
+// Hard Mish backward pass: multiplies delta[i] by hard_mish_yashas_grad of the
+// value stored in activation_input_gpu[i].
+__global__ void gradient_array_hard_mish_kernel(int n, float *activation_input_gpu, float *delta)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    const float in = activation_input_gpu[idx];
+    delta[idx] *= hard_mish_yashas_grad(in);
+}
+
+// Multiplies each of the first n elements of delta by leaky_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_leaky_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= leaky_gradient_kernel(x[tid]);
+}
+
+// Divides (rather than multiplies) each of the first n elements of delta by
+// leaky_gradient_kernel(x[i]), one element per thread.
+__global__ void gradient_array_revleaky_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] /= leaky_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by selu_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_selu_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= selu_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by gelu_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_gelu_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= gelu_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by logistic_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_logistic_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= logistic_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by tanh_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_tanh_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= tanh_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by hardtan_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_hardtan_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= hardtan_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by relu_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_relu_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= relu_gradient_kernel(x[tid]);
+}
+
+// Multiplies each of the first n elements of delta by relu6_gradient_kernel(x[i]),
+// one element per thread.
+__global__ void gradient_array_relu6_kernel(float *x, int n, float *delta)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (tid >= n) return;   // thread lies past the end of the array
+    delta[tid] *= relu6_gradient_kernel(x[tid]);
+}
+
+// Applies activation `a` in place to the n elements of x on the current CUDA stream.
+// LINEAR returns immediately without launching anything. Activations that have a
+// dedicated kernel use it (REVLEAKY shares the leaky kernel); every other type
+// goes through the generic activate_array_kernel.
+extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a)
+{
+    const int num_blocks = get_number_of_blocks(n, BLOCK);
+    switch (a) {
+    case LINEAR:
+        return;
+    case LEAKY:
+    case REVLEAKY:
+        activate_array_leaky_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case LOGISTIC:
+        activate_array_logistic_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case TANH:
+        activate_array_tanh_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case HARDTAN:
+        activate_array_hardtan_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case RELU:
+        activate_array_relu_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case RELU6:
+        activate_array_relu6_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case SELU:
+        activate_array_selu_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    case GELU:
+        activate_array_gelu_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n);
+        break;
+    default:
+        activate_array_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(x, n, a);
+        break;
+    }
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Launches the swish forward kernel over n elements on the current CUDA stream.
+//   x                  - input values
+//   output_sigmoid_gpu - optional buffer receiving sigmoid(x[i]); may be null
+//   output_gpu         - receives x[i] * sigmoid(x[i])
+extern "C" void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, float *output_gpu)
+{
+    // The launch is sized with cuda_gridsize(n); the previously computed
+    // num_blocks local was never used and has been dropped.
+    activate_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, output_sigmoid_gpu, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Launches the Mish forward kernel over n elements on the current CUDA stream.
+//   x                    - input values
+//   activation_input_gpu - optional buffer receiving a copy of the inputs; may be null
+//   output_gpu           - receives the activated values
+extern "C" void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu)
+{
+    // The launch is sized with cuda_gridsize(n); the previously computed
+    // num_blocks local was never used and has been dropped.
+    activate_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Launches the hard Mish forward kernel over n elements on the current CUDA stream.
+//   x                    - input values
+//   activation_input_gpu - optional buffer receiving a copy of the inputs; may be null
+//   output_gpu           - receives the activated values
+extern "C" void activate_array_hard_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu)
+{
+    // The launch is sized with cuda_gridsize(n); the previously computed
+    // num_blocks local was never used and has been dropped.
+    activate_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Multiplies the n elements of delta by the gradient of activation `a` evaluated
+// at x, on the current CUDA stream. LINEAR returns immediately without launching
+// anything. NORM_CHAN and NORM_CHAN_SOFTMAX are rejected through error();
+// activations with a dedicated kernel use it, and every other type goes
+// through the generic gradient_array_kernel.
+extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
+{
+    const int num_blocks = get_number_of_blocks(n, BLOCK);
+    switch (a) {
+    case LINEAR:
+        return;
+    case LEAKY:
+        gradient_array_leaky_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case REVLEAKY:
+        gradient_array_revleaky_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case LOGISTIC:
+        gradient_array_logistic_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case TANH:
+        gradient_array_tanh_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case HARDTAN:
+        gradient_array_hardtan_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case RELU:
+        gradient_array_relu_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case RELU6:
+        gradient_array_relu6_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (x, n, delta);
+        break;
+    case NORM_CHAN_SOFTMAX:
+    case NORM_CHAN:
+        // These types need their own gradient routine; no kernel is launched here.
+        error("Error: should be used custom NORM_CHAN_SOFTMAX-function for gradient", DARKNET_LOC);
+        break;
+    case SELU:
+        gradient_array_selu_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n, delta);
+        break;
+    case GELU:
+        gradient_array_gelu_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(x, n, delta);
+        break;
+    default:
+        gradient_array_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, a, delta);
+        break;
+    }
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Launches the swish backward kernel over n elements on the current CUDA stream;
+// delta is scaled in place using x and the stored sigmoid values in sigmoid_gpu.
+extern "C" void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu, float *delta)
+{
+    // The launch is sized with cuda_gridsize(n); the previously computed
+    // num_blocks local was never used and has been dropped.
+    gradient_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, sigmoid_gpu, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+/*
+ * Launch gradient_array_mish_kernel over n elements on the stream returned by
+ * get_cuda_stream(), then check for a launch error.
+ *
+ * activation_input_gpu and delta are forwarded to the kernel unchanged.
+ * The grid comes from cuda_gridsize(n); the previously computed but never used
+ * `num_blocks` local has been removed.
+ */
+extern "C" void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta)
+{
+    gradient_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+/*
+ * Launch gradient_array_hard_mish_kernel over n elements on the stream
+ * returned by get_cuda_stream(), then check for a launch error.
+ *
+ * activation_input_gpu and delta are forwarded to the kernel unchanged.
+ * The grid comes from cuda_gridsize(n); the previously computed but never used
+ * `num_blocks` local has been removed.
+ */
+extern "C" void gradient_array_hard_mish_ongpu(int n, float *activation_input_gpu, float *delta)
+{
+    gradient_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+/*
+ * Per-position channel normalisation. Thread `tid` handles one position
+ * (tid % wh_step inside batch item tid / wh_step) and walks its `channels`
+ * values, which sit wh_step elements apart.
+ *
+ * Positive values are divided by (eps + sum of the positive values at that
+ * position); everything else is written as 0. `batch` is not read.
+ */
+__global__ void activate_array_normalize_channels_kernel(float *x, int size, int batch, int channels, int wh_step, float *output_gpu)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    const float eps = 0.0001;
+    /* Offset of channel 0 for this position. */
+    const int base = (tid % wh_step) + (tid / wh_step)*wh_step*channels;
+    int c;
+
+    /* Pass 1: accumulate the positive entries only. */
+    float sum = eps;
+    for (c = 0; c < channels; ++c) {
+        const float v = x[base + c * wh_step];
+        if (v > 0) sum += v;
+    }
+
+    /* Pass 2: scale positives, zero the rest. */
+    for (c = 0; c < channels; ++c) {
+        const int idx = base + c * wh_step;
+        const float v = x[idx];
+        output_gpu[idx] = (v > 0) ? (v / sum) : 0;
+    }
+}
+
+/*
+ * Host wrapper: launches activate_array_normalize_channels_kernel with one
+ * thread per position (n / channels of them) on get_cuda_stream().
+ *
+ * n is the total element count (w*h*c*batch), so n / channels = w*h*batch.
+ */
+extern "C" void activate_array_normalize_channels_ongpu(float *x, int n, int batch, int channels, int wh_step, float *output_gpu)
+{
+    const int positions = n / channels;
+    const int grid = get_number_of_blocks(positions, BLOCK);
+
+    activate_array_normalize_channels_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (x, positions, batch, channels, wh_step, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+/*
+ * Per-position softmax across channels. Thread `tid` handles one position
+ * (tid % wh_step inside batch item tid / wh_step); its channel values sit
+ * wh_step elements apart.
+ *
+ * When use_max_val is non-zero the per-position maximum is subtracted before
+ * exponentiation; otherwise 0 is subtracted. The denominator starts at eps,
+ * and any NaN / infinite result is replaced by 0. `batch` is not read.
+ */
+__global__ void activate_array_normalize_channels_softmax_kernel(float *x, int size, int batch, int channels, int wh_step, float *output_gpu, int use_max_val)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    const float eps = 0.0001;
+    /* Offset of channel 0 for this position. */
+    const int base = (tid % wh_step) + (tid / wh_step)*wh_step*channels;
+    int c;
+
+    /* Shift value: the channel maximum, or 0 when the shift is disabled. */
+    float max_val = 0;
+    if (use_max_val) {
+        for (c = 0; c < channels; ++c) {
+            const float v = x[base + c * wh_step];
+            if (c == 0 || v > max_val) max_val = v;
+        }
+    }
+
+    float sum = eps;
+    for (c = 0; c < channels; ++c) {
+        const float v = x[base + c * wh_step];
+        sum += expf(v - max_val);
+    }
+
+    for (c = 0; c < channels; ++c) {
+        const int idx = base + c * wh_step;
+        float v = x[idx];
+        v = expf(v - max_val) / sum;
+        if (isnan(v) || isinf(v)) v = 0;
+        output_gpu[idx] = v;
+    }
+}
+
+/*
+ * Host wrapper: launches activate_array_normalize_channels_softmax_kernel with
+ * one thread per position (n / channels of them) on get_cuda_stream().
+ *
+ * n is the total element count (w*h*c*batch), so n / channels = w*h*batch.
+ */
+extern "C" void activate_array_normalize_channels_softmax_ongpu(float *x, int n, int batch, int channels, int wh_step, float *output_gpu, int use_max_val)
+{
+    const int positions = n / channels;
+    const int grid = get_number_of_blocks(positions, BLOCK);
+
+    activate_array_normalize_channels_softmax_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (x, positions, batch, channels, wh_step, output_gpu, use_max_val);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+/*
+ * Backward pass for the channel softmax. Thread `tid` handles one position
+ * and, for each of its channels, scales delta_gpu in place by
+ * x * (1 - x), where x is the value stored at the same index.
+ * NaN / infinite products are replaced by 0. `batch` is not read.
+ */
+__global__ void gradient_array_normalize_channels_softmax_kernel(float *x, int size, int batch, int channels, int wh_step, float *delta_gpu)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    /* Offset of channel 0 for this position. */
+    const int base = (tid % wh_step) + (tid / wh_step)*wh_step*channels;
+    int c;
+
+    for (c = 0; c < channels; ++c) {
+        const int idx = base + c * wh_step;
+        const float grad = x[idx] * (1 - x[idx]);
+        float d = delta_gpu[idx];
+        d = d * grad;
+        if (isnan(d) || isinf(d)) d = 0;
+        delta_gpu[idx] = d;
+    }
+}
+
+/*
+ * Host wrapper: launches gradient_array_normalize_channels_softmax_kernel with
+ * one thread per position (n / channels of them) on get_cuda_stream().
+ *
+ * n is the total element count (w*h*c*batch), so n / channels = w*h*batch.
+ */
+extern "C" void gradient_array_normalize_channels_softmax_ongpu(float *output_gpu, int n, int batch, int channels, int wh_step, float *delta_gpu)
+{
+    const int positions = n / channels;
+    const int grid = get_number_of_blocks(positions, BLOCK);
+
+    gradient_array_normalize_channels_softmax_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (output_gpu, positions, batch, channels, wh_step, delta_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+/*
+ * Backward pass for the channel normalisation. Thread `tid` handles one
+ * position; for every channel whose x value is positive, delta_gpu is
+ * multiplied in place by that x value. Other entries of delta_gpu are left
+ * untouched. `batch` is not read.
+ */
+__global__ void gradient_array_normalize_channels_kernel(float *x, int size, int batch, int channels, int wh_step, float *delta_gpu)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    /* Offset of channel 0 for this position. */
+    const int base = (tid % wh_step) + (tid / wh_step)*wh_step*channels;
+    int c;
+
+    for (c = 0; c < channels; ++c) {
+        const int idx = base + c * wh_step;
+        if (!(x[idx] > 0)) continue;
+
+        const float grad = x[idx];
+        float d = delta_gpu[idx];
+        d = d * grad;
+        delta_gpu[idx] = d;
+    }
+}
+
+/*
+ * Host wrapper: launches gradient_array_normalize_channels_kernel with one
+ * thread per position (n / channels of them) on get_cuda_stream().
+ *
+ * n is the total element count (w*h*c*batch), so n / channels = w*h*batch.
+ */
+extern "C" void gradient_array_normalize_channels_ongpu(float *output_gpu, int n, int batch, int channels, int wh_step, float *delta_gpu)
+{
+    const int positions = n / channels;
+    const int grid = get_number_of_blocks(positions, BLOCK);
+
+    gradient_array_normalize_channels_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (output_gpu, positions, batch, channels, wh_step, delta_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/activation_layer.c b/darknet-master/src/activation_layer.c
new file mode 100644
index 0000000..4383d7e
--- /dev/null
+++ b/darknet-master/src/activation_layer.c
@@ -0,0 +1,63 @@
+#include "activation_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Build a stand-alone activation layer.
+ *
+ * The layer has as many outputs as inputs. Host output / delta buffers of
+ * batch * inputs floats are zero-allocated with xcalloc; with GPU enabled the
+ * matching device buffers are created with cuda_make_array and the GPU
+ * forward / backward callbacks are registered as well.
+ * Logs the input count to stderr and returns the layer by value.
+ */
+layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
+{
+    const int total = batch * inputs;
+    layer l = { (LAYER_TYPE)0 };
+
+    l.type = ACTIVE;
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+
+    l.output = (float*)xcalloc(total, sizeof(float));
+    l.delta = (float*)xcalloc(total, sizeof(float));
+
+    l.forward = forward_activation_layer;
+    l.backward = backward_activation_layer;
+#ifdef GPU
+    l.forward_gpu = forward_activation_layer_gpu;
+    l.backward_gpu = backward_activation_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, total);
+    l.delta_gpu = cuda_make_array(l.delta, total);
+#endif
+    l.activation = activation;
+
+    fprintf(stderr, "Activation Layer: %d inputs\n", inputs);
+    return l;
+}
+
+/*
+ * CPU forward pass: copy state.input into l.output, then apply l.activation
+ * to the copy in place (outputs * batch elements).
+ */
+void forward_activation_layer(layer l, network_state state)
+{
+    const int count = l.outputs*l.batch;
+
+    copy_cpu(count, state.input, 1, l.output, 1);
+    activate_array(l.output, count, l.activation);
+}
+
+/*
+ * CPU backward pass: scale l.delta by the activation gradient evaluated on
+ * l.output, then copy the result into state.delta (outputs * batch elements).
+ */
+void backward_activation_layer(layer l, network_state state)
+{
+    const int count = l.outputs*l.batch;
+
+    gradient_array(l.output, count, l.activation, l.delta);
+    copy_cpu(count, l.delta, 1, state.delta, 1);
+}
+
+#ifdef GPU
+
+/*
+ * GPU forward pass: copy state.input into l.output_gpu, then apply
+ * l.activation to the copy in place (outputs * batch elements).
+ */
+void forward_activation_layer_gpu(layer l, network_state state)
+{
+    const int count = l.outputs*l.batch;
+
+    copy_ongpu(count, state.input, 1, l.output_gpu, 1);
+    activate_array_ongpu(l.output_gpu, count, l.activation);
+}
+
+/*
+ * GPU backward pass: scale l.delta_gpu by the activation gradient evaluated
+ * on l.output_gpu, then copy the result into state.delta
+ * (outputs * batch elements).
+ */
+void backward_activation_layer_gpu(layer l, network_state state)
+{
+    const int count = l.outputs*l.batch;
+
+    gradient_array_ongpu(l.output_gpu, count, l.activation, l.delta_gpu);
+    copy_ongpu(count, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/darknet-master/src/activation_layer.h b/darknet-master/src/activation_layer.h
new file mode 100644
index 0000000..c766c6a
--- /dev/null
+++ b/darknet-master/src/activation_layer.h
@@ -0,0 +1,25 @@
+#ifndef ACTIVATION_LAYER_H
+#define ACTIVATION_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_activation_layer(int batch, int inputs, ACTIVATION activation);
+
+void forward_activation_layer(layer l, network_state state);
+void backward_activation_layer(layer l, network_state state);
+
+#ifdef GPU
+void forward_activation_layer_gpu(layer l, network_state state);
+void backward_activation_layer_gpu(layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/activations.c b/darknet-master/src/activations.c
new file mode 100644
index 0000000..9e8a49f
--- /dev/null
+++ b/darknet-master/src/activations.c
@@ -0,0 +1,418 @@
+#include "activations.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+
+/*
+ * Return the configuration-file name of activation `a`.
+ *
+ * The names mirror the strings accepted by get_activation(), so a value
+ * parsed from a name maps back to that name. Previously swish, mish,
+ * hard_mish, the normalize_channels variants, relu6 and revleaky were all
+ * reported as "relu".
+ *
+ * Any value without a case still yields "relu", matching the parser's
+ * fallback. The returned pointer refers to a string literal and must not be
+ * modified or freed.
+ */
+char *get_activation_string(ACTIVATION a)
+{
+    switch(a){
+        case LOGISTIC:
+            return "logistic";
+        case SWISH:
+            return "swish";
+        case MISH:
+            return "mish";
+        case HARD_MISH:
+            return "hard_mish";
+        case NORM_CHAN:
+            return "normalize_channels";
+        case NORM_CHAN_SOFTMAX:
+            return "normalize_channels_softmax";
+        case NORM_CHAN_SOFTMAX_MAXVAL:
+            return "normalize_channels_softmax_maxval";
+        case LOGGY:
+            return "loggy";
+        case RELU:
+            return "relu";
+        case RELU6:
+            return "relu6";
+        case ELU:
+            return "elu";
+        case SELU:
+            return "selu";
+        case GELU:
+            return "gelu";
+        case RELIE:
+            return "relie";
+        case RAMP:
+            return "ramp";
+        case LINEAR:
+            return "linear";
+        case TANH:
+            return "tanh";
+        case PLSE:
+            return "plse";
+        case REVLEAKY:
+            return "revleaky";
+        case LEAKY:
+            return "leaky";
+        case STAIR:
+            return "stair";
+        case HARDTAN:
+            return "hardtan";
+        case LHTAN:
+            return "lhtan";
+        default:
+            break;
+    }
+    return "relu";
+}
+
+/*
+ * Parse an activation name into its ACTIVATION value.
+ *
+ * The comparison is an exact, case-sensitive strcmp against the table below.
+ * An unknown name is reported on stderr and RELU is returned.
+ */
+ACTIVATION get_activation(char *s)
+{
+    static const struct {
+        const char *name;
+        ACTIVATION value;
+    } known[] = {
+        { "logistic", LOGISTIC },
+        { "swish", SWISH },
+        { "mish", MISH },
+        { "hard_mish", HARD_MISH },
+        { "normalize_channels", NORM_CHAN },
+        { "normalize_channels_softmax", NORM_CHAN_SOFTMAX },
+        { "normalize_channels_softmax_maxval", NORM_CHAN_SOFTMAX_MAXVAL },
+        { "loggy", LOGGY },
+        { "relu", RELU },
+        { "relu6", RELU6 },
+        { "elu", ELU },
+        { "selu", SELU },
+        { "gelu", GELU },
+        { "relie", RELIE },
+        { "plse", PLSE },
+        { "hardtan", HARDTAN },
+        { "lhtan", LHTAN },
+        { "linear", LINEAR },
+        { "ramp", RAMP },
+        { "revleaky", REVLEAKY },
+        { "leaky", LEAKY },
+        { "tanh", TANH },
+        { "stair", STAIR },
+    };
+    const size_t count = sizeof(known) / sizeof(known[0]);
+    size_t k;
+
+    for (k = 0; k < count; ++k) {
+        if (strcmp(s, known[k].name) == 0) return known[k].value;
+    }
+
+    fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
+    return RELU;
+}
+
+/*
+ * Apply the scalar activation `a` to x and return the result.
+ *
+ * RELU6 is now dispatched to relu6_activate(); it used to have no case here,
+ * so the CPU path produced 0 for every relu6 input even though gradient()
+ * handles RELU6. REVLEAKY shares the leaky implementation.
+ *
+ * Activations that have no case (for example SWISH, MISH and the
+ * normalize_channels variants, which use the dedicated activate_array_*
+ * functions) return 0.
+ */
+float activate(float x, ACTIVATION a)
+{
+    switch(a){
+        case LINEAR:
+            return linear_activate(x);
+        case LOGISTIC:
+            return logistic_activate(x);
+        case LOGGY:
+            return loggy_activate(x);
+        case RELU:
+            return relu_activate(x);
+        case RELU6:
+            return relu6_activate(x);
+        case ELU:
+            return elu_activate(x);
+        case SELU:
+            return selu_activate(x);
+        case GELU:
+            return gelu_activate(x);
+        case RELIE:
+            return relie_activate(x);
+        case RAMP:
+            return ramp_activate(x);
+        case REVLEAKY:
+        case LEAKY:
+            return leaky_activate(x);
+        case TANH:
+            return tanh_activate(x);
+        case PLSE:
+            return plse_activate(x);
+        case STAIR:
+            return stair_activate(x);
+        case HARDTAN:
+            return hardtan_activate(x);
+        case LHTAN:
+            return lhtan_activate(x);
+    }
+    return 0;
+}
+
+/*
+ * Apply activation `a` in place to the n elements of x.
+ *
+ * LINEAR is a no-op. LEAKY and LOGISTIC have dedicated OpenMP-parallel
+ * loops; every other activation goes through the serial activate() loop.
+ */
+void activate_array(float *x, const int n, const ACTIVATION a)
+{
+    int idx;
+
+    switch (a) {
+    case LINEAR:
+        break;
+    case LEAKY:
+        #pragma omp parallel for
+        for (idx = 0; idx < n; ++idx) {
+            x[idx] = leaky_activate(x[idx]);
+        }
+        break;
+    case LOGISTIC:
+        #pragma omp parallel for
+        for (idx = 0; idx < n; ++idx) {
+            x[idx] = logistic_activate(x[idx]);
+        }
+        break;
+    default:
+        for (idx = 0; idx < n; ++idx) {
+            x[idx] = activate(x[idx], a);
+        }
+        break;
+    }
+}
+
+/*
+ * Swish forward pass over n elements:
+ *   output_sigmoid[i] = logistic(x[i])
+ *   output[i]         = x[i] * output_sigmoid[i]
+ * The sigmoid is stored so the backward pass can reuse it.
+ */
+void activate_array_swish(float *x, const int n, float * output_sigmoid, float * output)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float in = x[idx];
+        const float sig = logistic_activate(in);
+        output_sigmoid[idx] = sig;
+        output[idx] = in * sig;
+    }
+}
+
+// https://github.com/digantamisra98/Mish
+/*
+ * Mish forward pass over n elements:
+ *   output[i] = x[i] * tanh(softplus(x[i]))
+ * The raw input is saved in activation_input[i] for the backward pass.
+ * softplus is evaluated with a threshold of 20.
+ */
+void activate_array_mish(float *x, const int n, float * activation_input, float * output)
+{
+    const float MISH_THRESHOLD = 20;
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float in = x[idx];
+        /* Keep the pre-activation value before output is written. */
+        activation_input[idx] = in;
+        output[idx] = in * tanh_activate( softplus_activate(in, MISH_THRESHOLD) );
+    }
+}
+
+/*
+ * Piecewise "hard mish":
+ *   x > 0        -> x
+ *   -2 < x <= 0  -> x*x/2 + x
+ *   otherwise    -> 0
+ */
+static float hard_mish_yashas(float x)
+{
+    return (x > 0) ? x
+         : (x > -2) ? (x * x / 2 + x)
+         : 0;
+}
+
+/*
+ * Hard-mish forward pass over n elements:
+ *   output[i] = hard_mish_yashas(x[i])
+ * The raw input is saved in activation_input[i] for the backward pass.
+ */
+void activate_array_hard_mish(float *x, const int n, float * activation_input, float * output)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float in = x[idx];
+        /* Keep the pre-activation value before output is written. */
+        activation_input[idx] = in;
+        output[idx] = hard_mish_yashas(in);
+    }
+}
+
+/*
+ * CPU channel normalisation.
+ *
+ * For each of the n / channels positions, the `channels` values (spaced
+ * wh_step apart) are normalised: positive values are divided by
+ * (eps + sum of the positive values), all others become 0.
+ * `batch` is not read; the batch item is derived as position / wh_step.
+ */
+void activate_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *output)
+{
+    const int positions = n / channels;
+
+    int pos;
+    #pragma omp parallel for
+    for (pos = 0; pos < positions; ++pos) {
+        const float eps = 0.0001;
+        /* Offset of channel 0 for this position. */
+        const int base = (pos % wh_step) + (pos / wh_step)*wh_step*channels;
+        float sum = eps;
+        int c;
+
+        for (c = 0; c < channels; ++c) {
+            const float v = x[base + c * wh_step];
+            if (v > 0) sum += v;
+        }
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            const float v = x[idx];
+            output[idx] = (v > 0) ? (v / sum) : 0;
+        }
+    }
+}
+
+/*
+ * CPU softmax across channels.
+ *
+ * For each of the n / channels positions, computes
+ *   output = exp(x - shift) / (eps + sum over channels of exp(x - shift))
+ * where shift is the per-position maximum when use_max_val is non-zero and
+ * 0 otherwise. `batch` is not read.
+ */
+void activate_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *output, int use_max_val)
+{
+    const int positions = n / channels;
+
+    int pos;
+    #pragma omp parallel for
+    for (pos = 0; pos < positions; ++pos) {
+        const float eps = 0.0001;
+        /* Offset of channel 0 for this position. */
+        const int base = (pos % wh_step) + (pos / wh_step)*wh_step*channels;
+        float max_val = 0;
+        float sum = eps;
+        int c;
+
+        if (use_max_val) {
+            for (c = 0; c < channels; ++c) {
+                const float v = x[base + c * wh_step];
+                if (c == 0 || v > max_val) max_val = v;
+            }
+        }
+
+        for (c = 0; c < channels; ++c) {
+            const float v = x[base + c * wh_step];
+            sum += expf(v - max_val);
+        }
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            const float v = x[idx];
+            output[idx] = expf(v - max_val) / sum;
+        }
+    }
+}
+
+/*
+ * CPU backward pass for the channel softmax.
+ *
+ * For each of the n / channels positions, grad = sum over channels of
+ * x * delta is computed first, then every delta of that position is
+ * multiplied by grad in place. `batch` is not read.
+ */
+void gradient_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *delta)
+{
+    const int positions = n / channels;
+
+    int pos;
+    #pragma omp parallel for
+    for (pos = 0; pos < positions; ++pos) {
+        /* Offset of channel 0 for this position. */
+        const int base = (pos % wh_step) + (pos / wh_step)*wh_step*channels;
+        float grad = 0;
+        int c;
+
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            grad += x[idx] * delta[idx];
+        }
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            delta[idx] = delta[idx] * grad;
+        }
+    }
+}
+
+/*
+ * CPU backward pass for the channel normalisation.
+ *
+ * For each of the n / channels positions, grad = sum over channels of
+ * x * delta is computed first; then delta is multiplied by grad in place,
+ * but only for channels whose x value is positive. `batch` is not read.
+ */
+void gradient_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *delta)
+{
+    const int positions = n / channels;
+
+    int pos;
+    #pragma omp parallel for
+    for (pos = 0; pos < positions; ++pos) {
+        /* Offset of channel 0 for this position. */
+        const int base = (pos % wh_step) + (pos / wh_step)*wh_step*channels;
+        float grad = 0;
+        int c;
+
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            grad += x[idx] * delta[idx];
+        }
+        for (c = 0; c < channels; ++c) {
+            const int idx = base + c * wh_step;
+            if (!(x[idx] > 0)) continue;
+            delta[idx] = delta[idx] * grad;
+        }
+    }
+}
+
+/*
+ * Return the derivative factor of activation `a` for the value x.
+ *
+ * The NORM_CHAN, NORM_CHAN_SOFTMAX and NORM_CHAN_SOFTMAX_MAXVAL activations
+ * have dedicated gradient functions, so reaching them here is reported via
+ * error(). An explicit `return 0` follows that call so that, should error()
+ * ever return, control no longer falls through into the ELU case.
+ *
+ * Activations without a case return 0. REVLEAKY shares the leaky gradient.
+ */
+float gradient(float x, ACTIVATION a)
+{
+    switch(a){
+        case LINEAR:
+            return linear_gradient(x);
+        case LOGISTIC:
+            return logistic_gradient(x);
+        case LOGGY:
+            return loggy_gradient(x);
+        case RELU:
+            return relu_gradient(x);
+        case RELU6:
+            return relu6_gradient(x);
+        case NORM_CHAN:
+        case NORM_CHAN_SOFTMAX_MAXVAL:
+        case NORM_CHAN_SOFTMAX:
+            error("Error: should be used custom NORM_CHAN or NORM_CHAN_SOFTMAX-function for gradient", DARKNET_LOC);
+            return 0;   /* only reached if error() returns */
+        case ELU:
+            return elu_gradient(x);
+        case SELU:
+            return selu_gradient(x);
+        case GELU:
+            return gelu_gradient(x);
+        case RELIE:
+            return relie_gradient(x);
+        case RAMP:
+            return ramp_gradient(x);
+        case REVLEAKY:
+        case LEAKY:
+            return leaky_gradient(x);
+        case TANH:
+            return tanh_gradient(x);
+        case PLSE:
+            return plse_gradient(x);
+        case STAIR:
+            return stair_gradient(x);
+        case HARDTAN:
+            return hardtan_gradient(x);
+        case LHTAN:
+            return lhtan_gradient(x);
+    }
+    return 0;
+}
+
+/*
+ * Multiply each delta[i] in place by gradient(x[i], a), for i in [0, n).
+ * The loop is OpenMP-parallel.
+ */
+void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float factor = gradient(x[idx], a);
+        delta[idx] *= factor;
+    }
+}
+
+// https://github.com/BVLC/caffe/blob/04ab089db018a292ae48d51732dd6c66766b36b6/src/caffe/layers/swish_layer.cpp#L54-L56
+/*
+ * Swish backward pass over n elements:
+ *   delta[i] *= x[i] + sigmoid[i] * (1 - x[i])
+ * The local name `swish` in the original suggests x holds the swish output;
+ * NOTE(review): confirm against the caller.
+ */
+void gradient_array_swish(const float *x, const int n, const float * sigmoid, float * delta)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float y = x[idx];
+        delta[idx] *= y + sigmoid[idx]*(1 - y);
+    }
+}
+
+// https://github.com/digantamisra98/Mish
+/*
+ * Mish backward pass over n elements.
+ *
+ * With sp = softplus(input) and tsp = tanh(sp):
+ *   grad     = input * (1 - tsp^2) * (1 - exp(-sp)) + tsp
+ *   delta[i] *= grad
+ * activation_input holds the values saved by the forward pass.
+ * softplus is evaluated with a threshold of 20.
+ *
+ * References for the formulation:
+ *   TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed
+ *   PyTorch:    https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
+ */
+void gradient_array_mish(const int n, const float * activation_input, float * delta)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float MISH_THRESHOLD = 20.0f;
+
+        const float pre = activation_input[idx];
+        const float sp = softplus_activate(pre, MISH_THRESHOLD);
+        const float tsp = tanh(sp);
+        const float grad_sp = 1 - exp(-sp);
+        const float grad_tsp = (1 - tsp*tsp) * grad_sp;
+        const float grad = pre * grad_tsp + tsp;
+
+        delta[idx] *= grad;
+    }
+}
+
+/*
+ * Derivative of the piecewise "hard mish":
+ *   x > 0        -> 1
+ *   -2 < x <= 0  -> x + 1
+ *   otherwise    -> 0
+ */
+static float hard_mish_yashas_grad(float x)
+{
+    return (x > 0) ? 1
+         : (x > -2) ? (x + 1)
+         : 0;
+}
+
+/*
+ * Hard-mish backward pass over n elements:
+ *   delta[i] *= hard_mish_yashas_grad(activation_input[i])
+ */
+void gradient_array_hard_mish(const int n, const float * activation_input, float * delta)
+{
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < n; ++idx) {
+        const float pre = activation_input[idx];
+        delta[idx] *= hard_mish_yashas_grad(pre);
+    }
+}
diff --git a/darknet-master/src/activations.h b/darknet-master/src/activations.h
new file mode 100644
index 0000000..95c2c2c
--- /dev/null
+++ b/darknet-master/src/activations.h
@@ -0,0 +1,134 @@
+#ifndef ACTIVATIONS_H
+#define ACTIVATIONS_H
+#include "darknet.h"
+#include "dark_cuda.h"
+#include "math.h"
+#include "utils.h"
+
+//typedef enum{
+//    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH, MISH
+//}ACTIVATION;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ACTIVATION get_activation(char *s);
+
+char *get_activation_string(ACTIVATION a);
+float activate(float x, ACTIVATION a);
+float gradient(float x, ACTIVATION a);
+void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
+void gradient_array_swish(const float *x, const int n, const float * sigmoid, float * delta);
+void gradient_array_mish(const int n, const float * activation_input, float * delta);
+void gradient_array_hard_mish(const int n, const float * activation_input, float * delta);
+void activate_array(float *x, const int n, const ACTIVATION a);
+void activate_array_swish(float *x, const int n, float * output_sigmoid, float * output);
+void activate_array_mish(float *x, const int n, float * activation_input, float * output);
+void activate_array_hard_mish(float *x, const int n, float * activation_input, float * output);
+void activate_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *output);
+void gradient_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *delta);
+void activate_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *output, int use_max_val);
+void gradient_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *delta);
+#ifdef GPU
+void activate_array_ongpu(float *x, int n, ACTIVATION a);
+void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, float *output_gpu);
+void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu);
+void activate_array_hard_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu);
+void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
+void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu, float *delta);
+void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta);
+void gradient_array_hard_mish_ongpu(int n, float *activation_input_gpu, float *delta);
+void activate_array_normalize_channels_ongpu(float *x, int n, int batch, int channels, int wh_step, float *output_gpu);
+void gradient_array_normalize_channels_ongpu(float *output_gpu, int n, int batch, int channels, int wh_step, float *delta_gpu);
+void activate_array_normalize_channels_softmax_ongpu(float *x, int n, int batch, int channels, int wh_step, float *output_gpu, int use_max_val);
+void gradient_array_normalize_channels_softmax_ongpu(float *output_gpu, int n, int batch, int channels, int wh_step, float *delta_gpu);
+
+#endif
+
+/*
+ * Staircase activation. With n = floor(x):
+ *   n even -> floor(x/2)
+ *   n odd  -> (x - n) + floor(x/2)
+ */
+static inline float stair_activate(float x)
+{
+    const int whole = floorf(x);
+    const float half_floor = floorf(x/2.f);
+
+    if (whole % 2 != 0) return (x - whole) + half_floor;
+    return half_floor;
+}
+/* Hard tanh: clamp x to the interval [-1, 1]. */
+static inline float hardtan_activate(float x)
+{
+    return (x < -1) ? -1
+         : (x > 1) ? 1
+         : x;
+}
+/* Scalar forward activations. Each takes the pre-activation x and returns the activated value. */
+
+/* Identity. */
+static inline float linear_activate(float x){return x;}
+/* Logistic sigmoid: 1 / (1 + e^-x). */
+static inline float logistic_activate(float x){return 1.f/(1.f + expf(-x));}
+/* Sigmoid rescaled to (-1, 1): 2 / (1 + e^-x) - 1. */
+static inline float loggy_activate(float x){return 2.f/(1.f + expf(-x)) - 1;}
+/* ReLU: x when x > 0, otherwise 0 (x multiplied by the 0/1 comparison result). */
+static inline float relu_activate(float x){return x*(x>0);}
+/* ReLU6 via max_val_cmp / min_val_cmp (declared elsewhere); presumably clamps x to [0, 6] - confirm against those helpers. */
+static inline float relu6_activate(float x) { return min_val_cmp(max_val_cmp(x, 0), 6); }
+/* ELU: x for x >= 0, e^x - 1 for x < 0. */
+static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
+/* SELU: 1.0507 * x for x >= 0, 1.0507 * 1.6732 * (e^x - 1) for x < 0. */
+static inline float selu_activate(float x) { return (x >= 0)*1.0507f*x + (x < 0)*1.0507f*1.6732f*(expf(x) - 1); }
+/* Leaky ReLU with negative slope 0.01. */
+static inline float relie_activate(float x){return (x>0) ? x : .01f*x;}
+/* Ramp: ReLU(x) + 0.1 * x. */
+static inline float ramp_activate(float x){return x*(x>0)+.1f*x;}
+/* Leaky ReLU with negative slope 0.1. */
+static inline float leaky_activate(float x){return (x>0) ? x : .1f*x;}
+/* Alternative tanh form kept for reference: */
+//static inline float tanh_activate(float x){return (expf(2*x)-1)/(expf(2*x)+1);}
+/* tanh written as 2 / (1 + e^(-2x)) - 1. */
+static inline float tanh_activate(float x) { return (2 / (1 + expf(-2 * x)) - 1); }
+/* GELU, tanh approximation: 0.5 * x * (1 + tanh(0.797885 * x + 0.035677 * x^3)). */
+static inline float gelu_activate(float x) { return (0.5*x*(1 + tanhf(0.797885*x + 0.035677*powf(x, 3)))); }
+/*
+ * Softplus log(1 + e^x) with shortcuts outside [-threshold, threshold]:
+ *   x >  threshold -> x      (too large)
+ *   x < -threshold -> e^x    (too small)
+ */
+static inline float softplus_activate(float x, float threshold) {
+    const int too_large = (x > threshold);
+    const int too_small = (x < -threshold);
+
+    if (too_large) return x;
+    return too_small ? expf(x) : logf(expf(x) + 1);
+}
+/*
+ * Piecewise-linear sigmoid-like activation:
+ *   x < -4     -> 0.01 * (x + 4)
+ *   x >  4     -> 0.01 * (x - 4) + 1
+ *   otherwise  -> 0.125 * x + 0.5
+ */
+static inline float plse_activate(float x)
+{
+    if (x > 4) {
+        return .01f * (x - 4) + 1;
+    } else if (x < -4) {
+        return .01f * (x + 4);
+    }
+    return .125f*x + .5f;
+}
+
+/*
+ * Leaky hard-tanh on [0, 1]:
+ *   x < 0      -> 0.001 * x
+ *   x > 1      -> 0.001 * (x - 1) + 1
+ *   otherwise  -> x
+ */
+static inline float lhtan_activate(float x)
+{
+    if (x > 1) {
+        return .001f*(x-1) + 1;
+    } else if (x < 0) {
+        return .001f*x;
+    }
+    return x;
+}
+/* Leaky hard-tanh gradient: 1 strictly inside (0, 1), 0.001 elsewhere. */
+static inline float lhtan_gradient(float x)
+{
+    const int inside = (x > 0 && x < 1);
+    return inside ? 1 : .001f;
+}
+
+/* Hard-tanh gradient: 1 strictly inside (-1, 1), 0 elsewhere. */
+static inline float hardtan_gradient(float x)
+{
+    const int inside = (x > -1 && x < 1);
+    return inside ? 1 : 0;
+}
+/* Linear gradient: always 1. */
+static inline float linear_gradient(float x){return 1;}
+/* Logistic gradient written as (1 - x) * x; presumably x is the sigmoid output rather than its input - confirm against callers. */
+static inline float logistic_gradient(float x){return (1-x)*x;}
+/* Loggy gradient: maps x to y = (x + 1) / 2, then returns 2 * (1 - y) * y. */
+static inline float loggy_gradient(float x)
+{
+    float y = (x+1.f)/2.f;
+    return 2*(1-y)*y;
+}
+/* Staircase gradient: 0 when x is an exact integer, 1 otherwise. */
+static inline float stair_gradient(float x)
+{
+    return (floor(x) == x) ? 0 : 1.0f;
+}
+/* Scalar gradients. Several are written in terms of the activation's output value; NOTE(review): confirm what callers pass as x. */
+
+/* ReLU gradient: 1 for x > 0, otherwise 0. */
+static inline float relu_gradient(float x){return (x>0);}
+/* ReLU6 gradient: 1 strictly inside (0, 6), otherwise 0. */
+static inline float relu6_gradient(float x) { return (x > 0 && x < 6); }
+/* ELU gradient: 1 for x >= 0, x + 1 for x < 0. */
+static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
+/* SELU gradient: 1.0507 for x >= 0, x + 1.0507 * 1.6732 for x < 0. */
+static inline float selu_gradient(float x) { return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f); }
+/* Gradient of the 0.01-slope leaky ReLU: 1 for x > 0, otherwise 0.01. */
+static inline float relie_gradient(float x){return (x>0) ? 1 : .01f;}
+/* Ramp gradient: (x > 0) + 0.1, i.e. 1.1 for positive x and 0.1 otherwise. */
+static inline float ramp_gradient(float x){return (x>0)+.1f;}
+/* Gradient of the 0.1-slope leaky ReLU: 1 for x > 0, otherwise 0.1. */
+static inline float leaky_gradient(float x){return (x>0) ? 1 : .1f;}
+/* tanh gradient written as 1 - x^2. */
+static inline float tanh_gradient(float x){return 1-x*x;}
+
+/* Hyperbolic secant: 2 / (e^x + e^-x). */
+static inline float sech(float x) { return 2 / (expf(x) + expf(-x)); }
+/*
+ * GELU gradient evaluated at x, for the tanh approximation.
+ * With u = 0.0356774 * x^3 + 0.797885 * x:
+ *   0.5 * tanh(u) + (0.0535161 * x^3 + 0.398942 * x) * sech(u)^2 + 0.5
+ */
+static inline float gelu_gradient(float x) {
+    const float x3 = powf(x, 3);
+    return 0.5*tanhf(0.0356774*x3 + 0.797885*x) + (0.0535161*x3 + 0.398942*x) * powf(sech(0.0356774*x3 + 0.797885*x), 2) + 0.5;
+}
+/* PLSE gradient: 0.01 when x is outside [0, 1], 0.125 inside. */
+static inline float plse_gradient(float x){return (x < 0 || x > 1) ? .01f : .125f;}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/art.c b/darknet-master/src/art.c
new file mode 100644
index 0000000..748e190
--- /dev/null
+++ b/darknet-master/src/art.c
@@ -0,0 +1,74 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+#include "option_list.h"
+#include "blas.h"
+#include "classifier.h"
+#ifdef WIN32
+#include <time.h>
+#include "gettimeofday.h"
+#else
+#include <sys/time.h>
+#endif
+
+
+/*
+ * Webcam demo: runs the network on every captured frame and prints an
+ * "appreciation" score with a 30-character text bar.
+ *
+ * cfgfile    - network configuration passed to parse_network_cfg().
+ * weightfile - optional weights file; skipped when NULL.
+ * cam_index  - webcam index passed to get_capture_webcam().
+ *
+ * The score is the largest prediction among output indices 37, 401 and 434
+ * (floored at 0). The loop never terminates on its own. Without OPENCV the
+ * function body is empty.
+ *
+ * Changes from the previous version: the no-op self-assignment
+ * `score = score;` was removed and the loop-invariant bar width `upper`
+ * is declared once before the loop.
+ */
+void demo_art(char *cfgfile, char *weightfile, int cam_index)
+{
+#ifdef OPENCV
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+
+    srand(2222222);
+    cap_cv * cap;
+
+    cap = get_capture_webcam(cam_index);
+
+    char *window = "ArtJudgementBot9000!!!";
+    if(!cap) error("Couldn't connect to webcam.", DARKNET_LOC);
+    create_window_cv(window, 0, 512, 512);
+    int i;
+    int idx[] = {37, 401, 434};
+    int n = sizeof(idx)/sizeof(idx[0]);
+    const int upper = 30;   /* width of the printed score bar */
+
+    while(1){
+        image in = get_image_from_stream_cpp(cap);
+        image in_s = resize_image(in, net.w, net.h);
+        show_image(in, window);
+
+        float *p = network_predict(net, in_s.data);
+
+        /* ANSI escape: move the cursor home and clear the screen. */
+        printf("\033[H\033[J");
+
+        /* Best prediction among the selected output indices. */
+        float score = 0;
+        for(i = 0; i < n; ++i){
+            float s = p[idx[i]];
+            if (s > score) score = s;
+        }
+        printf("I APPRECIATE THIS ARTWORK: %10.7f%%\n", score*100);
+        printf("[");
+        for(i = 0; i < upper; ++i){
+            printf("%c", ((i+.5) < score*upper) ? 219 : ' ');
+        }
+        printf("]\n");
+
+        free_image(in_s);
+        free_image(in);
+
+        wait_key_cv(1);
+    }
+#endif
+}
+
+
+/*
+ * Command-line entry point for the art demo.
+ *
+ * Expected arguments: argv[2] = cfg file, argv[3] = weights file (optional),
+ * plus an optional "-c <cam_index>" read through find_int_arg() (default 0).
+ *
+ * The previous version read argv[2] and argv[3] unconditionally, which reads
+ * past the argument vector or hands a NULL cfg to demo_art() when too few
+ * arguments are given. Now a missing cfg prints a usage line to stderr and
+ * returns, and argv[3] is only read when it exists.
+ */
+void run_art(int argc, char **argv)
+{
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+
+    /* argv[2] is also NULL-checked in case the slot is present but empty. */
+    if (argc < 3 || !argv[2]) {
+        const char *prog = (argc > 0 && argv[0]) ? argv[0] : "darknet";
+        fprintf(stderr, "usage: %s art <cfg> [weights] [-c <cam_index>]\n", prog);
+        return;
+    }
+
+    char *cfg = argv[2];
+    char *weights = (argc > 3) ? argv[3] : 0;
+    demo_art(cfg, weights, cam_index);
+}
diff --git a/darknet-master/src/avgpool_layer.c b/darknet-master/src/avgpool_layer.c
new file mode 100644
index 0000000..2b595aa
--- /dev/null
+++ b/darknet-master/src/avgpool_layer.c
@@ -0,0 +1,71 @@
+#include "avgpool_layer.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include <stdio.h>
+
+/*
+ * Build a global average-pooling layer for a w x h x c input.
+ *
+ * The output is 1 x 1 x c per batch item. Host output / delta buffers of
+ * c * batch floats are zero-allocated with xcalloc; with GPU enabled the
+ * matching device buffers are created with cuda_make_array and the GPU
+ * callbacks are registered. A one-line shape summary is logged to stderr.
+ */
+avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
+{
+    fprintf(stderr, "avg                          %4d x%4d x%4d ->   %4d\n",  w, h, c, c);
+
+    avgpool_layer l = { (LAYER_TYPE)0 };
+    l.type = AVGPOOL;
+    l.batch = batch;
+
+    /* Input geometry. */
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.inputs = h*w*c;
+
+    /* Output geometry: one value per channel. */
+    l.out_w = 1;
+    l.out_h = 1;
+    l.out_c = c;
+    l.outputs = l.out_c;
+
+    const int total = l.outputs * batch;
+    l.output = (float*)xcalloc(total, sizeof(float));
+    l.delta = (float*)xcalloc(total, sizeof(float));
+
+    l.forward = forward_avgpool_layer;
+    l.backward = backward_avgpool_layer;
+#ifdef GPU
+    l.forward_gpu = forward_avgpool_layer_gpu;
+    l.backward_gpu = backward_avgpool_layer_gpu;
+    l.output_gpu  = cuda_make_array(l.output, total);
+    l.delta_gpu   = cuda_make_array(l.delta, total);
+#endif
+    return l;
+}
+
+/*
+ * Update the layer's input width / height and recompute l->inputs as
+ * h * w * c. Output buffers are not touched.
+ */
+void resize_avgpool_layer(avgpool_layer *l, int w, int h)
+{
+    l->h = h;
+    l->w = w;
+    l->inputs = l->h * l->w * l->c;
+}
+
+/*
+ * CPU forward pass: for every (batch item, channel) pair, write the mean of
+ * the h*w input values of that plane into l.output.
+ */
+void forward_avgpool_layer(const avgpool_layer l, network_state state)
+{
+    const int plane = l.h*l.w;
+    int b, k, i;
+
+    for (b = 0; b < l.batch; ++b) {
+        for (k = 0; k < l.c; ++k) {
+            const int out_index = k + b*l.c;
+            /* The plane for this output starts at plane * out_index. */
+            const int first = plane*out_index;
+
+            l.output[out_index] = 0;
+            for (i = 0; i < plane; ++i) {
+                l.output[out_index] += state.input[i + first];
+            }
+            l.output[out_index] /= plane;
+        }
+    }
+}
+
+/*
+ * CPU backward pass: add l.delta / (h*w) of each (batch item, channel)
+ * output to every element of the matching input plane in state.delta.
+ */
+void backward_avgpool_layer(const avgpool_layer l, network_state state)
+{
+    const int plane = l.h*l.w;
+    int b, k, i;
+
+    for (b = 0; b < l.batch; ++b) {
+        for (k = 0; k < l.c; ++k) {
+            const int out_index = k + b*l.c;
+            /* The plane for this output starts at plane * out_index. */
+            const int first = plane*out_index;
+
+            for (i = 0; i < plane; ++i) {
+                state.delta[i + first] += l.delta[out_index] / (plane);
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/avgpool_layer.h b/darknet-master/src/avgpool_layer.h
new file mode 100644
index 0000000..2277ec6
--- /dev/null
+++ b/darknet-master/src/avgpool_layer.h
@@ -0,0 +1,29 @@
+#ifndef AVGPOOL_LAYER_H
+#define AVGPOOL_LAYER_H
+
+#include "image.h"
+#include "dark_cuda.h"
+#include "layer.h"
+#include "network.h"
+
+typedef layer avgpool_layer;   // average-pooling layers are stored in the generic layer struct
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+image get_avgpool_image(avgpool_layer l);
+avgpool_layer make_avgpool_layer(int batch, int w, int h, int c);   // builds a layer whose output is 1 x 1 x c per batch item
+void resize_avgpool_layer(avgpool_layer *l, int w, int h);   // updates w, h and inputs only
+void forward_avgpool_layer(const avgpool_layer l, network_state state);   // CPU: per-channel mean of state.input -> l.output
+void backward_avgpool_layer(const avgpool_layer l, network_state state);   // CPU: adds l.delta / (h*w) to each position of state.delta
+
+#ifdef GPU
+void forward_avgpool_layer_gpu(avgpool_layer l, network_state state);   // CUDA counterpart of forward_avgpool_layer
+void backward_avgpool_layer_gpu(avgpool_layer l, network_state state);   // CUDA counterpart of backward_avgpool_layer
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/avgpool_layer_kernels.cu b/darknet-master/src/avgpool_layer_kernels.cu
new file mode 100644
index 0000000..b8cdd60
--- /dev/null
+++ b/darknet-master/src/avgpool_layer_kernels.cu
@@ -0,0 +1,58 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "avgpool_layer.h"
+#include "dark_cuda.h"
+
+// One thread per (batch item, channel) pair, n pairs in total: the thread
+// averages the w*h values of its channel plane in `input` and writes the
+// result to `output`.
+__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
+{
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(tid >= n) return;
+
+    const int k = tid % c;          // channel
+    const int b = tid / c;          // batch item
+    const int slot = k + c*b;       // output index, also the plane number
+    const int plane = h*w;
+
+    // Accumulate in a local variable and store the mean once.
+    float total = 0;
+    for(int p = 0; p < plane; ++p){
+        total += input[p + plane*slot];
+    }
+    output[slot] = total / plane;
+}
+
+// One thread per (batch item, channel) pair, n pairs in total: the thread
+// adds out_delta / (w*h) of its pooled value to every position of the
+// matching channel plane in `in_delta`.
+__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
+{
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(tid >= n) return;
+
+    const int k = tid % c;          // channel
+    const int b = tid / c;          // batch item
+    const int slot = k + c*b;
+    const int plane = h*w;
+
+    for(int p = 0; p < plane; ++p){
+        in_delta[p + plane*slot] += out_delta[slot] / plane;
+    }
+}
+
+// Launch the forward average-pooling kernel with one thread per
+// (batch item, channel) pair, reading state.input and writing layer.output_gpu.
+extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+{
+    const size_t threads = layer.c*layer.batch;
+
+    forward_avgpool_layer_kernel<<<cuda_gridsize(threads), BLOCK, 0, get_cuda_stream() >>>(
+        threads, layer.w, layer.h, layer.c, state.input, layer.output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Launch the backward average-pooling kernel with one thread per
+// (batch item, channel) pair, spreading layer.delta_gpu into state.delta.
+extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+{
+    const size_t threads = layer.c*layer.batch;
+
+    backward_avgpool_layer_kernel<<<cuda_gridsize(threads), BLOCK, 0, get_cuda_stream() >>>(
+        threads, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/batchnorm_layer.c b/darknet-master/src/batchnorm_layer.c
new file mode 100644
index 0000000..6729b03
--- /dev/null
+++ b/darknet-master/src/batchnorm_layer.c
@@ -0,0 +1,431 @@
+#include "batchnorm_layer.h"
+#include "blas.h"
+#include "utils.h"
+#include <stdio.h>
+
+/*
+ * Build a stand-alone batch-normalization layer for a w x h x c input that
+ * is processed `batch` images at a time.
+ *
+ * All host buffers are allocated with xcalloc; scales are then set to 1.
+ * In GPU builds the device mirrors are created too. Buffers used only for
+ * back-propagation (delta, bias/scale updates, mean/variance deltas, x,
+ * x_norm) are created on the device only when `train` is non-zero.
+ *
+ * Returns the initialised layer by value.
+ */
+layer make_batchnorm_layer(int batch, int w, int h, int c, int train)
+{
+    fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
+    /* Named `l` (as in make_avgpool_layer) so the variable does not hide the `layer` type. */
+    layer l = { (LAYER_TYPE)0 };
+    l.type = BATCHNORM;
+    l.batch = batch;
+    l.train = train;
+    l.h = l.out_h = h;
+    l.w = l.out_w = w;
+    l.c = l.out_c = c;
+
+    l.n = l.c;
+    l.output = (float*)xcalloc(h * w * c * batch, sizeof(float));
+    l.delta = (float*)xcalloc(h * w * c * batch, sizeof(float));
+    l.inputs = w*h*c;
+    l.outputs = l.inputs;
+
+    l.biases = (float*)xcalloc(c, sizeof(float));
+    l.bias_updates = (float*)xcalloc(c, sizeof(float));
+
+    l.scales = (float*)xcalloc(c, sizeof(float));
+    l.scale_updates = (float*)xcalloc(c, sizeof(float));
+    int i;
+    for(i = 0; i < c; ++i){
+        l.scales[i] = 1;
+    }
+
+    l.mean = (float*)xcalloc(c, sizeof(float));
+    l.variance = (float*)xcalloc(c, sizeof(float));
+
+    l.rolling_mean = (float*)xcalloc(c, sizeof(float));
+    l.rolling_variance = (float*)xcalloc(c, sizeof(float));
+
+    l.mean_delta = (float*)xcalloc(c, sizeof(float));
+    l.variance_delta = (float*)xcalloc(c, sizeof(float));
+
+    /* Saved pre- and post-normalization activations for the backward pass. */
+    l.x = (float*)xcalloc(l.batch*l.outputs, sizeof(float));
+    l.x_norm = (float*)xcalloc(l.batch*l.outputs, sizeof(float));
+
+    l.forward = forward_batchnorm_layer;
+    l.backward = backward_batchnorm_layer;
+    l.update = update_batchnorm_layer;
+#ifdef GPU
+    l.forward_gpu = forward_batchnorm_layer_gpu;
+    l.backward_gpu = backward_batchnorm_layer_gpu;
+    l.update_gpu = update_batchnorm_layer_gpu;
+
+    /*
+     * Every device buffer is created from its own host counterpart.
+     * The host buffers paired here have the same element counts as the
+     * ones previously borrowed (mean/variance/output), so sizes are unchanged.
+     */
+    l.output_gpu =  cuda_make_array(l.output, h * w * c * batch);
+
+    l.biases_gpu = cuda_make_array(l.biases, c);
+    l.scales_gpu = cuda_make_array(l.scales, c);
+
+    if (train) {
+        l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch);
+
+        l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
+        l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
+
+        l.mean_delta_gpu = cuda_make_array(l.mean_delta, c);
+        l.variance_delta_gpu = cuda_make_array(l.variance_delta, c);
+    }
+
+    l.mean_gpu = cuda_make_array(l.mean, c);
+    l.variance_gpu = cuda_make_array(l.variance, c);
+
+    l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, c);
+    l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, c);
+
+    if (train) {
+        l.x_gpu = cuda_make_array(l.x, l.batch*l.outputs);
+#ifndef CUDNN
+        l.x_norm_gpu = cuda_make_array(l.x_norm, l.batch*l.outputs);
+#endif  // not CUDNN
+    }
+
+#ifdef CUDNN
+    /* normDstTensorDesc describes the full N x C x H x W activation tensor,
+       normTensorDesc the 1 x C x 1 x 1 per-channel parameter tensors. */
+    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l.normTensorDesc));
+    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l.normDstTensorDesc));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1));
+#endif
+#endif
+    return l;
+}
+
+/*
+ * Accumulate the gradient of the per-filter scale parameters:
+ * scale_updates[f] += sum over all batch items and spatial positions of
+ * delta * x_norm for filter f. Both tensors are indexed as
+ * i + size*(f + n*b), i.e. [batch][n filters][size positions].
+ */
+void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
+{
+    int f;
+    for(f = 0; f < n; ++f){
+        float acc = 0;
+        int b;
+        for(b = 0; b < batch; ++b){
+            const int base = size*(f + n*b);
+            int i;
+            for(i = 0; i < size; ++i){
+                acc += delta[base + i] * x_norm[base + i];
+            }
+        }
+        scale_updates[f] += acc;
+    }
+}
+
+/*
+ * Gradient of the loss with respect to the per-filter batch mean:
+ * mean_delta[f] = -(sum of delta over batch and spatial) / sqrt(variance[f] + 1e-5).
+ * delta is indexed as [batch][filters][spatial].
+ */
+void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
+{
+    int f;
+    for(f = 0; f < filters; ++f){
+        int b, s;
+        mean_delta[f] = 0;
+        for(b = 0; b < batch; ++b){
+            const int base = b*filters*spatial + f*spatial;
+            for(s = 0; s < spatial; ++s){
+                mean_delta[f] += delta[base + s];
+            }
+        }
+        mean_delta[f] *= (-1./sqrt(variance[f] + .00001f));
+    }
+}
+/*
+ * Gradient of the loss with respect to the per-filter batch variance:
+ * variance_delta[f] = sum(delta * (x - mean[f])) * -0.5 * (variance[f] + 1e-5)^(-3/2),
+ * summed over batch items and spatial positions ([batch][filters][spatial] layout).
+ */
+void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
+{
+    int f;
+    for(f = 0; f < filters; ++f){
+        int b, s;
+        variance_delta[f] = 0;
+        for(b = 0; b < batch; ++b){
+            const int base = b*filters*spatial + f*spatial;
+            for(s = 0; s < spatial; ++s){
+                const int at = base + s;
+                variance_delta[f] += delta[at]*(x[at] - mean[f]);
+            }
+        }
+        variance_delta[f] *= -.5 * pow(variance[f] + .00001f, (float)(-3./2.));
+    }
+}
+/*
+ * Final step of the batch-norm backward pass: turn the gradient with
+ * respect to the normalized activations (delta, updated in place) into the
+ * gradient with respect to the layer input x:
+ *
+ *   delta = delta / sqrt(variance + eps)
+ *         + variance_delta * 2 * (x - mean) / (spatial * batch)
+ *         + mean_delta / (spatial * batch)
+ *
+ * eps is placed inside the square root so the first term matches the
+ * forward normalization in normalize_cpu and the factor used by
+ * mean_delta_cpu; with eps outside the root the gradient is wrong for
+ * small variances.
+ */
+void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
+{
+    int f, j, k;
+    for(j = 0; j < batch; ++j){
+        for(f = 0; f < filters; ++f){
+            for(k = 0; k < spatial; ++k){
+                int index = j*filters*spatial + f*spatial + k;
+                delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+            }
+        }
+    }
+}
+
+/*
+ * Resize a batch-normalization layer to a new spatial size (w x h).
+ * Channel count and batch size are unchanged. Every buffer whose length is
+ * outputs*batch is resized: output, delta and the saved activations x and
+ * x_norm on the host, plus their device mirrors in GPU builds.
+ */
+void resize_batchnorm_layer(layer *l, int w, int h)
+{
+    l->out_h = l->h = h;
+    l->out_w = l->w = w;
+    l->outputs = l->inputs = h*w*l->c;
+
+    const int output_size = l->outputs * l->batch;
+
+    l->output = (float*)realloc(l->output, output_size * sizeof(float));
+    l->delta = (float*)realloc(l->delta, output_size * sizeof(float));
+    /* forward_batchnorm_layer copies outputs*batch floats into x and x_norm
+       while training, so they must grow together with output. */
+    l->x = (float*)realloc(l->x, output_size * sizeof(float));
+    l->x_norm = (float*)realloc(l->x_norm, output_size * sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    l->output_gpu = cuda_make_array(l->output, output_size);
+
+    if (l->train) {
+        cuda_free(l->delta_gpu);
+        l->delta_gpu = cuda_make_array(l->delta, output_size);
+
+        cuda_free(l->x_gpu);
+        l->x_gpu = cuda_make_array(l->output, output_size);
+#ifndef CUDNN
+        cuda_free(l->x_norm_gpu);
+        l->x_norm_gpu = cuda_make_array(l->output, output_size);
+#endif  // not CUDNN
+    }
+
+
+#ifdef CUDNN
+    /* Only the full-activation descriptor depends on the spatial size. */
+    CHECK_CUDNN(cudnnDestroyTensorDescriptor(l->normDstTensorDesc));
+    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDesc));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
+#endif // CUDNN
+#endif // GPU
+}
+
+/*
+ * CPU forward pass of batch normalization.
+ *
+ * For a stand-alone BATCHNORM layer the input is first copied into
+ * l.output; for other layer types l.output is normalized in place. When
+ * called for a CONNECTED layer the (local copy of the) layer is treated
+ * as outputs x 1 x 1. In training mode batch statistics are computed, the
+ * rolling statistics are updated (0.9 old + 0.1 new) and x / x_norm are
+ * saved for the backward pass; otherwise the rolling statistics are used.
+ * Scale and bias are applied last.
+ */
+void forward_batchnorm_layer(layer l, network_state state)
+{
+    const int total = l.outputs*l.batch;
+
+    if(l.type == BATCHNORM) copy_cpu(total, state.input, 1, l.output, 1);
+    if(l.type == CONNECTED){
+        l.out_c = l.outputs;
+        l.out_h = l.out_w = 1;
+    }
+
+    /* Geometry is read only after the CONNECTED override above. */
+    const int channels = l.out_c;
+    const int spatial = l.out_h*l.out_w;
+
+    if(!state.train){
+        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, channels, spatial);
+    } else {
+        mean_cpu(l.output, l.batch, channels, spatial, l.mean);
+        variance_cpu(l.output, l.mean, l.batch, channels, spatial, l.variance);
+
+        scal_cpu(channels, .9, l.rolling_mean, 1);
+        axpy_cpu(channels, .1, l.mean, 1, l.rolling_mean, 1);
+        scal_cpu(channels, .9, l.rolling_variance, 1);
+        axpy_cpu(channels, .1, l.variance, 1, l.rolling_variance, 1);
+
+        copy_cpu(total, l.output, 1, l.x, 1);
+        normalize_cpu(l.output, l.mean, l.variance, l.batch, channels, spatial);
+        copy_cpu(total, l.output, 1, l.x_norm, 1);
+    }
+    scale_bias(l.output, l.scales, l.batch, channels, spatial);
+    add_bias(l.output, l.biases, l.batch, channels, spatial);
+}
+
+/*
+ * CPU backward pass of batch normalization. Accumulates the scale
+ * gradient, scales l.delta by the learned scales, then back-propagates
+ * through the normalization using the saved x, mean and variance. For a
+ * stand-alone BATCHNORM layer the result is copied to state.delta.
+ *
+ * NOTE(review): unlike backward_batchnorm_layer_gpu, no bias gradient is
+ * accumulated here - confirm the callers take care of l.bias_updates.
+ */
+void backward_batchnorm_layer(const layer l, network_state state)
+{
+    const int channels = l.out_c;
+    const int spatial = l.out_w*l.out_h;
+
+    backward_scale_cpu(l.x_norm, l.delta, l.batch, channels, spatial, l.scale_updates);
+
+    scale_bias(l.delta, l.scales, l.batch, channels, spatial);
+
+    mean_delta_cpu(l.delta, l.variance, l.batch, channels, spatial, l.mean_delta);
+    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, channels, spatial, l.variance_delta);
+    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, channels, spatial, l.delta);
+
+    if(l.type == BATCHNORM){
+        copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
+    }
+}
+
+/*
+ * Apply the accumulated bias and scale updates on the CPU:
+ * param += (learning_rate / batch) * updates, then the updates are
+ * multiplied by `momentum`. `decay` is not used by this layer.
+ */
+void update_batchnorm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    const float step = learning_rate / batch;
+
+    axpy_cpu(l.c, step, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(l.c, momentum, l.bias_updates, 1);
+
+    axpy_cpu(l.c, step, l.scale_updates, 1, l.scales, 1);
+    scal_cpu(l.c, momentum, l.scale_updates, 1);
+}
+
+
+
+
+#ifdef GPU
+
+/* Pull biases, scales and rolling statistics (out_c floats each) from their *_gpu buffers into the host arrays. */
+void pull_batchnorm_layer(layer l)
+{
+    const int channels = l.out_c;
+
+    cuda_pull_array(l.biases_gpu, l.biases, channels);
+    cuda_pull_array(l.scales_gpu, l.scales, channels);
+    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, channels);
+    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, channels);
+}
+/* Push biases, scales and rolling statistics (out_c floats each) from the host arrays into their *_gpu buffers. */
+void push_batchnorm_layer(layer l)
+{
+    const int channels = l.out_c;
+
+    cuda_push_array(l.biases_gpu, l.biases, channels);
+    cuda_push_array(l.scales_gpu, l.scales, channels);
+    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, channels);
+    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, channels);
+}
+
+void forward_batchnorm_layer_gpu(layer l, network_state state)
+{   /* GPU forward pass of batch normalization on l.output_gpu.
+     *
+     * A stand-alone BATCHNORM layer first copies state.input into
+     * l.output_gpu. Then one of three paths runs:
+     *   - state.net.adversarial, or inference (!state.train): normalize
+     *     with the rolling statistics, then apply scale and bias;
+     *   - training with l.batch_normalize == 2: the "cbn" kernels;
+     *   - training otherwise: cuDNN when built with CUDNN, else the
+     *     hand-written mean/variance/normalize kernels.
+     * While training, the un-normalized input is saved in l.x_gpu.
+     */
+    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, state.input, l.output_gpu);
+        //copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
+
+    if (state.net.adversarial) {
+        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+        return;
+    }
+
+    if (state.train) {
+        simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu);   // keep the raw input for the backward pass
+
+        // cbn
+        if (l.batch_normalize == 2) {
+
+            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
+
+            //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu);
+            const int minibatch_index = state.net.current_subdivision + 1;   // 1-based index of the current subdivision
+            const int max_minibatch_index = state.net.subdivisions;
+            //printf("\n minibatch_index = %d, max_minibatch_index = %d \n", minibatch_index, max_minibatch_index);
+            const float alpha = 0.01;
+
+            int inverse_variance = 0;   // flag handed to the cbn kernels; set when built with CUDNN
+#ifdef CUDNN
+            inverse_variance = 1;
+#endif  // CUDNN
+
+            fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, max_minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu,
+                alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001);
+
+            normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, inverse_variance, .00001f);
+
+#ifndef CUDNN
+            simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);   // NOTE(review): taken after scale and bias were applied, unlike the non-cbn path below - confirm intended
+#endif  // CUDNN
+
+            //printf("\n CBN, minibatch_index = %d \n", minibatch_index);
+        }
+        else {
+#ifdef CUDNN
+            float one = 1;
+            float zero = 0;
+            cudnnBatchNormalizationForwardTraining(cudnn_handle(),   // NOTE(review): returned status is not checked (no CHECK_CUDNN)
+                CUDNN_BATCHNORM_SPATIAL,
+                &one,
+                &zero,
+                l.normDstTensorDesc,
+                l.x_gpu,                // input
+                l.normDstTensorDesc,
+                l.output_gpu,            // output
+                l.normTensorDesc,
+                l.scales_gpu,
+                l.biases_gpu,
+                .01,   // presumably the exponential average factor for the running statistics - see cuDNN API
+                l.rolling_mean_gpu,        // output (should be FP32)
+                l.rolling_variance_gpu,    // output (should be FP32)
+                .00001,   // presumably epsilon - see cuDNN API
+                l.mean_gpu,            // output (should be FP32)
+                l.variance_gpu);    // output (should be FP32)
+
+            if (state.net.try_fix_nan) {
+                fix_nan_and_inf(l.scales_gpu, l.n);
+                fix_nan_and_inf(l.biases_gpu, l.n);
+                fix_nan_and_inf(l.mean_gpu, l.n);
+                fix_nan_and_inf(l.variance_gpu, l.n);
+                fix_nan_and_inf(l.rolling_mean_gpu, l.n);
+                fix_nan_and_inf(l.rolling_variance_gpu, l.n);
+            }
+
+            //simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
+#else   // CUDNN
+            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
+            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
+
+            scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);   // rolling = 0.99 * rolling + 0.01 * batch statistic
+            axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
+            scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
+            axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
+
+            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
+            normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);   // normalized values, saved before scale and bias
+
+            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+#endif  // CUDNN
+        }
+    }
+    else {
+        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+    }
+
+}
+
+void backward_batchnorm_layer_gpu(layer l, network_state state)
+{   /* GPU backward pass of batch normalization; l.delta_gpu is updated in place.
+     *
+     * Adversarial mode only rescales l.delta_gpu by the inverse rolling
+     * variance and by the scales, then returns. Otherwise scale and bias
+     * gradients are accumulated and the gradient is propagated through the
+     * normalization (cuDNN or hand-written kernels, chosen at build time).
+     * A stand-alone BATCHNORM layer finally copies l.delta_gpu to state.delta.
+     */
+    if (state.net.adversarial) {
+        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);
+
+        scale_bias_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        return;
+    }
+
+    if (!state.train) {   // no batch statistics were computed: use the rolling ones
+        //l.mean_gpu = l.rolling_mean_gpu;
+        //l.variance_gpu = l.rolling_variance_gpu;
+        simple_copy_ongpu(l.out_c, l.rolling_mean_gpu, l.mean_gpu);
+#ifdef CUDNN
+        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);   // the cuDNN path stores an inverse variance in l.variance_gpu
+#else
+        simple_copy_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu);
+#endif
+    }
+
+#ifdef CUDNN
+    float one = 1;
+    float zero = 0;
+    cudnnBatchNormalizationBackward(cudnn_handle(),   // NOTE(review): returned status is not checked (no CHECK_CUDNN)
+        CUDNN_BATCHNORM_SPATIAL,
+        &one,    // presumably alpha/beta for the data gradient (overwrite) - see cuDNN API
+        &zero,
+        &one,    // presumably alpha/beta for the parameter gradients (accumulate) - see cuDNN API
+        &one,
+        l.normDstTensorDesc,
+        l.x_gpu,                // input
+        l.normDstTensorDesc,
+        l.delta_gpu,            // input
+        l.normDstTensorDesc,
+        l.output_gpu, //l.x_norm_gpu,            // output
+        l.normTensorDesc,
+        l.scales_gpu,            // input (should be FP32)
+        l.scale_updates_gpu,    // output (should be FP32)
+        l.bias_updates_gpu,        // output (should be FP32)
+        .00001,
+        l.mean_gpu,                // input (should be FP32)
+        l.variance_gpu);        // input (should be FP32)
+    simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.delta_gpu);   // l.output_gpu served as scratch for the input gradient
+    //simple_copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, l.delta_gpu);
+#else   // CUDNN
+    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
+
+    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+
+    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
+    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
+    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
+#endif  // CUDNN
+    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, l.delta_gpu, state.delta);
+        //copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+
+    if (state.net.try_fix_nan) {
+        fix_nan_and_inf(l.scale_updates_gpu, l.n);
+        fix_nan_and_inf(l.bias_updates_gpu, l.n);
+    }
+}
+
+/*
+ * Apply the accumulated bias and scale updates on the GPU. The effective
+ * learning rate is learning_rate_init * l.learning_rate_scale / loss_scale;
+ * each parameter vector gets (rate / batch) * updates added, after which
+ * the updates are multiplied by `momentum`. `decay` is not used.
+ */
+void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
+{
+    const float learning_rate = learning_rate_init * l.learning_rate_scale / loss_scale;
+    const float step = learning_rate / batch;
+
+    axpy_ongpu(l.c, step, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+    scal_ongpu(l.c, momentum, l.bias_updates_gpu, 1);
+
+    axpy_ongpu(l.c, step, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+    scal_ongpu(l.c, momentum, l.scale_updates_gpu, 1);
+}
+
+#endif  // GPU
diff --git a/darknet-master/src/batchnorm_layer.h b/darknet-master/src/batchnorm_layer.h
new file mode 100644
index 0000000..afdc54b
--- /dev/null
+++ b/darknet-master/src/batchnorm_layer.h
@@ -0,0 +1,29 @@
+#ifndef BATCHNORM_LAYER_H
+#define BATCHNORM_LAYER_H
+
+#include "image.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_batchnorm_layer(int batch, int w, int h, int c, int train);   // allocates a stand-alone batch-norm layer; train != 0 also creates the GPU training buffers
+void forward_batchnorm_layer(layer l, network_state state);   // CPU forward pass (batch statistics when state.train, rolling statistics otherwise)
+void backward_batchnorm_layer(layer l, network_state state);   // CPU backward pass; updates l.delta and l.scale_updates
+void update_batchnorm_layer(layer l, int batch, float learning_rate, float momentum, float decay);   // CPU update of biases and scales; decay is unused
+
+void resize_batchnorm_layer(layer *l, int w, int h);   // changes the spatial size and resizes the activation buffers
+
+#ifdef GPU
+void forward_batchnorm_layer_gpu(layer l, network_state state);   // GPU forward pass
+void backward_batchnorm_layer_gpu(layer l, network_state state);   // GPU backward pass
+void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale);   // GPU update of biases and scales
+void pull_batchnorm_layer(layer l);   // device -> host copy of biases, scales and rolling statistics
+void push_batchnorm_layer(layer l);   // host -> device copy of biases, scales and rolling statistics
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/blas.c b/darknet-master/src/blas.c
new file mode 100644
index 0000000..122bca0
--- /dev/null
+++ b/darknet-master/src/blas.c
@@ -0,0 +1,889 @@
+#include "blas.h"
+#include "utils.h"
+
+#include <math.h>
+#include <assert.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*
+ * Rearrange data between a "packed" tensor of shape
+ * [batch][out_c][out_h][out_w] and a "spread" tensor of shape
+ * [batch][out_c/(stride*stride)][out_h*stride][out_w*stride].
+ *
+ * For every packed element the matching spread position is computed;
+ * if `forward` is non-zero out[spread] = x[packed], otherwise
+ * out[packed] = x[spread].
+ */
+void reorg_cpu(float *x, int out_w, int out_h, int out_c, int batch, int stride, int forward, float *out)
+{
+    const int in_c = out_c/(stride*stride);
+    const int wide_w = out_w*stride;
+    const int wide_h = out_h*stride;
+    int b, k, row, col;
+
+    for(b = 0; b < batch; ++b){
+        for(k = 0; k < out_c; ++k){
+            for(row = 0; row < out_h; ++row){
+                for(col = 0; col < out_w; ++col){
+                    const int packed = col + out_w*(row + out_h*(k + out_c*b));
+
+                    /* Channel k selects a spread channel (c2) and a cell inside the stride x stride block (offset). */
+                    const int c2 = k % in_c;
+                    const int offset = k / in_c;
+                    const int w2 = col*stride + offset % stride;
+                    const int h2 = row*stride + offset / stride;
+                    const int spread = w2 + wide_w*(h2 + wide_h*(c2 + in_c*b));
+
+                    if(forward){
+                        out[spread] = x[packed];
+                    } else {
+                        out[packed] = x[spread];
+                    }
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Transpose, in place and per batch item, between the [layers][size]
+ * layout and the [size][layers] layout of x.
+ * forward != 0 : [layers][size] -> [size][layers]
+ * forward == 0 : [size][layers] -> [layers][size]
+ * A temporary buffer of size*layers*batch floats is used.
+ */
+void flatten(float *x, int size, int layers, int batch, int forward)
+{
+    const int total = size * layers * batch;
+    float* scratch = (float*)xcalloc(total, sizeof(float));
+    int b, c, i;
+
+    for(b = 0; b < batch; ++b){
+        const int base = b*layers*size;
+        for(c = 0; c < layers; ++c){
+            for(i = 0; i < size; ++i){
+                const int planar = base + c*size + i;       /* [layers][size] position */
+                const int interleaved = base + i*layers + c; /* [size][layers] position */
+                if (forward) {
+                    scratch[interleaved] = x[planar];
+                } else {
+                    scratch[planar] = x[interleaved];
+                }
+            }
+        }
+    }
+    memcpy(x, scratch, total*sizeof(float));
+    free(scratch);
+}
+
+/*
+ * Element-wise blend c[i] = s[i]*a[i] + (1 - s[i])*b[i] for n elements.
+ * b may be NULL, in which case it is treated as all zeros.
+ */
+void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        const float other = b ? b[i] : 0;
+        c[i] = s[i]*a[i] + (1-s[i])*other;
+    }
+}
+
+/*
+ * Backward pass of weighted_sum_cpu (c = s*a + (1-s)*b) for n elements.
+ * Given the output gradient dc, accumulates:
+ *   da[i] += dc[i] * s[i]            (skipped when da is NULL)
+ *   db[i] += dc[i] * (1 - s[i])      (skipped when db is NULL)
+ *   ds[i] += dc[i] * (a[i] - b[i])
+ * b may be NULL and is then treated as all zeros, matching
+ * weighted_sum_cpu; a NULL b is therefore not dereferenced.
+ */
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        if(da) da[i] += dc[i] * s[i];
+        if(db) db[i] += dc[i] * (1-s[i]);
+        ds[i] += dc[i] * (a[i] - (b ? b[i] : 0));
+    }
+}
+
+/* Rectified linear unit: returns src when it is greater than zero, otherwise 0. */
+static float relu(float src) {
+    return (src > 0) ? src : 0;
+}
+
+void shortcut_multilayer_cpu(int size, int src_outputs, int batch, int n, int *outputs_of_layers, float **layers_output, float *out, float *in, float *weights, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization)
+{   /* Forward pass of the multi-input shortcut: for each of the `size`
+     * output elements, out[id] = in[id] (optionally weighted) plus the
+     * matching element of each of the n layers in layers_output
+     * (optionally weighted). A layer contributes only where
+     * src_i < outputs_of_layers[i].
+     *
+     * weights may be NULL (plain sum). Otherwise it holds (n + 1) groups
+     * of layer_step weights: group 0 for `in`, group i + 1 for layer i.
+     * With RELU_NORMALIZATION or SOFTMAX_NORMALIZATION the weights of one
+     * element are normalized by their relu sum / softmax sum.
+     * `batch` is not used.
+     */
+    // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
+    const int layer_step = nweights / (n + 1);    // 1 or l.c or (l.c * l.h * l.w)
+    int step = 0;
+    if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1
+
+    int id;
+    #pragma omp parallel for
+    for (id = 0; id < size; ++id) {
+
+        int src_id = id;
+        const int src_i = src_id % src_outputs;   // position inside one batch item
+        src_id /= src_outputs;
+        int src_b = src_id;                        // batch item
+
+        float sum = 1, max_val = -FLT_MAX;
+        int i;
+        if (weights && weights_normalization) {
+            if (weights_normalization == SOFTMAX_NORMALIZATION) {
+                for (i = 0; i < (n + 1); ++i) {   // find the largest weight; it is subtracted before expf
+                    const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                    float w = weights[weights_index];
+                    if (max_val < w) max_val = w;
+                }
+            }
+            const float eps = 0.0001;
+            sum = eps;   // normalizer starts at eps, so it is non-zero even when every relu term is 0
+            for (i = 0; i < (n + 1); ++i) {
+                const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                const float w = weights[weights_index];
+                if (weights_normalization == RELU_NORMALIZATION) sum += relu(w);
+                else if (weights_normalization == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
+            }
+        }
+
+        if (weights) {
+            float w = weights[src_i / step];   // weight group 0 belongs to `in`
+            if (weights_normalization == RELU_NORMALIZATION) w = relu(w) / sum;
+            else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+            out[id] = in[id] * w; // [0 or c or (c, h ,w)]
+        }
+        else out[id] = in[id];
+
+        // layers
+        for (i = 0; i < n; ++i) {
+            int add_outputs = outputs_of_layers[i];
+            if (src_i < add_outputs) {   // skip positions this layer does not have
+                int add_index = add_outputs*src_b + src_i;
+                int out_index = id;
+
+                float *add = layers_output[i];
+
+                if (weights) {
+                    const int weights_index = src_i / step + (i + 1)*layer_step;  // [0 or c or (c, h ,w)]
+                    float w = weights[weights_index];
+                    if (weights_normalization == RELU_NORMALIZATION) w = relu(w) / sum;
+                    else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+                    out[out_index] += add[add_index] * w; // [0 or c or (c, h ,w)]
+                }
+                else out[out_index] += add[add_index];
+            }
+        }
+    }
+}
+
+/*
+ * Backward pass of the multi-input shortcut (see shortcut_multilayer_cpu).
+ *
+ * For each of the `size` elements the incoming gradient delta_in[id] is
+ * added - multiplied by the (optionally normalized) weight when `weights`
+ * is non-NULL - to delta_out[id] and to the matching element of every
+ * layer in layers_delta for which src_i < outputs_of_layers[i].
+ * When weights are used, weight_updates receives delta_in * input for the
+ * weight that was applied to that input.
+ *
+ * Several elements map to the same weight slot, so the weight_updates
+ * accumulations are made atomic to stay correct inside the OpenMP
+ * parallel loop. delta_out[id] and layer_delta[add_index] are written by
+ * one iteration only and need no protection. `batch` is not used.
+ */
+void backward_shortcut_multilayer_cpu(int size, int src_outputs, int batch, int n, int *outputs_of_layers,
+    float **layers_delta, float *delta_out, float *delta_in, float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
+    const int layer_step = nweights / (n + 1);    // 1 or l.c or (l.c * l.h * l.w)
+    int step = 0;
+    if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1
+
+    int id;
+    #pragma omp parallel for
+    for (id = 0; id < size; ++id) {
+        int src_id = id;
+        int src_i = src_id % src_outputs;   // position inside one batch item
+        src_id /= src_outputs;
+        int src_b = src_id;                 // batch item
+
+        // grad is a fixed factor of 1 applied to every weight update.
+        float grad = 1, sum = 1, max_val = -FLT_MAX;
+        int i;
+        if (weights && weights_normalization) {
+            if (weights_normalization == SOFTMAX_NORMALIZATION) {
+                for (i = 0; i < (n + 1); ++i) {
+                    const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                    float w = weights[weights_index];
+                    if (max_val < w) max_val = w;
+                }
+            }
+            const float eps = 0.0001;
+            sum = eps;
+            for (i = 0; i < (n + 1); ++i) {
+                const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                const float w = weights[weights_index];
+                if (weights_normalization == RELU_NORMALIZATION) sum += relu(w);
+                else if (weights_normalization == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
+            }
+        }
+
+        if (weights) {
+            float w = weights[src_i / step];
+            if (weights_normalization == RELU_NORMALIZATION) w = relu(w) / sum;
+            else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+            delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)]
+            // Shared across iterations (other batch items / positions): update atomically.
+            #pragma omp atomic
+            weight_updates[src_i / step] += delta_in[id] * in[id] * grad;
+        }
+        else delta_out[id] += delta_in[id];
+
+        // layers
+        for (i = 0; i < n; ++i) {
+            int add_outputs = outputs_of_layers[i];
+            if (src_i < add_outputs) {
+                int add_index = add_outputs*src_b + src_i;
+
+                float *layer_delta = layers_delta[i];
+                if (weights) {
+                    float *add = layers_output[i];
+
+                    const int weights_index = src_i / step + (i + 1)*layer_step;  // [0 or c or (c, h ,w)]
+                    float w = weights[weights_index];
+                    if (weights_normalization == RELU_NORMALIZATION) w = relu(w) / sum;
+                    else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+                    layer_delta[add_index] += delta_in[id] * w; // [0 or c or (c, h ,w)]
+                    // Shared across iterations: update atomically.
+                    #pragma omp atomic
+                    weight_updates[weights_index] += delta_in[id] * add[add_index] * grad;
+                }
+                else layer_delta[add_index] += delta_in[id];
+            }
+        }
+    }
+}
+
+/*
+ * Add tensor `add` (w1 x h1 x c1 per batch item) into `out`
+ * (w2 x h2 x c2 per batch item). When the spatial sizes differ by an
+ * integer factor, `add` is read with a stride (w1 > w2) or `out` is
+ * written with a stride (w2 > w1). Only the first min(c1, c2) channels
+ * take part. The asserts require the width and height ratios to agree.
+ */
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    int stride = w1/w2;
+    int sample = w2/w1;
+    assert(stride == h1/h2);
+    assert(sample == h2/h1);
+    if(stride < 1) stride = 1;
+    if(sample < 1) sample = 1;
+
+    const int cols = (w1 < w2) ? w1 : w2;
+    const int rows = (h1 < h2) ? h1 : h2;
+    const int chans = (c1 < c2) ? c1 : c2;
+
+    int b, k, r, q;
+    for(b = 0; b < batch; ++b){
+        for(k = 0; k < chans; ++k){
+            const int dst_plane = h2*(k + c2*b);
+            const int src_plane = h1*(k + c1*b);
+            for(r = 0; r < rows; ++r){
+                const int dst_row = w2*(r*sample + dst_plane);
+                const int src_row = w1*(r*stride + src_plane);
+                for(q = 0; q < cols; ++q){
+                    out[q*sample + dst_row] += add[q*stride + src_row];
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Per-filter mean of x over all batch items and spatial positions.
+ * x is indexed as [batch][filters][spatial]; mean receives `filters` values.
+ */
+void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
+{
+    const float scale = 1./(batch * spatial);
+    int f;
+    for(f = 0; f < filters; ++f){
+        int b, s;
+        mean[f] = 0;
+        for(b = 0; b < batch; ++b){
+            const int base = b*filters*spatial + f*spatial;
+            for(s = 0; s < spatial; ++s){
+                mean[f] += x[base + s];
+            }
+        }
+        mean[f] *= scale;
+    }
+}
+
+/*
+ * Per-filter sample variance of x around the supplied means, taken over
+ * all batch items and spatial positions ([batch][filters][spatial] layout).
+ * The sum of squared deviations is divided by (batch*spatial - 1).
+ *
+ * With a single sample per filter that divisor is zero, which would turn
+ * the result into NaN (0 * inf); in that case the variance is reported
+ * as 0 instead.
+ */
+void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
+{
+    const int count = batch * spatial;
+    float scale = (count > 1) ? 1./(count - 1) : 0;
+    int i,j,k;
+    for(i = 0; i < filters; ++i){
+        variance[i] = 0;
+        for(j = 0; j < batch; ++j){
+            for(k = 0; k < spatial; ++k){
+                int index = j*filters*spatial + i*spatial + k;
+                variance[i] += pow((x[index] - mean[i]), 2);
+            }
+        }
+        variance[i] *= scale;
+    }
+}
+
+// Normalizes x in place: every value of filter f becomes
+// (x - mean[f]) / sqrt(variance[f] + 0.00001).
+// x is addressed as b*filters*spatial + f*spatial + s.
+void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
+{
+    int n, ch, s;
+    for (n = 0; n < batch; ++n) {
+        for (ch = 0; ch < filters; ++ch) {
+            const int base = n*filters*spatial + ch*spatial;
+            for (s = 0; s < spatial; ++s) {
+                const int idx = base + s;
+                const float centered = x[idx] - mean[ch];
+                // the small constant keeps the divisor non-zero for zero variance
+                x[idx] = centered/(sqrt(variance[ch] + .00001f));
+            }
+        }
+    }
+}
+
+// Sets N elements of X, spaced INCX apart, to ALPHA.
+void const_cpu(int N, float ALPHA, float *X, int INCX)
+{
+    int done = 0;
+    while (done < N) {
+        X[done * INCX] = ALPHA;
+        ++done;
+    }
+}
+
+// Element-wise in-place product: Y[k*INCY] *= X[k*INCX] for k in [0, N).
+void mul_cpu(int N, float *X, int INCX, float *Y, int INCY)
+{
+    int k = 0;
+    while (k < N) {
+        const float factor = X[k * INCX];
+        Y[k * INCY] *= factor;
+        ++k;
+    }
+}
+
+// Element-wise power: Y[k*INCY] = X[k*INCX] raised to ALPHA, for k in [0, N).
+void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
+{
+    int k = 0;
+    while (k < N) {
+        Y[k * INCY] = pow(X[k * INCX], ALPHA);
+        ++k;
+    }
+}
+
+// Scaled accumulate: Y[k*INCY] += ALPHA * X[k*INCX] for k in [0, N).
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
+{
+    int k = 0;
+    while (k < N) {
+        Y[k * INCY] += ALPHA * X[k * INCX];
+        ++k;
+    }
+}
+
+// Scales N elements of X, spaced INCX apart, by ALPHA in place.
+void scal_cpu(int N, float ALPHA, float *X, int INCX)
+{
+    int k = 0;
+    while (k < N) {
+        X[k * INCX] *= ALPHA;
+        ++k;
+    }
+}
+
+// In-place affine transform: X[k*INCX] = X[k*INCX] * ALPHA + BETA for k in [0, N).
+void scal_add_cpu(int N, float ALPHA, float BETA, float *X, int INCX)
+{
+    int k = 0;
+    while (k < N) {
+        const int at = k * INCX;
+        X[at] = X[at] * ALPHA + BETA;
+        ++k;
+    }
+}
+
+// Sets N elements of X, spaced INCX apart, to ALPHA.
+// A contiguous zero fill (INCX == 1, ALPHA == 0) is done with a single memset.
+// N <= 0 is a no-op on both paths: without the guard a negative N would be
+// converted to a huge size_t in the memset size, while the loop path simply
+// does nothing.
+void fill_cpu(int N, float ALPHA, float *X, int INCX)
+{
+    int i;
+    if (N <= 0) return;
+    if (INCX == 1 && ALPHA == 0) {
+        memset(X, 0, (size_t)N * sizeof(float));
+    }
+    else {
+        for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
+    }
+}
+
+// Splits the interleaved buffer OUTPUT back into X and Y by accumulation.
+// For each of the B records, the first NX values of OUTPUT are added to
+// X[b*NX ...] and the next NY values are added to Y[b*NY ...]. A NULL X or Y
+// skips the accumulation for that part, but its values in OUTPUT are still
+// stepped over.
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
+{
+    int b, t;
+    int cursor = 0;
+    for (b = 0; b < B; ++b) {
+        for (t = 0; t < NX; ++t, ++cursor) {
+            if (!X) continue;
+            X[b*NX + t] += OUTPUT[cursor];
+        }
+        for (t = 0; t < NY; ++t, ++cursor) {
+            if (!Y) continue;
+            Y[b*NY + t] += OUTPUT[cursor];
+        }
+    }
+}
+
+// Interleaves X and Y into OUTPUT record by record: for each of the B records,
+// NX values from X[b*NX ...] are written, followed by NY values from Y[b*NY ...].
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
+{
+    int b, t;
+    int pos = 0;
+    for (b = 0; b < B; ++b) {
+        const int x_base = b*NX;
+        const int y_base = b*NY;
+        for (t = 0; t < NX; ++t) {
+            OUTPUT[pos] = X[x_base + t];
+            ++pos;
+        }
+        for (t = 0; t < NY; ++t) {
+            OUTPUT[pos] = Y[y_base + t];
+            ++pos;
+        }
+    }
+}
+
+// Strided copy: Y[k*INCY] = X[k*INCX] for k in [0, N).
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
+{
+    int k = 0;
+    while (k < N) {
+        Y[k * INCY] = X[k * INCX];
+        ++k;
+    }
+}
+
+// Accumulates the element-wise product of X and Y into Z: Z[k] += X[k] * Y[k].
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
+{
+    int k = 0;
+    while (k < N) {
+        Z[k] += X[k]*Y[k];
+        ++k;
+    }
+}
+
+// Smooth-L1 style loss on diff = truth - pred, per element.
+// |diff| < 1:  error = diff^2,       delta = diff
+// otherwise:   error = 2*|diff| - 1, delta = +1 if diff > 0 else -1
+void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int idx;
+    for (idx = 0; idx < n; ++idx) {
+        const float diff = truth[idx] - pred[idx];
+        const float magnitude = fabs(diff);
+        if (magnitude < 1) {
+            // quadratic region
+            error[idx] = diff * diff;
+            delta[idx] = diff;
+            continue;
+        }
+        // linear region
+        error[idx] = 2*magnitude - 1;
+        delta[idx] = (diff > 0) ? 1 : -1;
+    }
+}
+
+// L1 loss on diff = truth - pred, per element:
+// error = |diff|, delta = +1 if diff > 0 else -1 (a zero diff yields -1).
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int idx;
+    for (idx = 0; idx < n; ++idx) {
+        const float diff = truth[idx] - pred[idx];
+        error[idx] = fabs(diff);
+        if (diff > 0) {
+            delta[idx] = 1;
+        } else {
+            delta[idx] = -1;
+        }
+    }
+}
+
+// Cross-entropy against softmax outputs, per element:
+// error = -log(pred) where truth is non-zero and 0 elsewhere; delta = truth - pred.
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int idx;
+    for (idx = 0; idx < n; ++idx) {
+        const float target = truth[idx];
+        const float prob = pred[idx];
+        if (target) {
+            error[idx] = -log(prob);
+        } else {
+            error[idx] = 0;
+        }
+        delta[idx] = target - prob;
+    }
+}
+
+// Binary cross-entropy per element:
+// error = -t*log(p) - (1 - t)*log(1 - p), delta = t - p,
+// with t = truth[k] and p = pred[k].
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int idx;
+    for (idx = 0; idx < n; ++idx) {
+        const float target = truth[idx];
+        const float prob = pred[idx];
+        delta[idx] = target - prob;
+        error[idx] = -target*log(prob) - (1-target)*log(1-prob);
+    }
+}
+
+void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float diff = truth[i] - pred[i];
+        error[i] = diff * diff;
+        delta[i] = diff;
+    }
+}
+
+// Returns the strided dot product: sum over k in [0, N) of X[k*INCX] * Y[k*INCY].
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
+{
+    float acc = 0;
+    int k = 0;
+    while (k < N) {
+        acc += X[k * INCX] * Y[k * INCY];
+        ++k;
+    }
+    return acc;
+}
+
+// Softmax with temperature over n elements spaced `stride` apart.
+// The largest input is subtracted (after dividing by temp) before
+// exponentiating, so the largest exponent is exp(0); the exponentials are then
+// divided by their sum.
+void softmax(float *input, int n, float temp, float *output, int stride)
+{
+    int k;
+    float peak = -FLT_MAX;
+    for (k = 0; k < n; ++k) {
+        const float v = input[k*stride];
+        if (v > peak) peak = v;
+    }
+
+    float total = 0;
+    for (k = 0; k < n; ++k) {
+        const float e = exp(input[k*stride]/temp - peak/temp);
+        output[k*stride] = e;
+        total += e;
+    }
+
+    for (k = 0; k < n; ++k) {
+        output[k*stride] /= total;
+    }
+}
+
+
+// Applies softmax() independently to every (batch, group) slice.
+// The slice for batch b and group g starts at b*batch_offset + g*group_offset
+// in both input and output; each slice holds n elements spaced `stride` apart.
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    int b, g;
+    for (b = 0; b < batch; ++b) {
+        for (g = 0; g < groups; ++g) {
+            const int start = b*batch_offset + g*group_offset;
+            softmax(input + start, n, temp, output + start, stride);
+        }
+    }
+}
+
+// Nearest-neighbour upsampling by an integer `stride`, or its backward pass.
+// `in` is w x h x c per batch item, `out` is (w*stride) x (h*stride) x c.
+// forward != 0: out[y][x] = scale * in[y/stride][x/stride]
+// forward == 0: in[y/stride][x/stride] += scale * out[y][x]
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    int n, ch, oy, ox;
+    for (n = 0; n < batch; ++n) {
+        for (ch = 0; ch < c; ++ch) {
+            const int in_plane = n*w*h*c + ch*w*h;
+            const int out_plane = n*w*h*c*stride*stride + ch*w*h*stride*stride;
+            for (oy = 0; oy < h*stride; ++oy) {
+                for (ox = 0; ox < w*stride; ++ox) {
+                    const int src = in_plane + (oy / stride)*w + ox / stride;
+                    const int dst = out_plane + oy*w*stride + ox;
+                    if (forward) {
+                        out[dst] = scale*in[src];
+                    } else {
+                        in[src] += scale*out[dst];
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// Clamps every element of X into the interval [-ALPHA, ALPHA] in place.
+void constrain_cpu(int size, float ALPHA, float *X)
+{
+    int k;
+    for (k = 0; k < size; ++k) {
+        const float floored = fmaxf(-ALPHA, X[k]);
+        X[k] = fminf(ALPHA, floored);
+    }
+}
+
+// Replaces every NaN or infinite element of `input` with a finite,
+// index-dependent placeholder value 1 / (index + 1).
+// The +1 matters: dividing by the raw index turned element 0 into 1.0f/0 = inf,
+// i.e. exactly the kind of value this function is meant to remove.
+// The counter is a size_t so it matches the type of `size`.
+void fix_nan_and_inf_cpu(float *input, size_t size)
+{
+    size_t i;
+    for (i = 0; i < size; ++i) {
+        float val = input[i];
+        if (isnan(val) || isinf(val))
+            input[i] = 1.0f / (float)(i + 1);  // pseudo random value, always finite
+    }
+}
+
+// Copies one embedding vector of `embedding_size` values out of `src` into `dst`.
+// Value k is read from
+//   cur_b*(src_c*src_h*src_w) + cur_n*(embedding_size*src_h*src_w)
+//   + k*src_h*src_w + cur_h*src_w + cur_w,
+// i.e. consecutive embedding values are one src_h*src_w plane apart.
+void get_embedding(float *src, int src_w, int src_h, int src_c, int embedding_size, int cur_w, int cur_h, int cur_n, int cur_b, float *dst)
+{
+    // part of the index that does not depend on the embedding component
+    const int base = cur_b*(src_c*src_h*src_w) + cur_n*(embedding_size*src_h*src_w) + cur_h*(src_w) + cur_w;
+    int k = 0;
+    while (k < embedding_size) {
+        dst[k] = src[base + k*src_h*src_w];
+        ++k;
+    }
+}
+
+
+// Euclidean_norm
+// Returns the Euclidean (L2) norm of the first `feature_size` values of A.
+float math_vector_length(float *A, unsigned int feature_size)
+{
+    float squares = 0;
+    int k = 0;
+    while (k < feature_size) {
+        squares += A[k] * A[k];
+        ++k;
+    }
+    return sqrtf(squares);
+}
+
+// Returns the cosine similarity of A and B over `feature_size` values:
+// dot(A, B) / (|A| * |B|). When the product of the norms is not positive
+// (e.g. one of the vectors is all zeros) the result is 0.
+float cosine_similarity(float *A, float *B, unsigned int feature_size)
+{
+    float dot = 0.0, sq_a = 0.0, sq_b = 0.0;
+    int k;
+    for (k = 0; k < feature_size; ++k) {
+        const float a = A[k];
+        const float b = B[k];
+        dot += a * b;
+        sq_a += a * a;
+        sq_b += b * b;
+    }
+
+    const float norm_product = sqrtf(sq_a) * sqrtf(sq_b);
+    if (norm_product > 0) return dot / norm_product;
+    return 0;
+}
+
+// Linear search of contrast_p[0 .. contrast_p_size) for the entry whose
+// (i, j) fields equal the given pair.
+// Returns the index of the first match, or -1 when there is none.
+int get_sim_P_index(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size)
+{
+    size_t pos = 0;
+    while (pos < contrast_p_size) {
+        if (contrast_p[pos].i == i && contrast_p[pos].j == j) {
+            return pos;   // found
+        }
+        ++pos;
+    }
+    return -1;   // not found
+}
+
+// Reports whether contrast_p[0 .. contrast_p_size) contains an entry whose
+// (i, j) fields equal the given pair: returns 1 if so, 0 otherwise.
+int check_sim(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size)
+{
+    size_t pos = 0;
+    while (pos < contrast_p_size) {
+        if (contrast_p[pos].i == i && contrast_p[pos].j == j) {
+            return 1;   // found
+        }
+        ++pos;
+    }
+    return 0;   // not found
+}
+
+// Returns the `sim` field of the first entry of contrast_p[0 .. contrast_p_size)
+// whose (i, j) fields equal the given pair.
+// When no entry matches, a message is printed and error() is called.
+// NOTE(review): error() presumably does not return; if it did, the final read
+// would index one past the searched range.
+float find_sim(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size)
+{
+    size_t pos = 0;
+    while (pos < contrast_p_size) {
+        if (contrast_p[pos].i == i && contrast_p[pos].j == j) {
+            return contrast_p[pos].sim;
+        }
+        ++pos;
+    }
+
+    // not found: pos now equals contrast_p_size
+    printf(" Error: find_sim(): sim isn't found: i = %zu, j = %zu, z = %zu \n", i, j, pos);
+    error("Error!", DARKNET_LOC);
+    return contrast_p[pos].sim;
+}
+
+// Returns the `P` field of the first entry of contrast_p[0 .. contrast_p_size)
+// whose (i, j) fields equal the given pair.
+// When no entry matches, a message is printed and error() is called.
+// NOTE(review): error() presumably does not return; if it did, the final read
+// would index one past the searched range.
+float find_P_constrastive(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size)
+{
+    size_t pos = 0;
+    while (pos < contrast_p_size) {
+        if (contrast_p[pos].i == i && contrast_p[pos].j == j) {
+            return contrast_p[pos].P;
+        }
+        ++pos;
+    }
+
+    // not found: pos now equals contrast_p_size
+    printf(" Error: find_P_constrastive(): P isn't found: i = %zu, j = %zu, z = %zu \n", i, j, pos);
+    error("Error!", DARKNET_LOC);
+    return contrast_p[pos].P;
+}
+
+// num_of_samples = 2 * loaded_images = mini_batch_size
+// Computes the contrastive ratio for the pair stored at contrast_p[il]:
+//   exp(sim / temperature) / sum of exp_sim over all entries that share the
+//   pair's j but have a different i.
+// Returns 0.9999 when that sum is zero or when the ratio exceeds 1.
+// labels, z and feature_size are not used by this function.
+float P_constrastive_f_det(size_t il, int *labels, float **z, unsigned int feature_size, float temperature, contrastive_params *contrast_p, int contrast_p_size)
+{
+    const size_t i = contrast_p[il].i;
+    const size_t j = contrast_p[il].j;
+    const float sim = contrast_p[il].sim;
+    const float capped = 0.9999;
+
+    const float numerator = expf(sim / temperature);
+
+    float denominator = 0;
+    int k;
+    for (k = 0; k < contrast_p_size; ++k) {
+        const contrastive_params *entry = &contrast_p[k];
+        if (entry->i == i || entry->j != j) continue;
+        denominator += entry->exp_sim;
+    }
+
+    if (denominator == 0) return capped;
+    const float ratio = numerator / denominator;
+    if (ratio > 1) return capped;
+    return ratio;
+}
+
+// num_of_samples = 2 * loaded_images = mini_batch_size
+// Computes the contrastive ratio for the pair (i, l):
+//   exp(find_sim(i, l) / temperature) / sum of exp_sim over all entries of
+//   contrast_p with j == l and a different i.
+// Returns 0.9999 when that sum is zero or when the ratio exceeds 1.
+// i == l is reported through error(). labels, z and feature_size are not used.
+float P_constrastive_f(size_t i, size_t l, int *labels, float **z, unsigned int feature_size, float temperature, contrastive_params *contrast_p, int contrast_p_size)
+{
+    if (i == l) {
+        fprintf(stderr, " Error: in P_constrastive must be i != l, while i = %zu, l = %zu \n", i, l);
+        error("Error!", DARKNET_LOC);
+    }
+
+    const float capped = 0.9999;
+    const float sim = find_sim(i, l, contrast_p, contrast_p_size);
+    const float numerator = expf(sim / temperature);
+
+    float denominator = 0;
+    int k;
+    for (k = 0; k < contrast_p_size; ++k) {
+        const contrastive_params *entry = &contrast_p[k];
+        if (entry->i == i || entry->j != l) continue;
+        denominator += entry->exp_sim;
+    }
+
+    if (denominator == 0) return capped;
+    const float ratio = numerator / denominator;
+    if (ratio > 1) return capped;
+    return ratio;
+}
+
+// Accumulates into `delta` the gradient contribution of the positive pairs of
+// sample i, i.e. every other sample j that has the same non-negative label and
+// for which contrast_p holds an (i, j) entry.
+// For each such j and each feature m:
+//   delta[m*wh] -= mult * (sim*z[i][m] - z[j][m]) * (1 - P)
+// where sim and P come from the contrast_p entry and
+//   mult = 1 / ((N - 1) * temperature * |z[i]|),
+// N being the number of samples (including i) that share i's non-negative label.
+// error() is called when N, temperature or |z[i]| is zero.
+// class_ids is not used by this function.
+void grad_contrastive_loss_positive_f(size_t i, int *class_ids, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *delta, int wh, contrastive_params *contrast_p, int contrast_p_size)
+{
+    const float vec_len = math_vector_length(z[i], feature_size);
+    float N = 0;
+    size_t j;
+    for (j = 0; j < num_of_samples; ++j) {
+        if (labels[i] >= 0 && labels[i] == labels[j]) N++;
+    }
+    if (N == 0 || temperature == 0 || vec_len == 0) {
+        fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f, labels[i] = %d \n",
+            N, temperature, vec_len, labels[i]);
+        error("Error!", DARKNET_LOC);
+    }
+    const float mult = 1 / ((N - 1) * temperature * vec_len);
+
+    for (j = 0; j < num_of_samples; ++j) {
+        // only other samples with the same, valid label are positives
+        if (j == i || labels[i] != labels[j] || labels[i] < 0) continue;
+
+        const int sim_P_i = get_sim_P_index(i, j, contrast_p, contrast_p_size);
+        if (sim_P_i < 0) continue;   // pair has no stored similarity
+
+        const float sim = contrast_p[sim_P_i].sim;
+        const float P = contrast_p[sim_P_i].P;
+
+        int m;
+        for (m = 0; m < feature_size; ++m) {
+            const float d = mult*(sim * z[i][m] - z[j][m]) *(1 - P);
+            delta[m * wh] -= d;
+        }
+    }
+}
+
+// Accumulates into `delta` the gradient contribution of the negative samples of
+// sample i.
+// For every positive j (other sample with the same non-negative label as i) the
+// function scans all samples k that differ from i and j, have a label different
+// from labels[i], and have class_ids[k] == class_ids[j]. For each such k that
+// has an (i, k) entry in contrast_p, and for each feature m:
+//   delta[m*wh] -= mult * (z[k][m] - sim*z[i][m]) * P
+// where sim and P come from the contrast_p entry and
+//   mult = 1 / ((N - 1) * temperature * |z[i]|),
+// N being the number of samples (including i) that share i's non-negative label.
+// error() is called when N, temperature or |z[i]| is zero.
+// The function returns early once neg_counter reaches neg_max (see notes below).
+void grad_contrastive_loss_negative_f(size_t i, int *class_ids, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *delta, int wh, contrastive_params *contrast_p, int contrast_p_size, int neg_max)
+{
+    const float vec_len = math_vector_length(z[i], feature_size);
+    size_t j;
+    // N counts the samples (including i itself) that share i's non-negative label
+    float N = 0;
+    for (j = 0; j < num_of_samples; ++j) {
+        if (labels[i] == labels[j] && labels[i] >= 0) N++;
+    }
+    if (N == 0 || temperature == 0 || vec_len == 0) {
+        fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f, labels[i] = %d \n",
+            N, temperature, vec_len, labels[i]);
+        error("Error!", DARKNET_LOC);
+    }
+    // N == 1 makes mult infinite, but then no j passes the positive test below,
+    // so mult is never used in that case
+    const float mult = 1 / ((N - 1) * temperature * vec_len);
+
+    // number of candidate negatives visited so far, across all positives j
+    int neg_counter = 0;
+
+    for (j = 0; j < num_of_samples; ++j) {
+        //if (i != j && (i/2) == (j/2)) {
+        if (labels[i] >= 0 && labels[i] == labels[j] && i != j) {
+
+            size_t k;
+            for (k = 0; k < num_of_samples; ++k) {
+                //if (k != i && k != j && labels[k] != labels[i]) {
+                if (k != i && k != j && labels[k] != labels[i] && class_ids[j] == class_ids[k]) {
+                    // counted before the lookup, so candidates without a stored
+                    // pair still count toward neg_max
+                    neg_counter++;
+                    const int sim_P_i = get_sim_P_index(i, k, contrast_p, contrast_p_size);
+                    // NOTE(review): `continue` also skips the neg_max check at the
+                    // bottom of this block - confirm that this is intended
+                    if (sim_P_i < 0) continue;
+                    const float sim = contrast_p[sim_P_i].sim;
+                    const float P = contrast_p[sim_P_i].P;
+                    //if (!check_sim(i, k, contrast_p, contrast_p_size)) continue;
+                    //const float sim = find_sim(i, k, contrast_p, contrast_p_size); //cos_sim[i*num_of_samples + k];        // cosine_similarity(z[i], z[k], feature_size);
+                    //const float P = find_P_constrastive(i, k, contrast_p, contrast_p_size); //p_constrastive[i*num_of_samples + k];   // P_constrastive(i, k, labels, num_of_samples, z, feature_size, temperature, cos_sim);
+                                                                            //const float custom_pos_mult = 1 + sim;
+
+                    int m;
+                    //const float d = mult*(z[k][m] + sim * z[i][m]) * P;   // my1
+                    for (m = 0; m < feature_size; ++m) {
+                        //const float d = mult*(z[k][m] + sim * z[i][m]) * P;   // 1 (70%)
+                        //const float d = mult*(z[k][m] - sim * z[k][m] - sim * z[i][m]) * P;   // 1+2
+                        const float d = mult*(z[k][m] - sim * z[i][m]) * P;   // 1 (70%)
+                        //const float d = mult*(z[k][m] - sim * z[k][m]) * P; // 2
+                        //printf(" neg: z[k][m] = %f, z[i][m] = %f, d = %f, sim = %f \n", z[k][m], z[i][m], d, sim);
+                        // consecutive features are written wh floats apart in delta
+                        const int out_i = m * wh;
+                        delta[out_i] -= d;
+                    }
+
+                    // stop the whole function once enough negatives have been visited
+                    if (neg_counter >= neg_max) return;
+                }
+            }
+        }
+    }
+}
+
+
+
+// num_of_samples = 2 * loaded_images = mini_batch_size
+// Computes the contrastive ratio for the pair (i, l) from a dense
+// num_of_samples x num_of_samples table of precomputed exponentials:
+//   exp_cos_sim[i*num_of_samples + l] / sum over k != i of exp_cos_sim[k*num_of_samples + l].
+// i == l is reported through error(). labels, z, feature_size, temperature and
+// cos_sim are not used by this function.
+float P_constrastive(size_t i, size_t l, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *exp_cos_sim)
+{
+    if (i == l) {
+        fprintf(stderr, " Error: in P_constrastive must be i != l, while i = %zu, l = %zu \n", i, l);
+        error("Error!", DARKNET_LOC);
+    }
+
+    float denominator = 0;
+    int k = 0;
+    while (k < num_of_samples) {
+        if (k != i) {
+            denominator += exp_cos_sim[k*num_of_samples + l];
+        }
+        ++k;
+    }
+
+    const float numerator = exp_cos_sim[i*num_of_samples + l];
+    return numerator / denominator;
+}
+
+// i - id of the current sample in mini_batch
+// labels[num_of_samples] - array with class_id for each sample in the current mini_batch
+// z[num_of_samples][feature_size] - array of per-sample arrays with contrastive features (output of conv-layer, e.g. 128 floats for each sample)
+// delta - array with deltas for backpropagation; feature m is written at delta[m*wh]
+// temperature - scalar temperature param (temperature > 0), f.e. temperature = 0.07: Supervised Contrastive Learning
+// Accumulates into `delta` the gradient contribution of the positive pairs of
+// sample i (every other sample j with the same label), using dense
+// num_of_samples x num_of_samples tables cos_sim and p_constrastive.
+// For each such j and each feature m:
+//   delta[m*wh] -= mult * (sim*z[i][m] - z[j][m]) * (1 - P)
+// with sim = cos_sim[i*num_of_samples + j], P = p_constrastive[i*num_of_samples + j]
+// and mult = 1 / ((N - 1) * temperature * |z[i]|), N being the number of samples
+// (including i) that share i's label.
+// error() is called when N, temperature or |z[i]| is zero.
+void grad_contrastive_loss_positive(size_t i, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *p_constrastive, float *delta, int wh)
+{
+    const float vec_len = math_vector_length(z[i], feature_size);
+    float N = 0;
+    size_t j;
+    for (j = 0; j < num_of_samples; ++j) {
+        if (labels[j] == labels[i]) N++;
+    }
+    if (N == 0 || temperature == 0 || vec_len == 0) {
+        fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f \n", N, temperature, vec_len);
+        error("Error!", DARKNET_LOC);
+    }
+    const float mult = 1 / ((N - 1) * temperature * vec_len);
+
+    for (j = 0; j < num_of_samples; ++j) {
+        // only other samples with the same label are positives
+        if (j == i || labels[i] != labels[j]) continue;
+
+        const size_t pair = i*num_of_samples + j;
+        const float sim = cos_sim[pair];
+        const float P = p_constrastive[pair];
+
+        int m;
+        for (m = 0; m < feature_size; ++m) {
+            const float d = mult*(sim * z[i][m] - z[j][m]) * (1 - P);
+            delta[m * wh] -= d;
+        }
+    }
+}
+
+// i - id of the current sample in mini_batch
+// labels[num_of_samples] - array with class_id for each sample in the current mini_batch
+// z[num_of_samples][feature_size] - array of per-sample arrays with contrastive features (output of conv-layer, e.g. 128 floats for each sample)
+// delta - array with deltas for backpropagation; feature m is written at delta[m*wh]
+// temperature - scalar temperature param (temperature > 0), f.e. temperature = 0.07: Supervised Contrastive Learning
+// Accumulates into `delta` the gradient contribution of the non-anchor samples
+// of sample i, using dense num_of_samples x num_of_samples tables cos_sim and
+// p_constrastive.
+// For every positive j (other sample with the same label as i) and every
+// sample k that differs from i and j and has a non-negative label, for each
+// feature m:
+//   delta[m*wh] -= mult * (z[k][m] - sim*z[i][m]) * P
+// with sim = cos_sim[i*num_of_samples + k], P = p_constrastive[i*num_of_samples + k]
+// and mult = 1 / ((N - 1) * temperature * |z[i]|), N being the number of samples
+// (including i) that share i's label.
+// error() is called when N, temperature or |z[i]| is zero.
+void grad_contrastive_loss_negative(size_t i, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *p_constrastive, float *delta, int wh)
+{
+    const float vec_len = math_vector_length(z[i], feature_size);
+    float N = 0;
+    size_t j;
+    for (j = 0; j < num_of_samples; ++j) {
+        if (labels[j] == labels[i]) N++;
+    }
+    if (N == 0 || temperature == 0 || vec_len == 0) {
+        fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f \n", N, temperature, vec_len);
+        error("Error!", DARKNET_LOC);
+    }
+    const float mult = 1 / ((N - 1) * temperature * vec_len);
+
+    for (j = 0; j < num_of_samples; ++j) {
+        // the inner scan runs once per positive j
+        if (j == i || labels[i] != labels[j]) continue;
+
+        size_t k;
+        for (k = 0; k < num_of_samples; ++k) {
+            if (k == i || k == j || labels[k] < 0) continue;
+
+            const size_t pair = i*num_of_samples + k;
+            const float sim = cos_sim[pair];
+            const float P = p_constrastive[pair];
+
+            int m;
+            for (m = 0; m < feature_size; ++m) {
+                const float d = mult*(z[k][m] - sim * z[i][m]) * P;
+                delta[m * wh] -= d;
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/blas.h b/darknet-master/src/blas.h
new file mode 100644
index 0000000..b69a702
--- /dev/null
+++ b/darknet-master/src/blas.h
@@ -0,0 +1,184 @@
+#ifndef BLAS_H
+#define BLAS_H
+#include <stdlib.h>
+#include "darknet.h"
+
+#ifdef GPU
+#include "dark_cuda.h"
+#include "tree.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void flatten(float *x, int size, int layers, int batch, int forward);
+void pm(int M, int N, float *A);
+float *random_matrix(int rows, int cols);
+void time_random_matrix(int TA, int TB, int m, int k, int n);
+void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
+
+void test_blas();
+
+void const_cpu(int N, float ALPHA, float *X, int INCX);
+void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
+void constrain_min_max_ongpu(int N, float MIN, float MAX, float * X, int INCX);
+void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
+
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
+void scal_add_cpu(int N, float ALPHA, float BETA, float *X, int INCX);
+void fill_cpu(int N, float ALPHA, float * X, int INCX);
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void test_gpu_blas();
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+void shortcut_multilayer_cpu(int size, int src_outputs, int batch, int n, int *outputs_of_layers, float **layers_output, float *out, float *in, float *weights, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization);
+void backward_shortcut_multilayer_cpu(int size, int src_outputs, int batch, int n, int *outputs_of_layers,
+    float **layers_delta, float *delta_out, float *delta_in, float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalization);
+
+void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
+void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
+void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+
+void add_bias(float *output, float *biases, int batch, int n, int size);
+void scale_bias(float *output, float *scales, int batch, int n, int size);
+void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
+void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
+void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
+void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
+
+void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
+
+void softmax(float *input, int n, float temp, float *output, int stride);
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void constrain_cpu(int size, float ALPHA, float *X);
+void fix_nan_and_inf_cpu(float *input, size_t size);
+
+
+int check_sim(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size);
+float find_sim(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size);
+float find_P_constrastive(size_t i, size_t j, contrastive_params *contrast_p, int contrast_p_size);
+float P_constrastive_f_det(size_t il, int *labels, float **z, unsigned int feature_size, float temperature, contrastive_params *contrast_p, int contrast_p_size);
+float P_constrastive_f(size_t i, size_t l, int *labels, float **z, unsigned int feature_size, float temperature, contrastive_params *contrast_p, int contrast_p_size);
+void grad_contrastive_loss_positive_f(size_t i, int *class_ids, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *delta, int wh, contrastive_params *contrast_p, int contrast_p_size);
+void grad_contrastive_loss_negative_f(size_t i, int *class_ids, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *delta, int wh, contrastive_params *contrast_p, int contrast_p_size, int neg_max);
+
+void get_embedding(float *src, int src_w, int src_h, int src_c, int embedding_size, int cur_w, int cur_h, int cur_n, int cur_b, float *dst);
+float math_vector_length(float *A, unsigned int feature_size);
+float cosine_similarity(float *A, float *B, unsigned int feature_size);
+float P_constrastive(size_t i, size_t l, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *exp_cos_sim);
+void grad_contrastive_loss_positive(size_t i, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *p_constrastive, float *delta, int wh);
+void grad_contrastive_loss_negative(size_t i, int *labels, size_t num_of_samples, float **z, unsigned int feature_size, float temperature, float *cos_sim, float *p_constrastive, float *delta, int wh);
+
+
+#ifdef GPU
+
+void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu);
+void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
+void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void simple_copy_ongpu(int size, float *src, float *dst);
+void memcpy_ongpu(void *dst, void *src, int size_bytes);
+void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
+void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void scal_ongpu(int N, float ALPHA, float * X, int INCX);
+void scal_add_ongpu(int N, float ALPHA, float BETA, float * X, int INCX);
+void supp_ongpu(int N, float ALPHA, float * X, int INCX);
+void mask_gpu_new_api(int N, float * X, float mask_num, float * mask, float val);
+void mask_ongpu(int N, float * X, float mask_num, float * mask);
+void const_ongpu(int N, float ALPHA, float *X, int INCX);
+void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
+void fill_ongpu(int N, float ALPHA, float * X, int INCX);
+void gradient_centralization_gpu(int w, int h, int c, int f, float *in);
+
+void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
+void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
+void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+
+void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
+
+void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
+void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
+
+void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
+void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
+void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon);
+void inverse_variance_ongpu(int size, float *src, float *dst, float epsilon);
+void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, int inverse_variance, float epsilon);
+void compare_2_arrays_gpu(float *one, float *two, int size);
+void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+void shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization);
+void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_delta_gpu, float *delta_out, float *delta_in,
+    float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalization);
+void input_shortcut_gpu(float *in, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
+void mean_array_gpu(float *src, int size, float alpha, float *avg);
+void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
+void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
+
+void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
+void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
+void mult_add_into_gpu(int num, float *a, float *b, float *c);
+
+void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
+
+void softmax_gpu_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
+void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
+
+void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
+
+void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
+
+void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
+
+void fix_nan_and_inf(float *input, size_t size);
+void reset_nan_and_inf(float *input, size_t size);
+int is_nan_or_inf(float *input, size_t size);
+
+void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst);
+void sum_of_mults(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst);
+void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst);
+
+void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh, float *scales_c, float *out);
+void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, int batch_size, int scale_wh,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta);
+
+
+void backward_sam_gpu(float *in_w_h_c_delta, int size, int channel_size,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta);
+
+void sam_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out);
+
+void smooth_rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
+void stretch_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, float scale, int reverse);
+void sway_and_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
+void stretch_sway_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
+void rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int reverse);
+void reduce_and_expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups);
+void expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups);
+void mult_inverse_array_gpu(const float *src_gpu, float *dst_gpu, int size, float eps, float divider, float clip, float abs_add);
+void P_constrastive_f_det_gpu(int *labels, unsigned int feature_size, float temperature, contrastive_params *contrast_p, const int contrast_p_size);
+void coord_conv_gpu(float *dst, int size, int w, int h, int chan, int b, int type);
+
+void forward_implicit_gpu(int batch, int nweights, float *weight_gpu, float *output_gpu);
+void backward_implicit_gpu(int batch, int nweights, float *weight_updates_gpu, float *delta_gpu);
+
+#endif // GPU
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/blas_kernels.cu b/darknet-master/src/blas_kernels.cu
new file mode 100644
index 0000000..3bc0d90
--- /dev/null
+++ b/darknet-master/src/blas_kernels.cu
@@ -0,0 +1,2470 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <assert.h>
+#include <float.h>
+
+#include "blas.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "tree.h"
+
+// XOR-shuffle sum: each step adds the value of the lane whose id differs in one bit (WARP_SIZE assumed to be a power of two).
+__inline__ __device__ float warpAllReduceSum(float val) {
+    for (int lane_mask = WARP_SIZE / 2; lane_mask > 0; lane_mask >>= 1)
+#if CUDART_VERSION >= 9000
+        val += __shfl_xor_sync(0xffffffff, val, lane_mask);
+#else
+        val += __shfl_xor(val, lane_mask);
+#endif
+    return val;
+}
+
+// Debug aid: prints each index where |one - two| exceeds 10% of |one| (diff is a percentage of `one`).
+__global__ void compare_2_arrays_kernel(float *one, float *two, int size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        const float a = one[idx], b = two[idx];
+        const float diff = 100 * fabs(a - b) / fabs(a);
+        if (diff > 10) printf(" i: %d - one = %f, two = %f, diff = %f %% \n", idx, a, b, diff);
+    }
+}
+
+// Host wrapper: launches compare_2_arrays_kernel over `size` elements and then blocks until
+// the device has finished all outstanding work.
+void compare_2_arrays_gpu(float *one, float *two, int size) {
+    const int grid = get_number_of_blocks(size, BLOCK);
+    compare_2_arrays_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(one, two, size);
+    CHECK_CUDA(cudaPeekAtLastError());      // launch-time errors
+    CHECK_CUDA(cudaDeviceSynchronize());    // wait for the device
+}
+
+// Blends src into avg with weight alpha (avg = avg*(1-alpha) + src*alpha), then copies the blended value back into src.
+__global__ void mean_array_kernel(float *src, int size, float alpha, float *avg) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+    const float blended = avg[idx] * (1 - alpha) + src[idx] * alpha;
+    avg[idx] = blended;
+    src[idx] = blended;
+}
+
+
+// Host wrapper: launches mean_array_kernel over `size` elements on the stream returned by
+// get_cuda_stream(). Only launch errors are checked here; there is no synchronisation.
+void mean_array_gpu(float *src, int size, float alpha, float *avg) {
+    const int grid = get_number_of_blocks(size, BLOCK);
+    mean_array_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(src, size, alpha, avg);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Multiplies every element of `output` by the scale of its filter; the filter of flat index i is
+// (i / spatial) % filters. `batch` is not read by the kernel.
+__global__ void scale_bias_kernel(float *output, float *scale, int batch, int filters, int spatial, int current_size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < current_size) {
+        output[idx] *= scale[(idx / spatial) % filters];
+    }
+}
+
+// Host wrapper: applies scale_bias_kernel to all batch*filters*spatial elements, one thread each.
+void scale_bias_gpu(float *output, float *scale, int batch, int filters, int spatial) {
+    const int total = batch * filters * spatial;
+    const int grid = get_number_of_blocks(total, BLOCK);
+
+    scale_bias_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(output, scale, batch, filters, spatial, total);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Scale gradient of one filter: scale_updates[filter] += sum over batch and spatial positions of
+// delta * x_norm. Expects one block of BLOCK threads per filter (filter = blockIdx.x).
+__global__ void backward_scale_kernel(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates) {
+    __shared__ float part[BLOCK];  // one partial sum per thread
+    const int filter = blockIdx.x;
+    const int lane = threadIdx.x;
+    float acc = 0;
+    for (int b = 0; b < batch; ++b) {
+        const int base = size * (filter + n * b);  // start of this filter's plane in batch item b
+        for (int off = 0; off < size; off += BLOCK) {
+            const int index = lane + off + base;
+            acc += (lane + off < size) ? delta[index] * x_norm[index] : 0;  // lanes past the end add 0
+        }
+    }
+    part[lane] = acc;
+    __syncthreads();  // every partial sum must be stored before thread 0 folds them
+    if (lane != 0) return;
+    for (int k = 0; k < BLOCK; ++k) scale_updates[filter] += part[k];
+}
+
+// Host wrapper: one block of BLOCK threads per filter (n blocks) accumulates into scale_updates.
+void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates) {
+    backward_scale_kernel<<<n, BLOCK, 0, get_cuda_stream()>>>(x_norm, delta, batch, n, size, scale_updates);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Adds the bias of its filter to every element of `output`; the filter of flat index i is
+// (i / spatial) % filters. `batch` is not read by the kernel.
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int filters, int spatial, int current_size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < current_size) {
+        output[idx] += biases[(idx / spatial) % filters];
+    }
+}
+
+// Host wrapper: applies add_bias_kernel to all batch*filters*spatial elements, one thread each.
+void add_bias_gpu(float *output, float *biases, int batch, int filters, int spatial) {
+    const int total = batch * filters * spatial;
+    const int grid = get_number_of_blocks(total, BLOCK);
+
+    add_bias_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(output, biases, batch, filters, spatial, total);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Bias gradient of one filter: bias_updates[filter] += sum of delta over batch and spatial
+// positions. Expects one block of BLOCK threads per filter (filter = blockIdx.x).
+__global__ void backward_bias_kernel(float *bias_updates, float *delta, int batch, int n, int size) {
+    __shared__ float part[BLOCK];  // one partial sum per thread
+    const int filter = blockIdx.x;
+    const int lane = threadIdx.x;
+    float acc = 0;
+    for (int b = 0; b < batch; ++b) {
+        const int base = size * (filter + n * b);  // start of this filter's plane in batch item b
+        for (int off = 0; off < size; off += BLOCK) {
+            const int index = lane + off + base;
+            acc += (lane + off < size) ? delta[index] : 0;  // lanes past the end add 0
+        }
+    }
+    part[lane] = acc;
+    __syncthreads();  // every partial sum must be stored before thread 0 folds them
+    if (lane != 0) return;
+    for (int k = 0; k < BLOCK; ++k) bias_updates[filter] += part[k];
+}
+
+/*
+__global__ void dot_kernel(float *output, float scale, int batch, int n, int size, float *delta)
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    int f1 = index / n;
+    int f2 = index % n;
+    if (f2 <= f1) return;
+
+    float sum = 0;
+    float norm1 = 0;
+    float norm2 = 0;
+    int b, i;
+    for(b = 0; b <  batch; ++b){
+        for(i = 0; i < size; ++i){
+            int i1 = b * size * n + f1 * size + i;
+            int i2 = b * size * n + f2 * size + i;
+            sum += output[i1] * output[i2];
+            norm1 += output[i1] * output[i1];
+            norm2 += output[i2] * output[i2];
+        }
+    }
+    norm1 = sqrt(norm1);
+    norm2 = sqrt(norm2);
+    float norm = norm1 * norm2;
+    sum = sum / norm;
+    for(b = 0; b <  batch; ++b){
+        for(i = 0; i < size; ++i){
+            int i1 = b * size * n + f1 * size + i;
+            int i2 = b * size * n + f2 * size + i;
+            delta[i1] += - scale * sum * output[i2] / norm;
+            delta[i2] += - scale * sum * output[i1] / norm;
+        }
+    }
+}
+
+void dot_error_gpu(layer l)
+{
+    dot_kernel<<<cuda_gridsize(l.n*l.n), BLOCK, 0, get_cuda_stream()>>>(l.output_gpu, l.dot, l.batch, l.n, l.out_w * l.out_h, l.delta_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+*/
+
+// Host wrapper: one block of BLOCK threads per filter (n blocks) accumulates into bias_updates.
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) {
+    backward_bias_kernel<<<n, BLOCK, 0, get_cuda_stream()>>>(bias_updates, delta, batch, n, size);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Adam parameter step: x += rate * m_hat / (sqrt(v_hat) + eps), where m_hat and v_hat are the
+// moments m and v divided by the bias corrections (1 - B1^t) and (1 - B2^t).
+__global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) {
+    const int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx < N) {
+        const float m_hat = m[idx] / (1.f - powf(B1, t));
+        const float v_hat = v[idx] / (1.f - powf(B2, t));
+        x[idx] = x[idx] + rate * m_hat / (sqrtf(v_hat) + eps);
+    }
+}
+
+// Host wrapper: runs adam_kernel over n parameters on the stream returned by get_cuda_stream().
+extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) {
+    adam_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, x, m, v, B1, B2, rate, eps, t);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// One Adam update of weights w from the accumulated gradient buffer d. The calls are order
+// dependent. Per-line notes for scal_ongpu / fill_ongpu assume they wrap scal_kernel / fill_kernel
+// (their definitions are outside this part of the file - confirm).
+extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) {
+    scal_ongpu(n, B1, m, 1);                      // m *= B1
+    scal_ongpu(n, B2, v, 1);                      // v *= B2
+    axpy_ongpu(n, -decay*batch, w, 1, d, 1);      // d += (-decay * batch) * w
+    axpy_ongpu(n, (1 - B1), d, 1, m, 1);          // m += (1 - B1) * d
+    mul_ongpu(n, d, 1, d, 1);                     // d = d * d, element-wise
+    axpy_ongpu(n, (1 - B2), d, 1, v, 1);          // v += (1 - B2) * d (d now holds squares)
+    adam_gpu(n, w, m, v, B1, B2, rate, eps, t);   // bias-corrected step applied to w
+    fill_ongpu(n, 0, d, 1);                       // reset d for the next accumulation
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// In place: x = (x - mean[f]) / sqrt(variance[f] + 1e-5), with f = (index / spatial) % filters.
+__global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N) {
+        const int f = (idx / spatial) % filters;
+        x[idx] = (x[idx] - mean[f]) / (sqrtf(variance[f] + .00001f));
+    }
+}
+
+// Host wrapper: normalises all batch*filters*spatial elements of x in place, one thread each.
+extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) {
+    const int total = batch * filters * spatial;
+    const int grid = get_number_of_blocks(total, BLOCK);
+
+    normalize_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(total, x, mean, variance, batch, filters, spatial);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+__global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
+{   // Rewrites delta in place from the per-filter mean_delta / variance_delta terms; one thread per element.
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    int f = (index/spatial)%filters;  // filter that owns this element
+    // delta = delta/(sqrt(var)+1e-6) + variance_delta*2*(x-mean)/(spatial*batch) + mean_delta/(spatial*batch). The `2.` literal is a double, so the sum is evaluated in double. NOTE(review): epsilon is added outside the sqrt here, unlike normalize_kernel.
+    delta[index] = delta[index] * 1.F/(sqrtf(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+}
+
+// Host wrapper: runs normalize_delta_kernel over all batch*filters*spatial elements of delta.
+extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) {
+    const size_t total = batch*filters*spatial;  // product is evaluated in int, then widened
+    normalize_delta_kernel<<<cuda_gridsize(total), BLOCK, 0, get_cuda_stream()>>>(total, x, mean, variance, mean_delta, variance_delta, batch, filters, spatial, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Naive per-filter reduction, one thread per filter f:
+// variance_delta[f] = -0.5 * (variance[f] + 1e-6)^(-3/2) * sum over batch and spatial of delta * (x - mean[f]).
+__global__ void  variance_delta_kernel(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) {
+    const int f = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (f >= filters) return;
+    float acc = 0;  // register accumulator: one global store instead of a read-modify-write per element
+    for (int b = 0; b < batch; ++b) {
+        for (int s = 0; s < spatial; ++s) {
+            const int index = b * filters * spatial + f * spatial + s;
+            acc += delta[index] * (x[index] - mean[f]);
+        }
+    }
+    variance_delta[f] = acc * (-.5 * powf(variance[f] + .000001f, (float)(-3./2.)));  // same summation order; assumes variance_delta does not alias the inputs
+}
+
+// For each g < groups: zeroes sum[g], then adds x[g], x[groups + g], ..., x[(n-1)*groups + g] to it.
+__global__ void accumulate_kernel(float *x, int n, int groups, float *sum) {
+    const int g = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (g < groups) {
+        sum[g] = 0;
+        for (int row = 0; row < n; ++row) {
+            sum[g] += x[row * groups + g];
+        }
+    }
+}
+
+// Block-parallel mean gradient. One block per filter (blockIdx.x):
+// mean_delta[filter] = -(sum of delta over batch and spatial positions) / sqrt(variance[filter] + 1e-6).
+// Must be launched with blockDim.x == BLOCK, because thread 0 folds exactly BLOCK partial sums.
+__global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) {
+    const int threads = BLOCK;
+    __shared__ float local[threads];  // one partial sum per thread
+
+    const int id = threadIdx.x;
+    const int filter = blockIdx.x;
+    local[id] = 0;
+
+    // Each thread visits every `threads`-th position of the filter's plane; lanes past the end add 0.
+    for (int b = 0; b < batch; ++b) {
+        for (int off = 0; off < spatial; off += threads) {
+            const int index = b * spatial * filters + filter * spatial + off + id;
+            local[id] += (off + id < spatial) ? delta[index] : 0;
+        }
+    }
+    __syncthreads();  // partial sums must be complete before the serial fold below
+    if (id == 0) {
+        // Fold in a register and store once (as fast_mean_kernel does) rather than doing a
+        // global read-modify-write per partial sum; the summation order is unchanged.
+        float total = 0;
+        for (int k = 0; k < threads; ++k) total += local[k];
+        mean_delta[filter] = total * (-1.F / sqrtf(variance[filter] + .000001f));
+    }
+}
+
+// Block-parallel variance gradient. One block per filter (blockIdx.x):
+// variance_delta[filter] = -0.5 * (variance + 1e-6)^(-3/2) * sum over batch and spatial of delta * (x - mean).
+// Must be launched with blockDim.x == BLOCK, because thread 0 folds exactly BLOCK partial sums.
+__global__ void  fast_variance_delta_kernel(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) {
+    const int threads = BLOCK;
+    __shared__ float local[threads];  // one partial sum per thread
+
+    const int id = threadIdx.x;
+    const int filter = blockIdx.x;
+    local[id] = 0;
+
+    // Each thread visits every `threads`-th position of the filter's plane; lanes past the end add 0.
+    for (int b = 0; b < batch; ++b) {
+        for (int off = 0; off < spatial; off += threads) {
+            const int index = b * spatial * filters + filter * spatial + off + id;
+            local[id] += (off + id < spatial) ? delta[index]*(x[index] - mean[filter]) : 0;
+        }
+    }
+    __syncthreads();  // partial sums must be complete before the serial fold below
+
+    if (id == 0) {
+        // Fold in a register and store once (as fast_variance_kernel does) rather than doing a
+        // global read-modify-write per partial sum; the summation order is unchanged.
+        float total = 0;
+        for (int k = 0; k < threads; ++k) total += local[k];
+        variance_delta[filter] = total * (-.5 * powf(variance[filter] + .000001f, (float)(-3./2.)));
+    }
+}
+
+
+// Naive per-filter reduction, one thread per filter f:
+// mean_delta[f] = -(sum of delta over batch and spatial positions) / sqrt(variance[f] + 1e-6).
+__global__ void mean_delta_kernel(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) {
+    const int f = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (f >= filters) return;
+    float acc = 0;  // register accumulator: one global store instead of a read-modify-write per element
+    for (int b = 0; b < batch; ++b) {
+        for (int s = 0; s < spatial; ++s) {
+            const int index = b * filters * spatial + f * spatial + s;
+            acc += delta[index];
+        }
+    }
+    mean_delta[f] = acc * (-1.F / sqrtf(variance[f] + .000001f));  // same summation order; assumes mean_delta does not alias delta
+}
+
+// Host wrapper for the naive reduction: one thread per filter.
+extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) {
+    mean_delta_kernel<<<cuda_gridsize(filters), BLOCK, 0, get_cuda_stream()>>>(delta, variance, batch, filters, spatial, mean_delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper for the block-parallel reduction: one block of BLOCK threads per filter.
+extern "C" void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) {
+    fast_mean_delta_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(delta, variance, batch, filters, spatial, mean_delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper for the block-parallel reduction: one block of BLOCK threads per filter.
+extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) {
+    fast_variance_delta_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, delta, mean, variance, batch, filters, spatial, variance_delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Naive per-filter mean, one thread per filter f:
+// mean[f] = (sum of x over batch and spatial positions) * (1 / (batch * spatial)).
+__global__ void  mean_kernel(float *x, int batch, int filters, int spatial, float *mean) {
+    const int f = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (f >= filters) return;
+
+    float acc = 0;  // register accumulator: one global store instead of a read-modify-write per element
+    for (int b = 0; b < batch; ++b) {
+        for (int s = 0; s < spatial; ++s) {
+            acc += x[b * filters * spatial + f * spatial + s];
+        }
+    }
+    const float scale = 1.F/(batch * spatial);
+    mean[f] = acc * scale;  // same summation order as before; assumes mean does not alias x
+}
+
+// Naive per-filter variance, one thread per filter f: variance[f] = sum (x - mean[f])^2 / (batch*spatial - 1).
+// NOTE(review): the divisor is zero when batch * spatial == 1; fast_variance_kernel divides by batch*spatial instead.
+__global__ void variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance) {
+    const int f = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (f >= filters) return;
+
+    float acc = 0;  // register accumulator: one global store instead of a read-modify-write per element
+    for (int b = 0; b < batch; ++b) {
+        for (int s = 0; s < spatial; ++s) {
+            acc += powf((x[b * filters * spatial + f * spatial + s] - mean[f]), 2);
+        }
+    }
+    const float scale = 1.F/(batch * spatial - 1);
+    variance[f] = acc * scale;  // same summation order as before; assumes variance does not alias x or mean
+}
+
+// Moves elements between a (batch, c, h, w) layout and a (batch, c/stride^2, h*stride, w*stride)
+// layout. One thread per flat index `tid` of the (batch, c, h, w) layout; `other` is the flat
+// index of the matching element in the second layout.
+//   forward != 0: out[other] = x[tid]
+//   forward == 0: out[tid]   = x[other]
+__global__ void reorg_kernel(int N, float *x, int w, int h, int c, int batch, int stride, int forward, float *out) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+
+    // Split the flat index into (b, in_c, in_h, in_w), innermost dimension first.
+    int rest = tid;
+    const int in_w = rest % w;
+    rest /= w;
+    const int in_h = rest % h;
+    rest /= h;
+    const int in_c = rest % c;
+    rest /= c;
+    const int b = rest % batch;
+
+    // in_c = offset * out_c + c2; `offset` picks the position inside each stride x stride cell
+    // of the second layout (column offset % stride, row offset / stride).
+    const int out_c = c / (stride * stride);
+    const int c2 = in_c % out_c;
+    const int offset = in_c / out_c;
+    const int w2 = in_w * stride + offset % stride;
+    const int h2 = in_h * stride + offset / stride;
+    const int other = w2 + w * stride * (h2 + h * stride * (c2 + out_c * b));
+
+    if (forward) out[other] = x[tid];
+    else out[tid] = x[other];
+}
+
+// Limits each weight update to magnitude |weight * coef| while keeping the update's sign.
+// Updates already within the limit are left untouched.
+__global__ void constrain_weight_updates_kernel(int N, float coef, float *weights_gpu, float *weight_updates_gpu) {
+    const int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx >= N) return;
+
+    const float update = weight_updates_gpu[idx];
+    const float limit = fabs(weights_gpu[idx] * coef);
+    const float sign = (update == 0) ? 0 : (fabs(update) / update);  // +1 or -1 for finite non-zero updates
+    if (fabs(update) > limit) weight_updates_gpu[idx] = limit * sign;
+}
+
+// Host wrapper: clamps N weight updates against their weights, one thread per weight.
+extern "C" void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu) {
+    constrain_weight_updates_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, coef, weights_gpu, weight_updates_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX] for i in [0, N).
+__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) Y[OFFY + tid * INCY] += ALPHA * X[OFFX + tid * INCX];
+}
+
+// Y[i*INCY] = X[i*INCX] raised to the power ALPHA, for i in [0, N).
+__global__ void pow_kernel(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) Y[tid * INCY] = powf(X[tid * INCX], ALPHA);
+}
+
+// X[i*INCX] = ALPHA for i in [0, N).
+__global__ void const_kernel(int N, float ALPHA, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) X[tid * INCX] = ALPHA;
+}
+
+// Clamps X[i*INCX] into [-ALPHA, ALPHA] for i in [0, N).
+__global__ void constrain_kernel(int N, float ALPHA, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) X[tid * INCX] = fminf(ALPHA, fmaxf(-ALPHA, X[tid * INCX]));
+}
+// Clamps X[i*INCX] into [MIN, MAX] for i in [0, N).
+__global__ void constrain_min_max_kernel(int N, float MIN, float MAX, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) X[tid * INCX] = fminf(MAX, fmaxf(MIN, X[tid * INCX]));
+}
+
+// Zeroes every X[i*INCX] whose square is smaller than ALPHA squared, for i in [0, N).
+__global__ void supp_kernel(int N, float ALPHA, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    const float value = X[tid * INCX];
+    if ((value * value) < (ALPHA * ALPHA)) X[tid * INCX] = 0;
+}
+
+// X[i*INCX] *= ALPHA for i in [0, N).
+__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) X[tid * INCX] *= ALPHA;
+}
+
+// X[i*INCX] = X[i*INCX] * ALPHA + BETA for i in [0, N).
+__global__ void scal_add_kernel(int N, float ALPHA, float BETA, float *X, int INCX) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) X[tid * INCX] = X[tid * INCX] * ALPHA + BETA;
+}
+
+// X[i*INCX] = ALPHA for i in [0, N). Indexes with blockIdx.x only (1-D grid), unlike const_kernel.
+__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < N) {
+        X[tid * INCX] = ALPHA;
+    }
+}
+
+// Writes `val` into x[i] wherever mask[i] equals mask_num, for i in [0, n).
+__global__ void mask_kernel_new_api(int n, float *x, float mask_num, float *mask, float val) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < n && mask[tid] == mask_num) x[tid] = val;
+}
+
+// Writes mask_num itself into x[i] wherever mask[i] equals mask_num, for i in [0, n).
+__global__ void mask_kernel(int n, float *x, float mask_num, float *mask) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < n && mask[tid] == mask_num) x[tid] = mask_num;
+}
+
+// Strided copy: Y[i*INCY + OFFY] = X[i*INCX + OFFX] for i in [0, N).
+__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) Y[tid * INCY + OFFY] = X[tid * INCX + OFFX];
+}
+
+// Contiguous copy: dst[i] = src[i] for i in [0, size). Indexes with blockIdx.x only (1-D grid).
+__global__ void simple_copy_kernel(int size, float *src, float *dst) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+    dst[tid] = src[tid];
+}
+
+// Element-wise product in place: Y[i*INCY] *= X[i*INCX] for i in [0, N).
+__global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid < N) Y[tid * INCY] *= X[tid * INCX];
+}
+
+
+// Block-parallel per-filter mean. One block per filter (blockIdx.x), BLOCK threads per block:
+// each thread sums every BLOCK-th value of the filter's batch*spatial values, then thread 0
+// adds the BLOCK partial sums and stores total / (spatial * batch) in mean[filter].
+__global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean) {
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    const int id = threadIdx.x;
+    const int filter = blockIdx.x;
+
+    float partial = 0;
+    for (int b = 0; b < batch; ++b) {
+        for (int off = 0; off < spatial; off += threads) {
+            const int index = b * spatial * filters + filter * spatial + off + id;
+            partial += (off + id < spatial) ? x[index] : 0;  // lanes past the end add 0
+        }
+    }
+    local[id] = partial;
+    __syncthreads();  // all partial sums must be stored before the serial fold
+
+    // Only thread 0 continues; there is no barrier after this point.
+    if (id != 0) return;
+    float total = 0;
+    for (int k = 0; k < threads; ++k) {
+        total += local[k];
+    }
+    mean[filter] = total / (spatial * batch);
+}
+
+// Host wrapper: one block of BLOCK threads per filter computes mean[filter].
+extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean) {
+    fast_mean_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, batch, filters, spatial, mean);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Block-parallel per-filter variance. One block per filter (blockIdx.x), BLOCK threads per block:
+// each thread sums (x - mean[filter])^2 over every BLOCK-th value of the filter, then thread 0
+// adds the BLOCK partial sums and stores total / (spatial * batch) in variance[filter].
+__global__ void fast_variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance) {
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    const int id = threadIdx.x;
+    const int filter = blockIdx.x;
+
+    float partial = 0;
+    for (int b = 0; b < batch; ++b) {
+        for (int off = 0; off < spatial; off += threads) {
+            const int index = b * spatial * filters + filter * spatial + off + id;
+            partial += (off + id < spatial) ? powf((x[index] - mean[filter]), 2) : 0;
+        }
+    }
+    local[id] = partial;
+    __syncthreads();  // all partial sums must be stored before the serial fold
+
+    // Only thread 0 continues; there is no barrier after this point.
+    if (id != 0) return;
+    float total = 0;
+    for (int k = 0; k < threads; ++k) {
+        total += local[k];
+    }
+    // The divisor is the full count spatial * batch (not count - 1, as variance_kernel uses).
+    variance[filter] = total / (spatial * batch);
+}
+
+// Host wrapper: one block of BLOCK threads per filter computes variance[filter].
+extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) {
+    fast_variance_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, variance);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Running mean/variance update, one block per filter (blockIdx.x), BLOCK threads per block.
+// After the block-wide sum of x^2, thread 0 performs for its filter:
+//   v_tmp = max(sum(x^2) / (spatial*batch - 1), mean^2)
+//   m_avg = a*mean + (1-a)*m_avg,  v_avg = a*v_tmp + (1-a)*v_avg,  with a = 1 / minibatch_index
+//   mean  = m_avg
+//   var   = max(0, v_avg - m_avg^2), stored as 1/sqrt(var + epsilon) when inverse_variance != 0
+//   rolling_* = alpha*new + (1-alpha)*rolling_*  (each only when its pointer is non-null)
+// max_minibatch_index is accepted but not used.
+__global__ void  fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon)
+{
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    const int id = threadIdx.x;
+    const int filter = blockIdx.x;
+
+    // Per-thread partial sum of x^2 over every `threads`-th value of the filter.
+    float partial = 0;
+    for (int b = 0; b < batch; ++b) {
+        for (int off = 0; off < spatial; off += threads) {
+            const int index = b * spatial * filters + filter * spatial + off + id;
+            partial += (off + id < spatial) ? powf(x[index], 2) : 0;
+        }
+    }
+    local[id] = partial;
+    __syncthreads();
+
+    if (id != 0) return;  // only thread 0 folds the partial sums and updates the statistics
+
+    float v_tmp = 0;
+    for (int k = 0; k < threads; ++k) {
+        v_tmp += local[k];
+    }
+    v_tmp /= (spatial * batch - 1);
+    v_tmp = fmax(v_tmp, powf(mean[filter], 2));
+
+    // Averages weighted by 1 / minibatch_index; the statements below are order dependent.
+    const float alpha_cbn = 1.0f / minibatch_index;
+
+    m_avg[filter] = alpha_cbn * mean[filter] + (1 - alpha_cbn) * m_avg[filter];
+    mean[filter] = m_avg[filter];
+    v_avg[filter] = alpha_cbn * v_tmp + (1 - alpha_cbn) * v_avg[filter];
+
+    const float variance_tmp = fmax(0.0f, v_avg[filter] - powf(m_avg[filter], 2));
+    variance[filter] = inverse_variance ? 1.0f / sqrtf(variance_tmp + epsilon) : variance_tmp;
+
+    // Rolling statistics are refreshed on every call, using the plain (non-inverted) variance.
+    if (rolling_mean_gpu) rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter];
+    if (rolling_variance_gpu) rolling_variance_gpu[filter] = alpha * variance_tmp + (1 - alpha) * rolling_variance_gpu[filter];
+}
+
+// Host wrapper: one block of BLOCK threads per filter runs fast_v_cbn_kernel.
+extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon) {
+    fast_v_cbn_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, minibatch_index, max_minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// dst[i] = 1 / sqrt(src[i] + epsilon) for i in [0, size).
+__global__ void inverse_variance_kernel(int size, float *src, float *dst, float epsilon) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+    dst[tid] = 1.0f / sqrtf(src[tid] + epsilon);
+}
+
+// Host wrapper for inverse_variance_kernel. size / BLOCK + 1 blocks always cover `size` elements;
+// when size is a multiple of BLOCK the extra block is idle thanks to the kernel's bounds check.
+extern "C" void inverse_variance_ongpu(int size, float *src, float *dst, float epsilon) {
+    inverse_variance_kernel<<<size / BLOCK + 1, BLOCK, 0, get_cuda_stream()>>>(size, src, dst, epsilon);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Normalise, scale and shift in one pass: x = scales[f] * norm(x) + biases[f], stored only when the
+// result is neither NaN nor infinite. With inverse_variance set, variance[f] is used as a
+// multiplier; otherwise the kernel divides by sqrt(variance[f] + epsilon).
+__global__ void normalize_scale_bias_kernel(int N, float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, int inverse_variance, float epsilon) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) return;
+    const int f = (idx / spatial) % filters;
+
+    const float centered = x[idx] - mean[f];
+    float val = inverse_variance ? centered * variance[f] : centered / (sqrtf(variance[f] + epsilon));
+    val *= scales[f];
+    val += biases[f];
+
+    if (!(isnan(val) || isinf(val))) x[idx] = val;
+}
+
+// Host wrapper: runs normalize_scale_bias_kernel over all batch*filters*spatial elements of x.
+extern "C" void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, int inverse_variance, float epsilon) {
+    const int total = batch * filters * spatial;
+    const int grid = get_number_of_blocks(total, BLOCK);
+
+    normalize_scale_bias_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(total, x, mean, variance, scales, biases, batch, filters, spatial, inverse_variance, epsilon);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper for the naive per-filter mean: one thread per filter.
+extern "C" void mean_gpu(float *x, int batch, int filters, int spatial, float *mean) {
+    mean_kernel<<<cuda_gridsize(filters), BLOCK, 0, get_cuda_stream()>>>(x, batch, filters, spatial, mean);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper for the naive per-filter variance: one thread per filter.
+extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) {
+    variance_kernel<<<cuda_gridsize(filters), BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, variance);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Y[i*INCY] += ALPHA * X[i*INCX]: forwards to axpy_ongpu_offset with both offsets set to zero.
+extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) {
+    axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
+}
+
+// Host wrapper: Y[i*INCY] = powf(X[i*INCX], ALPHA) for N elements.
+extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) {
+    pow_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX, Y, INCY);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX] for N elements.
+extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) {
+    axpy_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Strided copy Y[i*INCY] = X[i*INCX]: forwards to copy_ongpu_offset with both offsets set to zero.
+extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) {
+    copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
+}
+
+// Host wrapper for simple_copy_kernel. size / BLOCK + 1 blocks always cover `size` elements;
+// when size is a multiple of BLOCK the extra block is idle thanks to the kernel's bounds check.
+extern "C" void simple_copy_ongpu(int size, float *src, float *dst) {
+    simple_copy_kernel<<<size / BLOCK + 1, BLOCK, 0, get_cuda_stream()>>>(size, src, dst);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Asynchronous copy of size_bytes bytes on the stream from get_cuda_stream(), using cudaMemcpyDefault as the copy kind.
+extern "C" void memcpy_ongpu(void *dst, void *src, int size_bytes) {
+    CHECK_CUDA(cudaMemcpyAsync(dst, src, size_bytes, cudaMemcpyDefault, get_cuda_stream()));
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: Y[i*INCY] *= X[i*INCX] for N elements.
+extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY) {
+    mul_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, X, INCX, Y, INCY);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: Y[i*INCY + OFFY] = X[i*INCX + OFFX] for N elements.
+extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) {
+    copy_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Swaps the two inner dimensions of each batch item. i1 addresses element (b, in_c, in_s) in a
+// layers x spatial layout, i2 addresses the same element in a spatial x layers layout.
+// forward != 0 copies x[i1] to out[i2]; forward == 0 copies x[i2] to out[i1].
+__global__ void flatten_kernel(int N, float *x, int spatial, int layers, int batch, int forward, float *out) {
+    const int tid = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+
+    const int in_s = tid % spatial;
+    const int in_c = (tid / spatial) % layers;
+    const int b = tid / spatial / layers;
+    const int base = b * layers * spatial;  // start of batch item b
+    const int i1 = base + in_c * spatial + in_s;
+    const int i2 = base + in_s * layers + in_c;
+
+    if (forward) out[i2] = x[i1]; else out[i1] = x[i2];
+}
+
+// Host wrapper: launches flatten_kernel over spatial*batch*layers elements and
+// checks for a launch error.
+extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out)
+{
+    const int total = spatial*batch*layers;
+    const dim3 grid = cuda_gridsize(total);
+    flatten_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(total, x, spatial, layers, batch, forward, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches reorg_kernel over w*h*c*batch elements and checks for a
+// launch error.
+extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
+{
+    const int total = w*h*c*batch;
+    const dim3 grid = cuda_gridsize(total);
+    reorg_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(total, x, w, h, c, batch, stride, forward, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches mask_kernel_new_api with a grid from cuda_gridsize(N),
+// then checks for a launch error.
+extern "C" void mask_gpu_new_api(int N, float * X, float mask_num, float * mask, float val)
+{
+    const dim3 grid = cuda_gridsize(N);
+    mask_kernel_new_api<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, X, mask_num, mask, val);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches mask_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
+{
+    const dim3 grid = cuda_gridsize(N);
+    mask_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, X, mask_num, mask);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches const_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    const_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches constrain_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    constrain_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches constrain_min_max_kernel with a grid from cuda_gridsize(N),
+// then checks for a launch error.
+extern "C" void constrain_min_max_ongpu(int N, float MIN, float MAX, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    constrain_min_max_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, MIN, MAX, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Host wrapper: launches scal_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    scal_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches scal_add_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void scal_add_ongpu(int N, float ALPHA, float BETA, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    scal_add_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, BETA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches supp_kernel with a grid from cuda_gridsize(N), then
+// checks for a launch error.
+extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX)
+{
+    const dim3 grid = cuda_gridsize(N);
+    supp_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Host wrapper: launches fill_kernel, then checks for a launch error.
+// The grid comes from get_number_of_blocks(N, BLOCK), not cuda_gridsize(N).
+extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
+{
+    const int num_blocks = get_number_of_blocks(N, BLOCK);
+    fill_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(N, ALPHA, X, INCX);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Subtracts the per-filter mean from each filter's f_size values in place.
+// WARP_SIZE consecutive threads cooperate on one filter (f = index / WARP_SIZE);
+// partial sums are combined with warpAllReduceSum().
+// The loops step by WARP_SIZE and index i + lane without a bounds check; the host
+// wrapper only launches this when f_size is a multiple of WARP_SIZE.
+__global__ void gradient_centralization_kernel(int filters, int f_size, float *in)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int lane = index % WARP_SIZE;
+    const int f = index / WARP_SIZE;
+
+    if (f >= filters) return;
+
+    float total = 0;
+    for (int base = 0; base < f_size; base += WARP_SIZE) {
+        total += warpAllReduceSum(in[f*f_size + base + lane]);
+    }
+
+    const float mean = total / f_size;
+    for (int base = 0; base < f_size; base += WARP_SIZE) {
+        in[f*f_size + base + lane] -= mean;
+    }
+}
+
+// Host wrapper for gradient_centralization_kernel. f is the filter count and
+// c*h*w the per-filter size. Silently does nothing unless c*h*w is a multiple
+// of WARP_SIZE.
+extern "C" void gradient_centralization_gpu(int w, int h, int c, int f, float *in)
+{
+    const int size = f * WARP_SIZE;   // one group of WARP_SIZE threads per filter
+    const int f_size = c * h * w;
+    if (f_size % WARP_SIZE != 0) return;
+
+    const int num_blocks = get_number_of_blocks(size, BLOCK);
+    gradient_centralization_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(f, f_size, in);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Returns src when src > 0, otherwise 0.
+__device__ float relu(float src) {
+    return (src > 0) ? src : 0;
+}
+
+// Returns src when src > eps (0.001), otherwise eps. The result is never below eps.
+__device__ float lrelu(float src) {
+    const float eps = 0.001;
+    return (src > eps) ? src : eps;
+}
+
+// Returns 1 when src > 0, otherwise 0.
+__device__ float grad_relu(float src) {
+    return (src > 0) ? 1.0f : 0.0f;
+}
+
+// Returns 1 when src > eps (0.001), otherwise 0.
+__device__ float grad_lrelu(float src) {
+    const float eps = 0.001;
+    return (src > eps) ? 1.0f : 0.0f;
+}
+
+// Unweighted shortcut with a single source layer:
+// out[id] = in[id] + layers_output_gpu[0][...] for elements where the source layer
+// has an output at that position (src_i < outputs_of_layers_gpu[0]).
+// batch, n, weights_gpu, nweights and weights_normalization are unused here; the
+// signature matches shortcut_multilayer_kernel.
+__global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= size) return;
+
+    const int src_i = id % src_outputs;   // position within one batch item
+    const int src_b = id / src_outputs;   // batch item
+
+    float result = in[id];
+
+    const int add_outputs = outputs_of_layers_gpu[0];
+    if (src_i < add_outputs) {
+        const float *add = layers_output_gpu[0];
+        result += add[add_outputs*src_b + src_i];
+    }
+    out[id] = result;
+}
+
+// Forward shortcut over n source layers with optional per-layer weights.
+// One thread per output element id in [0, size):
+//   out[id] = w0 * in[id] + sum over layers i with src_i < outputs_of_layers_gpu[i]
+//             of w(i+1) * layers_output_gpu[i][add_outputs*src_b + src_i]
+// When weights_gpu is NULL every weight is 1. When weights_normalization is
+// RELU_NORMALIZATION or SOFTMAX_NORMALIZATION the (n + 1) weights selected for this
+// element are normalized by `sum` before use.
+// NOTE(review): `step` stays 0 when nweights == 0, and `src_i / step` is evaluated
+// whenever weights_gpu is non-NULL - confirm callers never pass weights with nweights == 0.
+__global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= size) return;
+
+    // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
+    const int layer_step = nweights / (n + 1);    // 1 or l.c or (l.c * l.h * l.w)
+    int step = 0;
+    if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1
+
+    // Split the linear index into position within a batch item (src_i) and batch item (src_b).
+    int src_id = id;
+    const int src_i = src_id % src_outputs;
+    src_id /= src_outputs;
+    int src_b = src_id;
+
+    // Normalization denominator; stays 1 when no normalization is requested.
+    float sum = 1, max_val = -FLT_MAX;
+    if (weights_gpu && weights_normalization) {
+        if (weights_normalization == SOFTMAX_NORMALIZATION) {
+            // Find the largest of the (n + 1) weights so the exponentials below are taken of w - max_val.
+            for (int i = 0; i < (n + 1); ++i) {
+                const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                const float w = weights_gpu[weights_index];
+                if (max_val < w) max_val = w;
+            }
+        }
+        const float eps = 0.0001;
+        sum = eps;  // start from eps so the later division is never by exactly 0
+        for (int i = 0; i < (n + 1); ++i) {
+            const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+            const float w = weights_gpu[weights_index];
+            if (weights_normalization == RELU_NORMALIZATION) sum += lrelu(w);
+            else if (weights_normalization == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
+        }
+    }
+
+    float out_val = 0;
+
+    // Contribution of the layer's own input, using the first weight group.
+    if (weights_gpu) {
+        float w = weights_gpu[src_i / step];
+        if (weights_normalization == RELU_NORMALIZATION) w = lrelu(w) / sum;
+        else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+        out_val = in[id] * w; // [0 or c or (c, h ,w)]
+    }
+    else out_val = in[id];
+
+    // layers
+    for (int i = 0; i < n; ++i) {
+        int add_outputs = outputs_of_layers_gpu[i];
+        // Source layers with fewer outputs than src_i contribute nothing to this element.
+        if (src_i < add_outputs) {
+            int add_index = add_outputs*src_b + src_i;
+
+            float *add = layers_output_gpu[i];
+
+            if (weights_gpu) {
+                // Weight group (i + 1) belongs to source layer i; group 0 is used for `in` above.
+                const int weights_index = src_i / step + (i + 1)*layer_step;  // [0 or c or (c, h ,w)]
+                float w = weights_gpu[weights_index];
+                if (weights_normalization == RELU_NORMALIZATION) w = lrelu(w) / sum;
+                else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+                out_val += add[add_index] * w; // [0 or c or (c, h ,w)]
+            }
+            else out_val += add[add_index];
+        }
+    }
+    out[id] = out_val;
+}
+
+// Host wrapper for the forward multi-layer shortcut over batch * src_outputs elements.
+// Uses shortcut_singlelayer_simple_kernel when there are no weights and exactly one
+// source layer; otherwise uses shortcut_multilayer_kernel.
+extern "C" void shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    const int size = batch * src_outputs;
+    const bool single_unweighted = (nweights == 0) && (n == 1);
+    const dim3 grid = cuda_gridsize(size);
+
+    if (single_unweighted) {
+        shortcut_singlelayer_simple_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(size, src_outputs, batch, n, outputs_of_layers_gpu, layers_output_gpu, out, in, weights_gpu, nweights, weights_normalization);
+    }
+    else {
+        shortcut_multilayer_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(size, src_outputs, batch, n, outputs_of_layers_gpu, layers_output_gpu, out, in, weights_gpu, nweights, weights_normalization);
+    }
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Backward pass of the multi-layer shortcut. One thread per element id in [0, size):
+//   - adds delta_in[id] (scaled by the normalized weight when weights are present)
+//     into delta_out[id] and into each contributing layer's layers_delta_gpu[i];
+//   - when weights are present, accumulates weight gradients into weight_updates_gpu
+//     with atomicAdd, skipping NaN/Inf values.
+// When layer_step == 1 and the thread is not in the trailing warps
+// ((size/32) > (id/32 + 1)), per-thread gradients are first summed with
+// warpAllReduceSum() and only the thread with threadIdx.x % 32 == 0 issues the atomicAdd.
+// NOTE(review): `step` stays 0 when nweights == 0 while `src_i / step` is evaluated
+// whenever weights_gpu is non-NULL - confirm callers never combine the two.
+__global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu,
+    float **layers_delta_gpu, float *delta_out, float *delta_in, float *weights_gpu, float *weight_updates_gpu, int nweights, float *in, float **layers_output_gpu, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= size) return;
+
+    // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
+    const int layer_step = nweights / (n + 1);    // 1 or l.c or (l.c * l.h * l.w)
+    int step = 0;
+    if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1
+
+    // Split the linear index into position within a batch item (src_i) and batch item (src_b).
+    int src_id = id;
+    const int src_i = src_id % src_outputs;
+    src_id /= src_outputs;
+    int src_b = src_id;
+
+    float grad = 1, sum = 1, max_val = -FLT_MAX;
+    int i;  // used by the second loop below; the other loops declare their own `i`
+    if (weights_gpu && weights_normalization) {
+        if (weights_normalization == SOFTMAX_NORMALIZATION) {
+            // Largest of the (n + 1) weights, so the exponentials below are taken of w - max_val.
+            for (int i = 0; i < (n + 1); ++i) {
+                const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+                float w = weights_gpu[weights_index];
+                if (max_val < w) max_val = w;
+            }
+        }
+        const float eps = 0.0001;
+        sum = eps;  // start from eps so the later division is never by exactly 0
+        for (i = 0; i < (n + 1); ++i) {
+            const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
+            const float w = weights_gpu[weights_index];
+            if (weights_normalization == RELU_NORMALIZATION) sum += lrelu(w);
+            else if (weights_normalization == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
+        }
+
+    }
+
+    // Gradient w.r.t. the layer's own input and the first weight group.
+    if (weights_gpu) {
+        float w = weights_gpu[src_i / step];
+        if (weights_normalization == RELU_NORMALIZATION) w = lrelu(w) / sum;
+        else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+        // grad keeps its initial value 1 when no normalization is selected.
+        if (weights_normalization == RELU_NORMALIZATION) grad = w;
+        else if (weights_normalization == SOFTMAX_NORMALIZATION) grad = w*(1-w);
+
+        delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)]
+        float weights_update_tmp = delta_in[id] * in[id] * grad;// / step;
+
+        if (layer_step == 1 && (size/32) > (id/32 + 1)) {
+            // Zero out non-finite contributions before the warp-wide sum.
+            if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) {
+                weights_update_tmp = 0;
+            }
+            float wu = warpAllReduceSum(weights_update_tmp);
+            if (threadIdx.x % 32 == 0) {
+                if (!isnan(wu) && !isinf(wu))
+                    atomicAdd(&weight_updates_gpu[src_i / step], wu);
+            }
+        }
+        else {
+            if (!isnan(weights_update_tmp) && !isinf(weights_update_tmp))
+                atomicAdd(&weight_updates_gpu[src_i / step], weights_update_tmp);
+                //weight_updates_gpu[src_i / step] += weights_update_tmp;
+        }
+    }
+    else delta_out[id] += delta_in[id];
+
+    // layers
+    for (int i = 0; i < n; ++i) {
+        int add_outputs = outputs_of_layers_gpu[i];
+        // Source layers with fewer outputs than src_i receive no gradient from this element.
+        // NOTE(review): the warpAllReduceSum() call below sits inside this per-thread
+        // condition - confirm the helper tolerates lanes that skip the branch.
+        if (src_i < add_outputs) {
+            int add_index = add_outputs*src_b + src_i;
+
+            float *layer_delta = layers_delta_gpu[i];
+            if (weights_gpu) {
+                float *add = layers_output_gpu[i];
+
+                // Weight group (i + 1) belongs to source layer i.
+                const int weights_index = src_i / step + (i + 1)*layer_step;  // [0 or c or (c, h ,w)]
+                float w = weights_gpu[weights_index];
+                if (weights_normalization == RELU_NORMALIZATION) w = lrelu(w) / sum;
+                else if (weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+
+                if (weights_normalization == RELU_NORMALIZATION) grad = w;
+                else if (weights_normalization == SOFTMAX_NORMALIZATION) grad = w*(1 - w);
+
+                layer_delta[add_index] += delta_in[id] * w;
+                float weights_update_tmp = delta_in[id] * add[add_index] * grad;// / step;
+
+                if (layer_step == 1 && (size / 32) > (id / 32 + 1)) {
+                    if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) {
+                        weights_update_tmp = 0;
+                    }
+                    float wu = warpAllReduceSum(weights_update_tmp);
+                    if (threadIdx.x % 32 == 0) {
+                        if (!isnan(wu) && !isinf(wu))
+                            atomicAdd(&weight_updates_gpu[weights_index], wu);
+                        //if(weights_gpu[weights_index] != 1) printf(" wu = %f, weights_update_tmp = %f, w = %f, weights_gpu[weights_index] = %f, grad = %f, weights_normalization = %d ",
+                        //    wu, weights_update_tmp, w, weights_gpu[weights_index], grad, weights_normalization);
+                    }
+                }
+                else {
+                    if (!isnan(weights_update_tmp) && !isinf(weights_update_tmp))
+                        atomicAdd(&weight_updates_gpu[weights_index], weights_update_tmp);
+                        //weight_updates_gpu[weights_index] += weights_update_tmp;
+                }
+            }
+            else layer_delta[add_index] += delta_in[id];
+        }
+    }
+}
+
+// Host wrapper: launches backward_shortcut_multilayer_kernel over batch * src_outputs
+// elements and checks for a launch error. All arguments are passed through unchanged.
+extern "C" void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu,
+    float **layers_delta_gpu, float *delta_out, float *delta_in, float *weights_gpu, float *weight_updates_gpu, int nweights, float *in, float **layers_output_gpu, WEIGHTS_NORMALIZATION_T weights_normalization)
+{
+    const int size = batch * src_outputs;
+    const dim3 grid = cuda_gridsize(size);
+    backward_shortcut_multilayer_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(
+        size, src_outputs, batch, n, outputs_of_layers_gpu,
+        layers_delta_gpu, delta_out, delta_in, weights_gpu, weight_updates_gpu,
+        nweights, in, layers_output_gpu, weights_normalization);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Accumulates `add` (dimensions w1 x h1 x c1) into `out` (dimensions w2 x h2 x c2)
+// over the shared minw x minh x minc region of every batch item.
+// `stride` scales the coordinates used to index `add`; `sample` scales those used to index `out`.
+__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    // Peel (i, j, k, b) off the linear index, fastest-varying first.
+    int rest = tid;
+    const int i = rest % minw;
+    rest /= minw;
+    const int j = rest % minh;
+    rest /= minh;
+    const int k = rest % minc;
+    rest /= minc;
+    const int b = rest % batch;
+
+    const int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
+    const int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
+    out[out_index] += add[add_index];
+}
+
+// Host wrapper for shortcut_kernel: adds `add` (w1 x h1 x c1) into `out` (w2 x h2 x c2)
+// over the overlapping region. stride = w1/w2 and sample = w2/w1 (integer division),
+// each clamped to at least 1. The asserts require the height ratios to match the width ratios.
+extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    const int minw = (w2 > w1) ? w1 : w2;
+    const int minh = (h2 > h1) ? h1 : h2;
+    const int minc = (c2 > c1) ? c1 : c2;
+
+    int stride = w1 / w2;
+    int sample = w2 / w1;
+    assert(stride == h1 / h2);
+    assert(sample == h2 / h1);
+    stride = (stride < 1) ? 1 : stride;
+    sample = (sample < 1) ? 1 : sample;
+
+    const int size = batch * minw * minh * minc;
+    const dim3 grid = cuda_gridsize(size);
+    shortcut_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Element-wise out[id] = in[id] + add[id] for id in [0, size).
+__global__ void simple_input_shortcut_kernel(float *in, int size, float *add, float *out)
+{
+    const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id < size) {
+        out[id] = in[id] + add[id];
+    }
+}
+
+// Like shortcut_kernel, but writes out = in + add instead of accumulating into out.
+// Only elements of the minw x minh x minc overlap region are written.
+__global__ void input_shortcut_kernel(float *in, int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= size) return;
+
+    // Peel (i, j, k, b) off the linear index, fastest-varying first.
+    int rest = tid;
+    const int i = rest % minw;
+    rest /= minw;
+    const int j = rest % minh;
+    rest /= minh;
+    const int k = rest % minc;
+    rest /= minc;
+    const int b = rest % batch;
+
+    const int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
+    const int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
+    out[out_index] = in[out_index] + add[add_index];
+}
+
+// Computes out = in + add, where `add` is w1 x h1 x c1 and in/out are w2 x h2 x c2.
+// Identical shapes take a single element-wise kernel. Otherwise `in` is first copied
+// to `out` in full and shortcut_kernel then accumulates `add` over the overlap region.
+extern "C" void input_shortcut_gpu(float *in, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    const bool same_shape = (w1 == w2) && (h1 == h2) && (c1 == c2);
+    if (same_shape) {
+        const int total = batch * w1 * h1 * c1;
+        simple_input_shortcut_kernel<<<cuda_gridsize(total), BLOCK, 0, get_cuda_stream()>>>(in, total, add, out);
+        CHECK_CUDA(cudaPeekAtLastError());
+        return;
+    }
+
+    const int minw = (w2 > w1) ? w1 : w2;
+    const int minh = (h2 > h1) ? h1 : h2;
+    const int minc = (c2 > c1) ? c1 : c2;
+
+    int stride = w1 / w2;
+    int sample = w2 / w1;
+    assert(stride == h1 / h2);
+    assert(sample == h2 / h1);
+    stride = (stride < 1) ? 1 : stride;
+    sample = (sample < 1) ? 1 : sample;
+
+    const int size = batch * minw * minh * minc;
+
+    // Copy first, then accumulate: elements outside the overlap keep the value of `in`.
+    simple_copy_ongpu(w2 * h2 * c2 * batch, in, out);
+    shortcut_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream()>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Per-element loss on diff = truth - pred:
+//   |diff| < 1 : error = diff^2,        delta = diff
+//   otherwise  : error = 2*|diff| - 1,  delta = sign(diff) (+1 when diff is not negative)
+__global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    const float diff = truth[i] - pred[i];
+    const float abs_val = abs(diff);
+    if (abs_val < 1) {
+        error[i] = diff * diff;
+        delta[i] = diff;
+        return;
+    }
+    error[i] = 2*abs_val - 1;
+    delta[i] = (diff < 0) ? -1 : 1;
+}
+
+// Host wrapper: launches smooth_l1_kernel with a grid from cuda_gridsize(n), then
+// checks for a launch error.
+extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const dim3 grid = cuda_gridsize(n);
+    smooth_l1_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(n, pred, truth, delta, error);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Per-element cross-entropy: error = -log(pred) where truth is non-zero, else 0.
+// delta = truth - pred.
+__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    const float t = truth[i];
+    const float p = pred[i];
+    error[i] = (t) ? -log(p) : 0;
+    delta[i] = t - p;
+}
+
+// Host wrapper: launches softmax_x_ent_kernel with a grid from cuda_gridsize(n),
+// then checks for a launch error.
+extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const dim3 grid = cuda_gridsize(n);
+    softmax_x_ent_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(n, pred, truth, delta, error);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Per-element squared error: error = (truth - pred)^2, delta = truth - pred.
+// delta is the plain difference, without the factor of 2 that the derivative of the
+// squared error would carry.
+__global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    const float diff = truth[i] - pred[i];
+    error[i] = diff * diff;
+    delta[i] = diff;
+}
+
+// Host wrapper: launches l2_kernel with a grid from cuda_gridsize(n), then checks
+// for a launch error.
+extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    const dim3 grid = cuda_gridsize(n);
+    l2_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(n, pred, truth, delta, error);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Per-element blend: c = s*a + (1 - s)*b. A NULL `b` is treated as all zeros.
+__global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *c)
+{
+    const int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    c[i] = s[i]*a[i] + (1-s[i])*(b ? b[i] : 0);
+}
+
+// Host wrapper: launches weighted_sum_kernel over `num` elements, then checks for
+// a launch error.
+extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c)
+{
+    const dim3 grid = cuda_gridsize(num);
+    weighted_sum_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(num, a, b, s, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Backward pass of c = s*a + (1 - s)*b, accumulating into the gradient buffers:
+//   da += dc * s            (skipped when da is NULL)
+//   db += dc * (1 - s)      (skipped when db is NULL)
+//   ds += dc*a + dc*(-b)    (a NULL b is treated as all zeros)
+// weighted_sum_kernel accepts a NULL `b`, so this kernel must not dereference
+// `b` or `db` unconditionally.
+__global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float *da, float *db, float *ds, float *dc)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        if(da) da[i] += dc[i] * s[i];
+        if(db) db[i] += dc[i] * (1-s[i]);
+        const float b_val = b ? b[i] : 0;
+        ds[i] += dc[i] * a[i] + dc[i] * -b_val;
+    }
+}
+
+// Host wrapper: launches weighted_delta_kernel over `num` elements, then checks
+// for a launch error.
+extern "C" void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc)
+{
+    const dim3 grid = cuda_gridsize(num);
+    weighted_delta_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(num, a, b, s, da, db, ds, dc);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Per-element multiply-accumulate: c[i] += a[i] * b[i] for i in [0, n).
+__global__ void mult_add_into_kernel(int n, float *a, float *b, float *c)
+{
+    const int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    c[i] += a[i]*b[i];
+}
+
+// Host wrapper: launches mult_add_into_kernel over `num` elements, then checks
+// for a launch error.
+extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
+{
+    const dim3 grid = cuda_gridsize(num);
+    mult_add_into_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(num, a, b, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Softmax with temperature over n contiguous floats:
+//   output[i] = exp(input[i]/temp - largest/temp) / sum_j exp(input[j]/temp - largest/temp)
+// where `largest` is the maximum input. Subtracting the maximum keeps every
+// exponent <= 0 when temp is positive.
+// The running maximum is held in a float. An int temporary would truncate each
+// input toward zero, so `largest` could differ from the true maximum, and the
+// conversion is undefined for values outside the int range.
+__device__ void softmax_device(int n, float *input, float temp, float *output)
+{
+    int i;
+    float sum = 0;
+    float largest = -INFINITY;
+    for(i = 0; i < n; ++i){
+        float val = input[i];
+        largest = (val>largest) ? val : largest;
+    }
+    // Unnormalized exponentials; accumulate the normalizer as we go.
+    for(i = 0; i < n; ++i){
+        float e = exp(input[i]/temp - largest/temp);
+        sum += e;
+        output[i] = e;
+    }
+    for(i = 0; i < n; ++i){
+        output[i] /= sum;
+    }
+}
+
+// One thread per batch item b: runs softmax_device over n values starting at
+// input + b*offset, writing to the same position in output.
+__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output)
+{
+    const int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (b < batch) {
+        const int start = b*offset;
+        softmax_device(n, input + start, temp, output + start);
+    }
+}
+
+// Host wrapper: launches softmax_kernel with one thread per group (`groups` is
+// passed as the kernel's batch count and `n` as its input count), then checks for
+// a launch error.
+extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
+{
+    const dim3 grid = cuda_gridsize(groups);
+    softmax_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(n, offset, groups, input, temp, output);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Softmax with temperature over n floats spaced `stride` elements apart:
+//   output[i*stride] = expf(input[i*stride]/temp - largest/temp) / sum
+// where `largest` is the maximum of the n inputs. Subtracting the maximum keeps
+// every exponent <= 0 when temp is positive.
+// The running maximum is held in a float. An int temporary would truncate each
+// input toward zero, so `largest` could differ from the true maximum, and the
+// conversion is undefined for values outside the int range.
+__device__ void softmax_device_new_api(float *input, int n, float temp, int stride, float *output)
+{
+    int i;
+    float sum = 0;
+    float largest = -INFINITY;
+    for (i = 0; i < n; ++i) {
+        float val = input[i*stride];
+        largest = (val>largest) ? val : largest;
+    }
+    // Unnormalized exponentials; accumulate the normalizer as we go.
+    for (i = 0; i < n; ++i) {
+        float e = expf(input[i*stride] / temp - largest / temp);
+        sum += e;
+        output[i*stride] = e;
+    }
+    for (i = 0; i < n; ++i) {
+        output[i*stride] /= sum;
+    }
+}
+
+// One thread per (batch item, group) pair: runs softmax_device_new_api on the n
+// strided values starting at b*batch_offset + g*group_offset.
+__global__ void softmax_kernel_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= batch*groups) return;
+
+    const int b = id / groups;
+    const int g = id % groups;
+    float *src = input + b*batch_offset + g*group_offset;
+    float *dst = output + b*batch_offset + g*group_offset;
+    softmax_device_new_api(src, n, temp, stride, dst);
+}
+
+// Host wrapper: launches softmax_kernel_new_api with one thread per
+// (batch item, group) pair, then checks for a launch error.
+extern "C" void softmax_gpu_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    const dim3 grid = cuda_gridsize(batch*groups);
+    softmax_kernel_new_api<<<grid, BLOCK, 0, get_cuda_stream()>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// One thread per element of the (w*stride) x (h*stride) x c x batch tensor `out`.
+// Each such element maps to the element of `x` at (out_w/stride, out_h/stride, c, b).
+//   forward != 0 : out[out_index] += scale * x[in_index]
+//   forward == 0 : x[in_index]    += scale * out[out_index]   (atomicAdd: several
+//                  out elements map to the same x element)
+__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    const size_t tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+
+    const int out_index = tid;
+
+    // Peel (out_w, out_h, out_c, b) off the linear index, fastest-varying first.
+    size_t rest = tid;
+    const int out_w = rest % (w*stride);
+    rest = rest / (w*stride);
+    const int out_h = rest % (h*stride);
+    rest = rest / (h*stride);
+    const int out_c = rest%c;
+    rest = rest / c;
+    const int b = rest%batch;
+
+    const int in_w = out_w / stride;
+    const int in_h = out_h / stride;
+    const int in_c = out_c;
+
+    const int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;
+
+    if (forward) {
+        out[out_index] += scale * x[in_index];
+    }
+    else {
+        atomicAdd(x + in_index, scale * out[out_index]);
+    }
+}
+
+// Host wrapper: launches upsample_kernel with one thread per element of the
+// upsampled tensor (w*h*c*batch*stride*stride), then checks for a launch error.
+extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    const size_t total = w*h*c*batch*stride*stride;
+    upsample_kernel<<<cuda_gridsize(total), BLOCK, 0, get_cuda_stream()>>>(total, in, w, h, c, batch, stride, forward, scale, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// One thread per (spatial position, group, batch item): runs softmax_device_new_api
+// over group_size[g] values spaced `spatial` apart, starting at
+// group_offset[g]*spatial + b*stride + s.
+__global__ void softmax_tree_kernel(float *input, int spatial, int batch, int stride, float temp, float *output, int groups, int *group_size, int *group_offset)
+{
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= spatial*batch*groups) return;
+
+    const int s = tid % spatial;
+    const int rest = tid / spatial;
+    const int g = rest % groups;
+    const int b = rest / groups;
+
+    const int goff = group_offset[g] * spatial;
+    const int boff = b*stride;
+    float *src = input + goff + boff + s;
+    float *dst = output + goff + boff + s;
+    softmax_device_new_api(src, group_size[g], temp, spatial, dst);
+}
+
+// Host wrapper for softmax_tree_kernel. On every call, int arrays are built from
+// hier.group_size and hier.group_offset via cuda_make_int_array_new_api()
+// (presumably device copies - confirm against that helper), and released with
+// cuda_free() after the launch.
+extern "C" void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier)
+{
+    int *tree_groups_size = cuda_make_int_array_new_api(hier.group_size, hier.groups);
+    int *tree_groups_offset = cuda_make_int_array_new_api(hier.group_offset, hier.groups);
+
+    const int num = spatial*batch*hier.groups;
+    const dim3 grid = cuda_gridsize(num);
+    softmax_tree_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset);
+    CHECK_CUDA(cudaPeekAtLastError());
+
+    // cuda_free() takes a float pointer, hence the casts.
+    cuda_free((float *)tree_groups_size);
+    cuda_free((float *)tree_groups_offset);
+}
+
+
+// Replaces every NaN/Inf element of `input` with 1 / (|index| + 1), a small
+// index-dependent value.
+__global__ void fix_nan_and_inf_kernel(float *input, size_t size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const float val = input[index];
+    const bool bad = isnan(val) || isinf(val);
+    if (bad) {
+        input[index] = 1.0f / (fabs((float)index) + 1);  // pseudo random value
+    }
+}
+
+// Host wrapper: launches fix_nan_and_inf_kernel over `size` elements, then checks
+// for a launch error. Does not synchronize.
+extern "C" void fix_nan_and_inf(float *input, size_t size)
+{
+    const int threads = BLOCK;
+    const int blocks = get_number_of_blocks(size, threads);
+    fix_nan_and_inf_kernel<<<blocks, threads, 0, get_cuda_stream()>>>(input, size);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Replaces every NaN/Inf element of `input` with 0.
+__global__ void reset_nan_and_inf_kernel(float *input, size_t size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const float val = input[index];
+    const bool bad = isnan(val) || isinf(val);
+    if (bad) {
+        input[index] = 0;
+    }
+}
+
+// Host wrapper: launches reset_nan_and_inf_kernel over `size` elements, then
+// checks for a launch error. Does not synchronize.
+extern "C" void reset_nan_and_inf(float *input, size_t size)
+{
+    const int threads = BLOCK;
+    const int blocks = get_number_of_blocks(size, threads);
+    reset_nan_and_inf_kernel<<<blocks, threads, 0, get_cuda_stream()>>>(input, size);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Sets *pinned_return to 1 if any element of `input` is NaN or Inf. The flag is
+// never cleared here; the caller must zero it before the launch.
+__global__ void is_nan_or_inf_kernel(float *input, size_t size, int *pinned_return)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const float val = input[index];
+    if (isnan(val) || isinf(val)) {
+        *pinned_return = 1;
+    }
+}
+
+// Returns 1 if any of the `size` floats at `input` is NaN or Inf, otherwise 0.
+// The result flag lives in a host allocation that the kernel writes directly.
+// This call blocks: it waits on cudaDeviceSynchronize() before reading the flag.
+extern "C" int is_nan_or_inf(float *input, size_t size)
+{
+    int *pinned_return;
+    // cudaHostAllocMapped (not cudaHostRegisterMapped, which is a cudaHostRegister()
+    // flag) is the flag documented for cudaHostAlloc().
+    CHECK_CUDA(cudaHostAlloc(&pinned_return, sizeof(int), cudaHostAllocMapped));
+    *pinned_return = 0;
+
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    is_nan_or_inf_kernel <<<num_blocks, block_size, 0, get_cuda_stream() >>>(input, size, pinned_return);
+    // Report launch-configuration errors at the launch site, like the other wrappers in this file.
+    CHECK_CUDA(cudaPeekAtLastError());
+    // The flag may only be read once the kernel has finished.
+    CHECK_CUDA(cudaDeviceSynchronize());
+    int ret_val = *pinned_return;
+
+    CHECK_CUDA(cudaFreeHost(pinned_return));
+    return ret_val;
+}
+
+// dst[index] = activation(a1[index] + a2[index] + a3[index]); NULL inputs are skipped.
+// LOGISTIC, TANH and LEAKY (slope 0.1 for negative values) are applied; any other
+// activation value leaves the sum unchanged.
+__global__ void add_3_arrays_activate_kernel(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    float val = 0;
+    if (a1) val += a1[index];
+    if (a2) val += a2[index];
+    if (a3) val += a3[index];
+
+    switch (a) {
+    case LOGISTIC:
+        val = 1.f / (1.f + expf(-val));
+        break;
+    case TANH:
+        val = (2 / (1 + expf(-2 * val)) - 1);
+        break;
+    case LEAKY:
+        val = (val < 0) ? val*0.1 : val;
+        break;
+    default:
+        break;
+    }
+    dst[index] = val;
+}
+
+// Host wrapper for add_3_arrays_activate_kernel: dst = activation(a1 + a2 + a3)
+// over `size` elements. Accepts LOGISTIC, TANH, LEAKY and LINEAR; any other
+// activation is reported through error().
+extern "C" void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    if (!(a == LOGISTIC || a == TANH || a == LEAKY || a == LINEAR)) {
+        // The message lists every activation the check above accepts.
+        error("Error: add_3_arrays_activate() supports only LOGISTIC, TANH, LEAKY and LINEAR", DARKNET_LOC);
+    }
+    add_3_arrays_activate_kernel <<<num_blocks, block_size, 0, get_cuda_stream() >>>(a1, a2, a3, size, a, dst);
+    // Check the launch, consistent with the other wrappers in this file.
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Per-element dst = a1*a2 + b1*b2 for index in [0, size).
+__global__ void sum_of_mults_kernel(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    dst[index] = a1[index] * a2[index] + b1[index] * b2[index];
+}
+
+// Host wrapper: launches sum_of_mults_kernel (dst = a1*a2 + b1*b2) over `size`
+// elements on the stream returned by get_cuda_stream().
+extern "C" void sum_of_mults(float *a1, float *a2, float *b1, float *b2,  size_t size, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    sum_of_mults_kernel <<<num_blocks, block_size, 0, get_cuda_stream() >>>(a1, a2, b1, b2, size, dst);
+    // Check the launch, consistent with the other wrappers in this file.
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// dst[index] = activation(a1[index]) * a2[index].
+// TANH and LEAKY (slope 0.1 for negative values) are applied; any other activation
+// value leaves a1[index] unchanged.
+__global__ void activate_and_mult_kernel(float *a1, float *a2, size_t size, ACTIVATION a, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    float val = a1[index];
+    switch (a) {
+    case TANH:
+        val = (2 / (1 + expf(-2 * val)) - 1);
+        break;
+    case LEAKY:
+        val = (val < 0) ? val*0.1 : val;
+        break;
+    default:
+        break;
+    }
+    dst[index] = val * a2[index];
+}
+
+// Host wrapper for activate_and_mult_kernel: dst = activation(a1) * a2 over `size`
+// elements. Accepts TANH, LEAKY and LINEAR; any other activation is reported
+// through error().
+extern "C" void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    if (!(a == TANH || a == LEAKY || a == LINEAR)) {
+        // The message names this function correctly and lists every accepted activation.
+        error("Error: activate_and_mult() supports only TANH, LEAKY and LINEAR", DARKNET_LOC);
+    }
+    activate_and_mult_kernel <<<num_blocks, block_size, 0, get_cuda_stream() >>>(a1, a2, size, a, dst);
+    // Check the launch, consistent with the other wrappers in this file.
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Per-element out = in_w_h_c * scale, where the scale index depends on scale_wh:
+//   scale_wh != 0 : scales_c[index % channel_size + (index / batch_size)*channel_size]
+//   scale_wh == 0 : scales_c[index / channel_size]
+__global__ void scale_channels_kernel(float *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh, float *scales_c, float *out)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    int scale_index;
+    if (scale_wh) {
+        scale_index = index % channel_size + (index / batch_size)*channel_size;
+    }
+    else {
+        scale_index = index / channel_size;
+    }
+    out[index] = in_w_h_c[index] * scales_c[scale_index];
+}
+
+// Host-side launcher for scale_channels_kernel on the stream returned by
+// get_cuda_stream(); a launch error is reported through CHECK_CUDA.
+extern "C" void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh, float *scales_c, float *out)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    scale_channels_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>>(in_w_h_c, size, channel_size, batch_size, scale_wh, scales_c, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+
+// Backward pass matching scale_channels_kernel. For every element `index` it
+// accumulates the scale gradient (l.delta * from) into out_state_delta and the
+// input gradient (scale * l.delta) into out_from_delta.
+//   scale_wh != 0: scale index = index % channel_size + (index / batch_size)*channel_size,
+//                  and the scale gradient is divided by channel_size.
+//   scale_wh == 0: scale index = index / channel_size; when a whole group of 32
+//                  consecutive indices maps to the same scale, the products are
+//                  first combined with warpAllReduceSum and added once.
+__global__ void backward_scale_channels_kernel(float *in_w_h_c_delta, int size, int channel_size, int batch_size, int scale_wh,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (index < size) {
+
+        if (scale_wh)
+        {
+            int osd_index = index % channel_size + (index / batch_size)*channel_size;
+
+            //out_state_delta[osd_index] += in_w_h_c_delta[index] * in_from_output[index]; // l.delta * from  (should be divided by channel_size?)
+            // several threads can share osd_index, hence the atomic accumulation
+            atomicAdd(&out_state_delta[osd_index], in_w_h_c_delta[index] * in_from_output[index] / channel_size); // l.delta * from
+
+            out_from_delta[index] += in_scales_c[osd_index] * in_w_h_c_delta[index]; // input * l.delta  // atomic isn't required here
+
+        }
+        else {
+            int osd_index = index / channel_size;
+            //out_state_delta[osd_index] += in_w_h_c_delta[index] * in_from_output[index]; // l.delta * from  (should be divided by channel_size?)
+
+            // Scale indices of the first and last element in this group of 32 indices.
+            // NOTE(review): the grouping uses the global index, so it matches hardware
+            // warps only if blockDim.x is a multiple of 32 - confirm against BLOCK.
+            int warp_id = index / 32;
+            int index_warp_start = warp_id * 32;
+            int osd_index_warp_start = index_warp_start / channel_size;
+            int osd_index_warp_end = (index_warp_start + 31) / channel_size;
+
+            if (osd_index_warp_start == osd_index_warp_end) // all thread in warp process the same channel
+            {
+                // warpAllReduceSum is defined elsewhere; presumably it returns the sum over
+                // the lanes of the warp - confirm.
+                // NOTE(review): lanes with index >= size never reach this call, so a
+                // partially filled last warp does not have every lane participating -
+                // confirm the helper tolerates that.
+                float sum = warpAllReduceSum(in_w_h_c_delta[index] * in_from_output[index]); // l.delta * from
+                if (threadIdx.x % 32 == 0) {
+                    // a single atomic per group instead of one per thread
+                    atomicAdd(&out_state_delta[osd_index], sum);
+                    //out_state_delta[osd_index] += sum;
+                }
+            }
+            else {
+                // the group straddles a channel boundary: fall back to one atomic per element
+                atomicAdd(&out_state_delta[osd_index], in_w_h_c_delta[index] * in_from_output[index]); // l.delta * from
+            }
+
+            out_from_delta[index] += in_scales_c[osd_index] * in_w_h_c_delta[index]; // input * l.delta  // atomic isn't required here
+        }
+    }
+}
+
+// Host-side launcher for backward_scale_channels_kernel on the stream returned
+// by get_cuda_stream(); a launch error is reported through CHECK_CUDA.
+extern "C" void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, int batch_size, int scale_wh,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    backward_scale_channels_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (
+        in_w_h_c_delta, size, channel_size, batch_size, scale_wh,
+        in_scales_c, out_from_delta, in_from_output, out_state_delta);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Element-wise product: out[k] = in_w_h_c[k] * scales_c[k] for k < size.
+// channel_size is accepted but not used.
+__global__ void sam_kernel(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out)
+{
+    const int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k >= size) return;
+
+    out[k] = in_w_h_c[k] * scales_c[k];
+}
+
+// Host-side launcher for sam_kernel on the stream returned by get_cuda_stream();
+// a launch error is reported through CHECK_CUDA.
+extern "C" void sam_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    sam_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>>(in_w_h_c, size, channel_size, scales_c, out);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Backward pass matching sam_kernel. For every k < size:
+//   out_state_delta[k] += l.delta[k] * from[k]
+//   out_from_delta[k]  += scale[k]   * l.delta[k]
+// channel_size is accepted but not used.
+__global__ void backward_sam_kernel(float *in_w_h_c_delta, int size, int channel_size,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta)
+{
+    const int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k >= size) return;
+
+    // gradient w.r.t. the scale input (the original author asked whether this
+    // should be divided by channel_size)
+    out_state_delta[k] += in_w_h_c_delta[k] * in_from_output[k];
+
+    // gradient w.r.t. the scaled input
+    out_from_delta[k] += in_scales_c[k] * in_w_h_c_delta[k];
+}
+
+// Host-side launcher for backward_sam_kernel on the stream returned by
+// get_cuda_stream(); a launch error is reported through CHECK_CUDA.
+extern "C" void backward_sam_gpu(float *in_w_h_c_delta, int size, int channel_size,
+    float *in_scales_c, float *out_from_delta,
+    float *in_from_output, float *out_state_delta)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    backward_sam_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (
+        in_w_h_c_delta, size, channel_size,
+        in_scales_c, out_from_delta, in_from_output, out_state_delta);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Rotates each kernel_size x kernel_size filter of src_weight_gpu by `angle`
+// degrees (negated when `reverse` is set) around its centre, using bilinear
+// interpolation, and writes the result to weight_deform_gpu.
+// One thread handles one filter; i = index * kernel_area is that filter's first weight.
+// Source samples outside the filter are skipped; their interpolation weights are
+// summed in dropout_sum, which is then used to rescale the whole filter.
+// The parameter `n` is not used.
+__global__  void smooth_rotate_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int angle, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    //const int stage_step = (nweights / kernel_area) / 4;  // 4 stages
+    //const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+        // rotate left or right
+        if (reverse) angle = -angle;
+
+        // angle is given in degrees; convert to radians for cosf/sinf
+        const float cos_a = cosf(angle * 3.14159265 / 180);
+        const float sin_a = sinf(angle * 3.14159265 / 180);
+        // rotation centre (integer division, so the centre cell for odd kernel_size)
+        const int x_c = kernel_size / 2;
+        const int y_c = kernel_size / 2;
+
+        float dropout_sum = 0;
+
+        for (int y = 0; y < kernel_size; ++y) {
+            for (int x = 0; x < kernel_size; ++x) {
+                // Xsource = x*cos(alpha) + y*sin(alpha)
+                // Ysource = -x*sin(alpha) + y*cos(alpha)
+
+                float x_s = x_c + (x - x_c)*cos_a + (y - y_c)*sin_a;
+                float y_s = y_c - (x - x_c)*sin_a + (y - y_c)*cos_a;
+
+                // the four integer neighbours of the source point; when the source lies
+                // exactly on a grid line the second neighbour is forced one cell further
+                int x_0 = floorf(x_s);   // round down
+                int x_1 = ceilf(x_s);    // round up
+                if (x_0 == x_1) x_1 = x_0 + 1;
+                int y_0 = floorf(y_s);
+                int y_1 = ceilf(y_s);
+                if (y_0 == y_1) y_1 = y_0 + 1;
+
+                // bilinear interpolation coefficients (distance to the opposite neighbour)
+                float c_x_0 = x_1 - x_s;
+                float c_x_1 = x_s - x_0;
+                float c_y_0 = y_1 - y_s;
+                float c_y_1 = y_s - y_0;
+
+
+                // accumulate in-range neighbours; count the weight of out-of-range ones
+                float val = 0;
+                if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                else dropout_sum += c_x_0 * c_y_0;
+
+                if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                else dropout_sum += c_x_1 * c_y_0;
+
+                if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                else dropout_sum += c_x_0 * c_y_1;
+
+                if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                else dropout_sum += c_x_1 * c_y_1;
+
+                weight_deform_gpu[x + y*kernel_size + i] = val;
+            }
+        }
+
+        // compensate for dropped items
+        const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+        for (int y = 0; y < kernel_size; ++y) {
+            for (int x = 0; x < kernel_size; ++x) {
+                weight_deform_gpu[x + y*kernel_size + i] *= coef;
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for smooth_rotate_weights_kernel. The grid is sized for
+// one thread per filter (nweights / (size*size) threads); a launch error is
+// reported through CHECK_CUDA.
+extern "C" void smooth_rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse)
+{
+    const int threads_per_block = BLOCK;
+    const int filter_area = size*size;
+    const int filter_count = nweights / filter_area;
+    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);
+
+    smooth_rotate_weights_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, angle, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Rescales ("stretches") each kernel_size x kernel_size filter around its centre
+// with bilinear interpolation. One thread handles one filter; filters are split
+// by index into stages of stage_step filters each:
+//   stage 0      - plain copy
+//   stage 1/2/3  - fixed scales 0.65 / 0.8 / 1.3 (these override the `scale` argument)
+//   later stages - the `scale` argument as passed
+// With `reverse` the scale is inverted. After resampling, every weight of the
+// filter is divided by the scale.
+// The parameter `n` is not used; dropout_sum is accumulated but never read.
+// NOTE(review): stage_step is 0 when there are fewer than 4 filters, which makes
+// `index / stage_step` a division by zero - confirm callers never do that.
+__global__  void stretch_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, float scale, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    const int stage_step = (nweights / kernel_area) / 4;  // 4 stages
+    const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+
+        if (stage_id == 0) {
+            // simple copy
+            for (int x = 0; x < kernel_size; ++x) {
+                for (int y = 0; y < kernel_size; ++y) {
+                    weight_deform_gpu[x + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+        else if (stage_id > 0)
+        {
+            // per-stage scale; stages above 3 keep the caller-supplied value
+            if (stage_id == 1) scale = 0.65;
+            else if (stage_id == 2) scale = 0.8;
+            else if (stage_id == 3) scale = 1.3;
+
+            if (reverse) scale = 1 / scale;
+
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x_c + (x_d - x_c) / scale
+                    // Ysource = y_c + (y_d - y_c) / scale
+
+                    float x_s = x_c + (x - x_c) / scale;
+                    float y_s = y_c + (y - y_c) / scale;
+
+                    // the four integer neighbours of the source point
+                    int x_0 = floorf(x_s);   // round down
+                    int x_1 = ceilf(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floorf(y_s);
+                    int y_1 = ceilf(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    // bilinear interpolation coefficients
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    // accumulate in-range neighbours; out-of-range ones only feed dropout_sum
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            //const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    //if (scale < 1) weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
+                    weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
+                }
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for stretch_weights_kernel. The grid is sized for one
+// thread per filter (nweights / (size*size) threads); a launch error is
+// reported through CHECK_CUDA.
+extern "C" void stretch_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, float scale, int reverse)
+{
+    const int threads_per_block = BLOCK;
+    const int filter_area = size*size;
+    const int filter_count = nweights / filter_area;
+    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);
+
+    stretch_weights_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, scale, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Produces deformed copies of the filters. One thread handles one
+// kernel_size x kernel_size filter; filters are split by index into stages of
+// stage_step filters each:
+//   stage 0 - plain copy
+//   stage 1 - rotation by `angle` degrees (bilinear interpolation)
+//   stage 2 - rotation by -`angle` degrees
+//   stage 3 - mirror along x (column x is written to column kernel_size - x - 1)
+// `reverse` negates the rotation angle. Filters whose stage_id is not 0..3 are
+// not written at all. The parameter `n` is not used.
+// NOTE(review): stage_step is 0 when there are fewer than 4 filters, which makes
+// `index / stage_step` a division by zero - confirm callers never do that.
+__global__  void sway_and_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int angle, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    const int stage_step = (nweights / kernel_area) / 4;  // 4 stages
+    const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+
+        if (stage_id == 0) {
+            // simple copy
+            for (int x = 0; x < kernel_size; ++x) {
+                for (int y = 0; y < kernel_size; ++y) {
+                    weight_deform_gpu[x + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+        else if (stage_id == 1 || stage_id == 2)
+        {
+            // rotate left or right
+            if (stage_id == 2) angle = -angle;
+            if (reverse) angle = -angle;
+
+            // angle is given in degrees; convert to radians for cosf/sinf
+            const float cos_a = cosf(angle * 3.14159265 / 180);
+            const float sin_a = sinf(angle * 3.14159265 / 180);
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x*cos(alpha) + y*sin(alpha)
+                    // Ysource = -x*sin(alpha) + y*cos(alpha)
+
+                    float x_s = x_c + (x - x_c)*cos_a + (y - y_c)*sin_a;
+                    float y_s = y_c - (x - x_c)*sin_a + (y - y_c)*cos_a;
+
+                    // the four integer neighbours of the source point
+                    int x_0 = floorf(x_s);   // round down
+                    int x_1 = ceilf(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floorf(y_s);
+                    int y_1 = ceilf(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    // bilinear interpolation coefficients
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    // accumulate in-range neighbours; count the weight of out-of-range ones
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[x + y*kernel_size + i] *= coef;
+                }
+            }
+        }
+        else if (stage_id == 3)
+        {
+            // flip
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[(kernel_size - x - 1) + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for sway_and_flip_weights_kernel. The grid is sized for
+// one thread per filter (nweights / (size*size) threads); a launch error is
+// reported through CHECK_CUDA.
+extern "C" void sway_and_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse)
+{
+    const int threads_per_block = BLOCK;
+    const int filter_area = size*size;
+    const int filter_count = nweights / filter_area;
+    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);
+
+    sway_and_flip_weights_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, angle, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+
+
+
+
+// Writes copies of the filters rotated by multiples of 90 degrees. One thread
+// handles one kernel_size x kernel_size filter; filters are split by index into
+// stages of stage_step filters each: stage 0 is a plain copy, stages 1/2/3 apply
+// the 90/180/270 degree index mappings below.
+// With `reverse` the roles of src_i and dst_i are swapped (read from dst_i, write
+// to src_i), i.e. the inverse mapping is applied. Filters whose stage_id is not
+// 0..3 are not written. The parameter `n` is not used.
+// NOTE(review): stage_step is 0 when there are fewer than 4 filters, which makes
+// `index / stage_step` a division by zero - confirm callers never do that.
+__global__  void rotate_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    const int stage_step = (nweights / kernel_area) / 4;  // 4 stages
+    const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+        // if(reverse)
+
+        if (stage_id == 0) {
+            // simple copy
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    const int src_i = x + y*kernel_size + i;
+                    const int dst_i = x + y*kernel_size + i;
+                    if (reverse) weight_deform_gpu[src_i] = src_weight_gpu[dst_i];
+                    else weight_deform_gpu[dst_i] = src_weight_gpu[src_i];
+                }
+            }
+        }
+        else if (stage_id == 1)
+        {
+            // 90 degree clockwise rotation - 1
+            // (x, y) -> (kernel_size - 1 - y, x)
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    const int src_i = x + y*kernel_size + i;
+                    const int dst_i = (kernel_size - 1 - y) + x*kernel_size + i;
+                    if (reverse) weight_deform_gpu[src_i] = src_weight_gpu[dst_i];
+                    else weight_deform_gpu[dst_i] = src_weight_gpu[src_i];
+                }
+            }
+        }
+        else if (stage_id == 2)
+        {
+            // 180 degree clockwise rotation - 2
+            // (x, y) -> (kernel_size - 1 - x, kernel_size - 1 - y)
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    const int src_i = x + y*kernel_size + i;
+                    const int dst_i = (kernel_size - 1 - x) + (kernel_size - 1 - y)*kernel_size + i;
+                    if (reverse) weight_deform_gpu[src_i] = src_weight_gpu[dst_i];
+                    else weight_deform_gpu[dst_i] = src_weight_gpu[src_i];
+                }
+            }
+        }
+        else if (stage_id == 3)
+        {
+            // 270 degree clockwise rotation - 3
+            // (x, y) -> (y, kernel_size - 1 - x)
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    const int src_i = x + y*kernel_size + i;
+                    const int dst_i = y + (kernel_size - 1 - x)*kernel_size + i;
+                    if (reverse) weight_deform_gpu[src_i] = src_weight_gpu[dst_i];
+                    else weight_deform_gpu[dst_i] = src_weight_gpu[src_i];
+                }
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for rotate_weights_kernel. The grid is sized for one
+// thread per filter (nweights / (size*size) threads); a launch error is
+// reported through CHECK_CUDA.
+extern "C" void rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int reverse)
+{
+    const int threads_per_block = BLOCK;
+    const int filter_area = size*size;
+    const int filter_count = nweights / filter_area;
+    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);
+
+    rotate_weights_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Produces deformed copies of the filters. One thread handles one
+// kernel_size x kernel_size filter; filters are split by index into 8 stages of
+// stage_step filters each:
+//   stage 0     - plain copy
+//   stage 1..4  - stretch around the centre with scales 0.65 / 0.8 / 1.2 / 1.4
+//                 (inverted when `reverse` is set); the result is divided by the
+//                 scale only when the scale is greater than 1
+//   stage 5, 6  - rotation by +`angle` / -`angle` degrees (sign flipped by `reverse`),
+//                 rescaled to compensate for samples that fell outside the filter
+//   stage 7     - mirror along x (column x is written to column kernel_size - x - 1)
+// Filters whose stage_id is not 0..7 are not written. The parameter `n` is not used.
+// NOTE(review): stage_step is 0 when there are fewer than 8 filters, which makes
+// `index / stage_step` a division by zero - confirm callers never do that.
+__global__  void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, float angle, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    const int stage_step = (nweights / kernel_area) / 8;  // 8 stages
+    const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+
+        if (stage_id == 0) {
+            // simple copy
+            for (int x = 0; x < kernel_size; ++x) {
+                for (int y = 0; y < kernel_size; ++y) {
+                    weight_deform_gpu[x + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+        else if (stage_id == 1 || stage_id == 2 || stage_id == 3 || stage_id == 4)
+        {
+            // per-stage scale; the 0.5 initial value is always overwritten below
+            float scale = 0.5;
+            if (stage_id == 1) scale = 0.65;
+            else if (stage_id == 2) scale = 0.8;
+            else if (stage_id == 3) scale = 1.2;
+            else if (stage_id == 4) scale = 1.4;
+
+            if (reverse) scale = 1 / scale;
+
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            // accumulated here but never read in this branch
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x_c + (x_d - x_c) / scale
+                    // Ysource = y_c + (y_d - y_c) / scale
+
+                    float x_s = x_c + (x - x_c) / scale;
+                    float y_s = y_c + (y - y_c) / scale;
+
+                    // the four integer neighbours of the source point
+                    int x_0 = floorf(x_s);   // round down
+                    int x_1 = ceilf(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floorf(y_s);
+                    int y_1 = ceilf(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    // bilinear interpolation coefficients
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            //const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // only enlarged filters (scale > 1) are attenuated
+                    if(scale > 1)
+                        weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
+                }
+            }
+        }
+        else if (stage_id == 5 || stage_id == 6)
+        {
+            // rotate left or right
+            if (stage_id == 6) angle = -angle;
+            if (reverse) angle = -angle;
+
+            // angle is given in degrees; convert to radians for cosf/sinf
+            const float cos_a = cosf(angle * 3.14159265 / 180);
+            const float sin_a = sinf(angle * 3.14159265 / 180);
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x*cos(alpha) + y*sin(alpha)
+                    // Ysource = -x*sin(alpha) + y*cos(alpha)
+
+                    float x_s = x_c + (x - x_c)*cos_a + (y - y_c)*sin_a;
+                    float y_s = y_c - (x - x_c)*sin_a + (y - y_c)*cos_a;
+
+                    // the four integer neighbours of the source point
+                    int x_0 = floorf(x_s);   // round down
+                    int x_1 = ceilf(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floorf(y_s);
+                    int y_1 = ceilf(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    // bilinear interpolation coefficients
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    // accumulate in-range neighbours; count the weight of out-of-range ones
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[x + y*kernel_size + i] *= coef;
+                }
+            }
+        }
+        else if (stage_id == 7)
+        {
+            // flip
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[(kernel_size - x - 1) + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for stretch_sway_flip_weights_kernel. The grid is sized
+// for one thread per filter (nweights / (size*size) threads). The integer
+// `angle` is converted to the kernel's float parameter at the call.
+// A launch error is reported through CHECK_CUDA.
+extern "C" void stretch_sway_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse)
+{
+    const int threads_per_block = BLOCK;
+    const int filter_area = size*size;
+    const int filter_count = nweights / filter_area;
+    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);
+
+    stretch_sway_flip_weights_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, angle, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Treats the arrays as `groups` consecutive chunks of current_size floats.
+// For each position inside a chunk, the values of all groups are summed and
+// their average (sum / groups) is written to that position in every group of dst_gpu.
+__global__  void reduce_and_expand_array_kernel(const float *src_gpu, float *dst_gpu, int current_size, int groups)
+{
+    const int pos = blockIdx.x*blockDim.x + threadIdx.x;
+    if (pos >= current_size) return;
+
+    // reduce: sum the same position across all groups
+    float total = 0;
+    for (int g = 0; g < groups; ++g) {
+        total += src_gpu[pos + g*current_size];
+    }
+
+    // expand: write the average back to every group
+    for (int g = 0; g < groups; ++g) {
+        dst_gpu[pos + g*current_size] = total / groups;
+    }
+}
+
+// Host-side launcher for reduce_and_expand_array_kernel. `size` is the total
+// element count; each thread handles one position of a size / groups chunk.
+// A launch error is reported through CHECK_CUDA.
+extern "C" void reduce_and_expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups)
+{
+    const int threads_per_block = BLOCK;
+    const int chunk_len = size / groups;
+    const int grid_blocks = get_number_of_blocks(chunk_len, threads_per_block);
+
+    reduce_and_expand_array_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_gpu, dst_gpu, chunk_len, groups);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Replicates the first current_size floats of src_gpu into each of the
+// `groups` consecutive chunks of dst_gpu.
+__global__  void expand_array_kernel(const float *src_gpu, float *dst_gpu, int current_size, int groups)
+{
+    const int pos = blockIdx.x*blockDim.x + threadIdx.x;
+    if (pos >= current_size) return;
+
+    int g = 0;
+    while (g < groups) {
+        dst_gpu[pos + g*current_size] = src_gpu[pos];
+        ++g;
+    }
+}
+
+// Host-side launcher for expand_array_kernel. `size` is the total element
+// count of the destination; each thread handles one position of a
+// size / groups chunk. A launch error is reported through CHECK_CUDA.
+extern "C" void expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups)
+{
+    const int threads_per_block = BLOCK;
+    const int chunk_len = size / groups;
+    const int grid_blocks = get_number_of_blocks(chunk_len, threads_per_block);
+
+    expand_array_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_gpu, dst_gpu, chunk_len, groups);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Element-wise transform that keeps the sign of the input:
+//   dst = sign(src) * min(clip, (|src|*10 + abs_add)^eps / divider)
+// The clip is applied only when clip != 0, and a NaN or infinite magnitude is
+// replaced by 0.
+// Notes carried over from the original author on the exponent:
+//   eps = 1 by default, eps = 2 - lower delta, eps = 0 - higher delta (linear),
+//   eps = -1 - high delta (inverse number), i.e. (abs(x)*10+1)^(-1)
+__global__  void mult_inverse_array_kernel(const float *src_gpu, float *dst_gpu, int size, const float eps,
+    float divider, const float clip, const float abs_add)
+{
+    const int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k >= size) return;
+
+    const float val = src_gpu[k];
+    const float sign = (val < 0) ? -1 : 1;
+
+    float magnitude = powf(fabs(val)*10 + abs_add, eps);
+    magnitude = magnitude / divider;
+
+    if (clip != 0.0 && magnitude > clip) magnitude = clip;
+    if (isnan(magnitude) || isinf(magnitude)) magnitude = 0;
+
+    dst_gpu[k] = magnitude * sign;
+}
+
+// Host-side launcher for mult_inverse_array_kernel on the stream returned by
+// get_cuda_stream(); a launch error is reported through CHECK_CUDA.
+extern "C" void mult_inverse_array_gpu(const float *src_gpu, float *dst_gpu, int size, float eps, float divider, float clip, float abs_add)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    mult_inverse_array_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (src_gpu, dst_gpu, size, eps, divider, clip, abs_add);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// For every entry `il` of contrast_p computes
+//   P = exp(sim / temperature) / sum(exp_sim of entries with the same j and a different i)
+// and stores it in contrast_p[il].P. When the denominator is 0, or the ratio
+// exceeds 1, P is set to 0.9999. `labels` and `feature_size` are not used.
+__global__ void P_constrastive_f_det_kernel(int *labels, unsigned int feature_size, float temperature, contrastive_params *contrast_p, const int contrast_p_size)
+{
+    const int il = blockIdx.x*blockDim.x + threadIdx.x;
+    if (il >= contrast_p_size) return;
+
+    const float sim = contrast_p[il].sim;
+    const size_t own_i = contrast_p[il].i;
+    const size_t own_j = contrast_p[il].j;
+
+    const float numerator = expf(sim / temperature);
+
+    // sum the precomputed exp_sim of all entries that share j but differ in i
+    float denominator = 0;
+    for (int k = 0; k < contrast_p_size; ++k) {
+        const contrastive_params other = contrast_p[k];
+        if (other.i == own_i || other.j != own_j) continue;
+        denominator += other.exp_sim;
+    }
+
+    float result = 0.9999;
+    if (denominator != 0) result = numerator / denominator;
+    if (result > 1) result = 0.9999;
+
+    contrast_p[il].P = result;
+}
+
+
+// Host-side launcher for P_constrastive_f_det_kernel, one thread per entry of
+// contrast_p; a launch error is reported through CHECK_CUDA.
+extern "C" void P_constrastive_f_det_gpu(int *labels, unsigned int feature_size, float temperature, contrastive_params *contrast_p, const int contrast_p_size)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(contrast_p_size, threads_per_block);
+
+    P_constrastive_f_det_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (labels, feature_size, temperature, contrast_p, contrast_p_size);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+
+// Writes coordinate maps into the first three channels of `dst`, addressed as a
+// flat array with x varying fastest, then y, then channel, then batch.
+//   type == 0: channel 0 <- x coordinate in [-1; 1), channel 1 <- y coordinate
+//              in [-1; 1), channel 2 <- radius sqrt(x^2 + y^2)
+//   type == 1: channels 0..2 are set to 0
+// All other channels are left untouched.
+//
+// Fix over the previous version: the flat element index is preserved and used
+// as the write address. Previously the index variable was overwritten while
+// decomposing it into x/y/c, so every write landed at index / (w*h) instead.
+// Threads beyond w*h*chan*batch elements now exit without writing.
+// NOTE(review): assumes dst holds w*h*chan*batch floats - confirm against callers.
+__global__ void coord_conv_kernel(float *dst, int w, int h, int chan, int batch, int type)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // The launch grid is rounded up to whole blocks; ignore the surplus threads.
+    const long long total = (long long)w * h * chan * batch;
+    if (index >= total) return;
+
+    // Decompose the flat index; the batch index is not needed.
+    const int x = index % w;
+    const int y = (index / w) % h;
+    const int c = (index / (w * h)) % chan;
+
+    if (type == 0) {
+        if (c == 0) {
+            const float x_val = (2.0f * x) / w - 1.0f;  // [-1; 1)
+            dst[index] = x_val; // x - coord
+        }
+        else if (c == 1) {
+            const float y_val = (2.0f * y) / h - 1.0f;  // [-1; 1)
+            dst[index] = y_val; // y - coord
+        }
+        else if (c == 2) {
+            const float x_val = (2.0f * x) / w - 1.0f;  // [-1; 1)
+            const float y_val = (2.0f * y) / h - 1.0f;  // [-1; 1)
+            const float rad_val = sqrtf(x_val*x_val + y_val*y_val);  // [0; 1.414)
+            dst[index] = rad_val; // rad - coord
+        }
+    }
+    else if (type == 1) {
+        if (c >= 0 && c <= 2) {
+            dst[index] = 0;
+        }
+    }
+}
+
+// Host-side launcher for coord_conv_kernel. `size` only determines how many
+// threads are launched; it is not forwarded to the kernel.
+// A launch error is reported through CHECK_CUDA.
+extern "C" void coord_conv_gpu(float *dst, int size, int w, int h, int chan, int b, int type)
+{
+    const int threads_per_block = BLOCK;
+    const int grid_blocks = get_number_of_blocks(size, threads_per_block);
+
+    coord_conv_kernel <<<grid_blocks, threads_per_block, 0, get_cuda_stream() >>> (dst, w, h, chan, b, type);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Tiles the nweights weights across the output: output_gpu[id] = weight_gpu[id % nweights]
+// for every id < size. The thread id is built from a 2D grid of 1D blocks.
+// `batch` is not used inside the kernel.
+__global__ void forward_implicit_kernel(int size, int batch, int nweights, float *weight_gpu, float *output_gpu)
+{
+    const int block_linear = blockIdx.x + blockIdx.y*gridDim.x;
+    const int id = block_linear * blockDim.x + threadIdx.x;
+
+    if (id < size) {
+        const int weight_idx = id % nweights;
+        output_gpu[id] = weight_gpu[weight_idx];
+    }
+}
+
+// Host-side launcher for forward_implicit_kernel over batch * nweights output
+// elements, using the grid returned by cuda_gridsize(); a launch error is
+// reported through CHECK_CUDA.
+extern "C" void forward_implicit_gpu(int batch, int nweights, float *weight_gpu, float *output_gpu)
+{
+    const int total_outputs = batch * nweights;
+
+    forward_implicit_kernel <<<cuda_gridsize(total_outputs), BLOCK, 0, get_cuda_stream() >>> (total_outputs, batch, nweights, weight_gpu, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+
+// Accumulates the gradient of each weight over the batch:
+//   weight_updates_gpu[id] += delta_gpu[id + b*nweights] for b in [0, batch)
+// One thread per weight (id < size).
+__global__ void backward_implicit_kernel(int size, int batch, int nweights, float *weight_updates_gpu, float *delta_gpu)
+{
+    const int block_linear = blockIdx.x + blockIdx.y*gridDim.x;
+    const int id = block_linear * blockDim.x + threadIdx.x;
+
+    if (id < size) {
+        int b = 0;
+        while (b < batch) {
+            weight_updates_gpu[id] += delta_gpu[id + b * nweights];
+            ++b;
+        }
+    }
+}
+
+// Host-side launcher for backward_implicit_kernel, one thread per weight,
+// using the grid returned by cuda_gridsize(); a launch error is reported
+// through CHECK_CUDA.
+extern "C" void backward_implicit_gpu(int batch, int nweights, float *weight_updates_gpu, float *delta_gpu)
+{
+    const int weight_count = nweights;
+
+    backward_implicit_kernel <<<cuda_gridsize(weight_count), BLOCK, 0, get_cuda_stream() >>> (weight_count, batch, nweights, weight_updates_gpu, delta_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/box.c b/darknet-master/src/box.c
new file mode 100644
index 0000000..0ad1263
--- /dev/null
+++ b/darknet-master/src/box.c
@@ -0,0 +1,950 @@
+#include "box.h"
+#include "utils.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+
+#ifndef M_PI
+#define M_PI 3.141592
+#endif
+
+// Builds a box from four consecutive floats laid out as x, y, w, h.
+box float_to_box(float *f)
+{
+    box result;
+    const float *src = f;
+
+    result.x = src[0];
+    result.y = src[1];
+    result.w = src[2];
+    result.h = src[3];
+    return result;
+}
+
+// Builds a box from four floats spaced `stride` elements apart:
+// x = f[0], y = f[stride], w = f[2*stride], h = f[3*stride].
+box float_to_box_stride(float *f, int stride)
+{
+    box result = { 0 };
+    const int y_at = 1 * stride;
+    const int w_at = 2 * stride;
+    const int h_at = 3 * stride;
+
+    result.x = f[0];
+    result.y = f[y_at];
+    result.w = f[w_at];
+    result.h = f[h_at];
+    return result;
+}
+
+
+// Per-component direction from a towards b: each field is +1 when the
+// component of a is strictly smaller than that of b, and -1 otherwise
+// (including when the two are equal).
+dbox derivative(box a, box b)
+{
+    dbox sign;
+
+    sign.dx = (a.x < b.x) ? 1.0 : -1.0;
+    sign.dy = (a.y < b.y) ? 1.0 : -1.0;
+    sign.dw = (a.w < b.w) ? 1.0 : -1.0;
+    sign.dh = (a.h < b.h) ? 1.0 : -1.0;
+    return sign;
+}
+
+
+/*
+dbox derivative(box a, box b)
+{
+    dbox d;
+    d.dx = 0;
+    d.dw = 0;
+    float l1 = a.x - a.w/2;
+    float l2 = b.x - b.w/2;
+    if (l1 > l2){
+        d.dx -= 1;
+        d.dw += .5;
+    }
+    float r1 = a.x + a.w/2;
+    float r2 = b.x + b.w/2;
+    if(r1 < r2){
+        d.dx += 1;
+        d.dw += .5;
+    }
+    if (l1 > r2) {
+        d.dx = -1;
+        d.dw = 0;
+    }
+    if (r1 < l2){
+        d.dx = 1;
+        d.dw = 0;
+    }
+
+    d.dy = 0;
+    d.dh = 0;
+    float t1 = a.y - a.h/2;
+    float t2 = b.y - b.h/2;
+    if (t1 > t2){
+        d.dy -= 1;
+        d.dh += .5;
+    }
+    float b1 = a.y + a.h/2;
+    float b2 = b.y + b.h/2;
+    if(b1 < b2){
+        d.dy += 1;
+        d.dh += .5;
+    }
+    if (t1 > b2) {
+        d.dy = -1;
+        d.dh = 0;
+    }
+    if (b1 < t2){
+        d.dy = 1;
+        d.dh = 0;
+    }
+    return d;
+}
+*/
+
+// Returns c, the smallest axis-aligned box that fully encompasses both a and
+// b, expressed as absolute top / bottom / left / right edges.
+boxabs box_c(box a, box b) {
+    boxabs hull = { 0 };
+    const float a_half_w = a.w / 2, a_half_h = a.h / 2;
+    const float b_half_w = b.w / 2, b_half_h = b.h / 2;
+
+    hull.left = fmin(a.x - a_half_w, b.x - b_half_w);
+    hull.right = fmax(a.x + a_half_w, b.x + b_half_w);
+    hull.top = fmin(a.y - a_half_h, b.y - b_half_h);
+    hull.bot = fmax(a.y + a_half_h, b.y + b_half_h);
+    return hull;
+}
+
+// Converts a centre/size box (x, y, w, h) into absolute edges
+// (top, bottom, left, right).
+boxabs to_tblr(box a) {
+    boxabs edges = { 0 };
+    const float half_h = a.h / 2;
+    const float half_w = a.w / 2;
+
+    edges.top = a.y - half_h;
+    edges.bot = a.y + half_h;
+    edges.left = a.x - half_w;
+    edges.right = a.x + half_w;
+    return edges;
+}
+
+// Signed 1-D overlap of two segments given by centre and width.
+// The result is negative when the segments are disjoint (its magnitude is
+// then the gap between them).
+float overlap(float x1, float w1, float x2, float w2)
+{
+    const float half1 = w1/2;
+    const float half2 = w2/2;
+    const float start1 = x1 - half1, end1 = x1 + half1;
+    const float start2 = x2 - half2, end2 = x2 + half2;
+    const float start = (start1 > start2) ? start1 : start2;   // later start
+    const float end = (end1 < end2) ? end1 : end2;             // earlier end
+
+    return end - start;
+}
+
+// Area of the intersection of a and b; 0 when the overlap along either axis
+// is negative.
+float box_intersection(box a, box b)
+{
+    const float span_x = overlap(a.x, a.w, b.x, b.w);
+    const float span_y = overlap(a.y, a.h, b.y, b.h);
+
+    if (!(span_x < 0) && !(span_y < 0)) {
+        return span_x*span_y;
+    }
+    return 0;
+}
+
+// Area of the union of a and b: area(a) + area(b) - intersection(a, b).
+float box_union(box a, box b)
+{
+    const float shared = box_intersection(a, b);
+
+    return a.w*a.h + b.w*b.h - shared;
+}
+
+// Dispatches to the overlap measure selected by iou_kind.
+// GIOU, DIOU and CIOU use their dedicated functions; IOU and every other
+// value (e.g. MSE) fall back to plain box_iou.
+float box_iou_kind(box a, box b, IOU_LOSS iou_kind)
+{
+    if (iou_kind == GIOU) return box_giou(a, b);
+    if (iou_kind == DIOU) return box_diou(a, b);
+    if (iou_kind == CIOU) return box_ciou(a, b);
+    return box_iou(a, b);
+}
+
+// Intersection over union of a and b.
+// Returns 0 instead of dividing when either the intersection or the union is
+// exactly 0.
+float box_iou(box a, box b)
+{
+    const float inter_area = box_intersection(a, b);
+    const float union_area = box_union(a, b);
+
+    return (inter_area == 0 || union_area == 0) ? 0 : inter_area / union_area;
+}
+
+// Generalized IoU: IoU minus the fraction of the enclosing box (box_c) that
+// is not covered by the union. Falls back to plain IoU when the enclosing
+// box has zero area.
+float box_giou(box a, box b)
+{
+    const boxabs hull = box_c(a, b);
+    const float hull_w = hull.right - hull.left;
+    const float hull_h = hull.bot - hull.top;
+    const float hull_area = hull_w*hull_h;
+    const float iou = box_iou(a, b);
+
+    if (hull_area == 0) {
+        return iou;
+    }
+
+    const float union_area = box_union(a, b);
+    const float giou_term = (hull_area - union_area) / hull_area;
+#ifdef DEBUG_PRINTS
+    printf("  c: %f, u: %f, giou_term: %f\n", hull_area, union_area, giou_term);
+#endif
+    return iou - giou_term;
+}
+
+// https://github.com/Zzh-tju/DIoU-darknet
+// https://arxiv.org/abs/1911.08287
+// Distance-IoU as implemented here: IoU minus (d / c)^0.6, where d is the
+// squared distance between the box centres and c the squared diagonal of the
+// enclosing box. Falls back to plain IoU when c is 0.
+float box_diou(box a, box b)
+{
+    const boxabs hull = box_c(a, b);
+    const float hull_w = hull.right - hull.left;
+    const float hull_h = hull.bot - hull.top;
+    const float diag_sq = hull_w * hull_w + hull_h * hull_h;
+    const float iou = box_iou(a, b);
+
+    if (diag_sq == 0) {
+        return iou;
+    }
+
+    const float off_x = a.x - b.x;
+    const float off_y = a.y - b.y;
+    const float centre_sq = off_x * off_x + off_y * off_y;
+    const float penalty = pow(centre_sq / diag_sq, 0.6);
+#ifdef DEBUG_PRINTS
+    printf("  c: %f, u: %f, riou_term: %f\n", diag_sq, penalty, penalty);
+#endif
+    return iou - penalty;
+}
+
+// Same measure as box_diou but with a caller-supplied exponent: IoU minus
+// (d / c)^beta1, where d is the squared centre distance and c the squared
+// diagonal of the enclosing box. Falls back to plain IoU when c is 0.
+float box_diounms(box a, box b, float beta1)
+{
+    const boxabs hull = box_c(a, b);
+    const float hull_w = hull.right - hull.left;
+    const float hull_h = hull.bot - hull.top;
+    const float diag_sq = hull_w * hull_w + hull_h * hull_h;
+    const float iou = box_iou(a, b);
+
+    if (diag_sq == 0) {
+        return iou;
+    }
+
+    const float off_x = a.x - b.x;
+    const float off_y = a.y - b.y;
+    const float centre_sq = off_x * off_x + off_y * off_y;
+    const float penalty = pow(centre_sq / diag_sq, beta1);
+#ifdef DEBUG_PRINTS
+    printf("  c: %f, u: %f, riou_term: %f\n", diag_sq, penalty, penalty);
+#endif
+    return iou - penalty;
+}
+
+// https://github.com/Zzh-tju/DIoU-darknet
+// https://arxiv.org/abs/1911.08287
+// Complete-IoU: IoU minus (normalised squared centre distance + alpha *
+// aspect-ratio loss). `b` supplies the reference aspect ratio (ar_gt) and `a`
+// the predicted one. Falls back to plain IoU when the enclosing box has a
+// zero diagonal.
+float box_ciou(box a, box b)
+{
+    const boxabs hull = box_c(a, b);
+    const float hull_w = hull.right - hull.left;
+    const float hull_h = hull.bot - hull.top;
+    const float diag_sq = hull_w * hull_w + hull_h * hull_h;
+    const float iou = box_iou(a, b);
+
+    if (diag_sq == 0) {
+        return iou;
+    }
+
+    const float centre_sq = (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+    const float dist_term = centre_sq / diag_sq;
+    const float ar_gt = b.w / b.h;
+    const float ar_pred = a.w / a.h;
+    const float ar_loss = 4 / (M_PI * M_PI) * (atan(ar_gt) - atan(ar_pred)) * (atan(ar_gt) - atan(ar_pred));
+    // The small constant keeps the denominator away from 0.
+    const float alpha = ar_loss / (1 - iou + ar_loss + 0.000001);
+    const float ciou_term = dist_term + alpha * ar_loss;
+#ifdef DEBUG_PRINTS
+    printf("  c: %f, u: %f, riou_term: %f\n", diag_sq, centre_sq, ciou_term);
+#endif
+    return iou - ciou_term;
+}
+
+// Gradient of the overlap measure selected by iou_loss with respect to the
+// predicted box.
+//
+//   pred     - predicted box (x, y centre; w, h size)
+//   truth    - ground-truth box in the same representation
+//   iou_loss - GIOU, DIOU and CIOU add their extra term to the IoU gradient;
+//              any other value yields the plain IoU gradient
+//
+// NOTE: despite the field names, the returned dxrep carries the gradient with
+// respect to x, y, w, h in dt, db, dl, dr respectively (see the assignments at
+// the end of the function).
+//
+// Degenerate inputs: when the relevant denominator (U, giou_C or C) is not
+// positive, the corresponding term is skipped instead of dividing by zero.
+dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) {
+    boxabs pred_tblr = to_tblr(pred);
+    // Normalised predicted edges: t <= b and l <= r even if w or h is negative.
+    float pred_t = fmin(pred_tblr.top, pred_tblr.bot);
+    float pred_b = fmax(pred_tblr.top, pred_tblr.bot);
+    float pred_l = fmin(pred_tblr.left, pred_tblr.right);
+    float pred_r = fmax(pred_tblr.left, pred_tblr.right);
+    boxabs truth_tblr = to_tblr(truth);
+#ifdef DEBUG_PRINTS
+    printf("\niou: %f, giou: %f\n", box_iou(pred, truth), box_giou(pred, truth));
+    printf("pred: x,y,w,h: (%f, %f, %f, %f) -> t,b,l,r: (%f, %f, %f, %f)\n", pred.x, pred.y, pred.w, pred.h, pred_tblr.top, pred_tblr.bot, pred_tblr.left, pred_tblr.right);
+    printf("truth: x,y,w,h: (%f, %f, %f, %f) -> t,b,l,r: (%f, %f, %f, %f)\n", truth.x, truth.y, truth.w, truth.h, truth_tblr.top, truth_tblr.bot, truth_tblr.left, truth_tblr.right);
+#endif
+    dxrep ddx = {0};
+
+    // Areas: X (prediction), Xhat (truth), I (intersection), U (union).
+    float X = (pred_b - pred_t) * (pred_r - pred_l);
+    float Xhat = (truth_tblr.bot - truth_tblr.top) * (truth_tblr.right - truth_tblr.left);
+    float Ih = fmin(pred_b, truth_tblr.bot) - fmax(pred_t, truth_tblr.top);
+    float Iw = fmin(pred_r, truth_tblr.right) - fmax(pred_l, truth_tblr.left);
+    float I = Iw * Ih;
+    float U = X + Xhat - I;
+    // S: squared distance between the two centres.
+    float S = (pred.x-truth.x)*(pred.x-truth.x)+(pred.y-truth.y)*(pred.y-truth.y);
+    // Enclosing box used by the GIoU term.
+    float giou_Cw = fmax(pred_r, truth_tblr.right) - fmin(pred_l, truth_tblr.left);
+    float giou_Ch = fmax(pred_b, truth_tblr.bot) - fmin(pred_t, truth_tblr.top);
+    float giou_C = giou_Cw * giou_Ch;
+    // The boxes do not overlap when either intersection extent is non-positive.
+    const int no_overlap = (Iw <= 0 || Ih <= 0);
+
+    // Partial derivatives of X with respect to the predicted edges.
+    float dX_wrt_t = -1 * (pred_r - pred_l);
+    float dX_wrt_b = pred_r - pred_l;
+    float dX_wrt_l = -1 * (pred_b - pred_t);
+    float dX_wrt_r = pred_b - pred_t;
+
+    // gradient of I min/max in IoU calc (prediction)
+    float dI_wrt_t = pred_t > truth_tblr.top ? (-1 * Iw) : 0;
+    float dI_wrt_b = pred_b < truth_tblr.bot ? Iw : 0;
+    float dI_wrt_l = pred_l > truth_tblr.left ? (-1 * Ih) : 0;
+    float dI_wrt_r = pred_r < truth_tblr.right ? Ih : 0;
+    // derivative of U with regard to x
+    float dU_wrt_t = dX_wrt_t - dI_wrt_t;
+    float dU_wrt_b = dX_wrt_b - dI_wrt_b;
+    float dU_wrt_l = dX_wrt_l - dI_wrt_l;
+    float dU_wrt_r = dX_wrt_r - dI_wrt_r;
+    // gradient of C min/max in IoU calc (prediction)
+    float dC_wrt_t = pred_t < truth_tblr.top ? (-1 * giou_Cw) : 0;
+    float dC_wrt_b = pred_b > truth_tblr.bot ? giou_Cw : 0;
+    float dC_wrt_l = pred_l < truth_tblr.left ? (-1 * giou_Ch) : 0;
+    float dC_wrt_r = pred_r > truth_tblr.right ? giou_Ch : 0;
+
+    // d(I/U) with respect to each edge (quotient rule); left at 0 when U <= 0.
+    float p_dt = 0;
+    float p_db = 0;
+    float p_dl = 0;
+    float p_dr = 0;
+    if (U > 0 ) {
+      p_dt = ((U * dI_wrt_t) - (I * dU_wrt_t)) / (U * U);
+      p_db = ((U * dI_wrt_b) - (I * dU_wrt_b)) / (U * U);
+      p_dl = ((U * dI_wrt_l) - (I * dU_wrt_l)) / (U * U);
+      p_dr = ((U * dI_wrt_r) - (I * dU_wrt_r)) / (U * U);
+    }
+    // apply grad from prediction min/max for correct corner selection:
+    // if the raw edges were inverted, the gradients of that pair trade places.
+    // A temporary is required here; assigning the two in sequence would
+    // overwrite the first value before it is read and leave both equal.
+    if (!(pred_tblr.top < pred_tblr.bot)) {
+        const float swap_tb = p_dt;
+        p_dt = p_db;
+        p_db = swap_tb;
+    }
+    if (!(pred_tblr.left < pred_tblr.right)) {
+        const float swap_lr = p_dl;
+        p_dl = p_dr;
+        p_dr = swap_lr;
+    }
+
+    if (iou_loss == GIOU && giou_C > 0) {
+        // "C" term from gIOU: d(U/C) with respect to each edge.
+        const float g_dt = ((giou_C * dU_wrt_t) - (U * dC_wrt_t)) / (giou_C * giou_C);
+        const float g_db = ((giou_C * dU_wrt_b) - (U * dC_wrt_b)) / (giou_C * giou_C);
+        const float g_dl = ((giou_C * dU_wrt_l) - (U * dC_wrt_l)) / (giou_C * giou_C);
+        const float g_dr = ((giou_C * dU_wrt_r) - (U * dC_wrt_r)) / (giou_C * giou_C);
+        if (no_overlap) {
+            // Without overlap only the "C" term is used.
+            p_dt = g_dt;
+            p_db = g_db;
+            p_dl = g_dl;
+            p_dr = g_dr;
+        } else {
+            p_dt += g_dt;
+            p_db += g_db;
+            p_dl += g_dl;
+            p_dr += g_dr;
+        }
+    }
+
+    // Enclosing box for the DIoU / CIoU term; C is its squared diagonal.
+    float Ct = fmin(pred.y - pred.h / 2,truth.y - truth.h / 2);
+    float Cb = fmax(pred.y + pred.h / 2,truth.y + truth.h / 2);
+    float Cl = fmin(pred.x - pred.w / 2,truth.x - truth.w / 2);
+    float Cr = fmax(pred.x + pred.w / 2,truth.x + truth.w / 2);
+    float Cw = Cr - Cl;
+    float Ch = Cb - Ct;
+    float C = Cw * Cw + Ch * Ch;
+
+    // Derivatives of the enclosing edges with respect to pred x, y, w, h; an
+    // edge only moves with the prediction when the prediction defines it.
+    float dCt_dx = 0;
+    float dCt_dy = pred_t < truth_tblr.top ? 1 : 0;
+    float dCt_dw = 0;
+    float dCt_dh = pred_t < truth_tblr.top ? -0.5 : 0;
+
+    float dCb_dx = 0;
+    float dCb_dy = pred_b > truth_tblr.bot ? 1 : 0;
+    float dCb_dw = 0;
+    float dCb_dh = pred_b > truth_tblr.bot ? 0.5: 0;
+
+    float dCl_dx = pred_l < truth_tblr.left ? 1 : 0;
+    float dCl_dy = 0;
+    float dCl_dw = pred_l < truth_tblr.left ? -0.5 : 0;
+    float dCl_dh = 0;
+
+    float dCr_dx = pred_r > truth_tblr.right ? 1 : 0;
+    float dCr_dy = 0;
+    float dCr_dw = pred_r > truth_tblr.right ? 0.5 : 0;
+    float dCr_dh = 0;
+
+    float dCw_dx = dCr_dx - dCl_dx;
+    float dCw_dy = dCr_dy - dCl_dy;
+    float dCw_dw = dCr_dw - dCl_dw;
+    float dCw_dh = dCr_dh - dCl_dh;
+
+    float dCh_dx = dCb_dx - dCt_dx;
+    float dCh_dy = dCb_dy - dCt_dy;
+    float dCh_dw = dCb_dw - dCt_dw;
+    float dCh_dh = dCb_dh - dCt_dh;
+
+    // Edge gradients -> x, y, w, h gradients of IoU or GIoU.
+    float p_dx = p_dl + p_dr;
+    float p_dy = p_dt + p_db;
+    float p_dw = (p_dr - p_dl);         //For dw and dh, we do not divided by 2.
+    float p_dh = (p_db - p_dt);
+
+    // https://github.com/Zzh-tju/DIoU-darknet
+    // https://arxiv.org/abs/1911.08287
+    if ((iou_loss == DIOU || iou_loss == CIOU) && C > 0) {
+        // Centre-distance term shared by DIoU and CIoU.
+        float d_dx = (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C);
+        float d_dy = (2*(truth.y-pred.y)*C-(2*Cw*dCw_dy+2*Ch*dCh_dy)*S) / (C * C);
+        float d_dw = (2*Cw*dCw_dw+2*Ch*dCh_dw)*S / (C * C);
+        float d_dh = (2*Cw*dCw_dh+2*Ch*dCh_dh)*S / (C * C);
+
+        if (iou_loss == CIOU) {
+            // CIoU additionally contributes the aspect-ratio term to dw and dh.
+            float ar_gt = truth.w / truth.h;
+            float ar_pred = pred.w / pred.h;
+            float ar_loss = 4 / (M_PI * M_PI) * (atan(ar_gt) - atan(ar_pred)) * (atan(ar_gt) - atan(ar_pred));
+            float alpha = ar_loss / (1 - I/U + ar_loss + 0.000001);
+            float ar_dw=8/(M_PI*M_PI)*(atan(ar_gt)-atan(ar_pred))*pred.h;
+            float ar_dh=-8/(M_PI*M_PI)*(atan(ar_gt)-atan(ar_pred))*pred.w;
+            d_dw += alpha * ar_dw;
+            d_dh += alpha * ar_dh;
+        }
+
+        if (no_overlap) {
+            // Without overlap only the distance (and aspect) term is used.
+            p_dx = d_dx;
+            p_dy = d_dy;
+            p_dw = d_dw;
+            p_dh = d_dh;
+        } else {
+            p_dx += d_dx;
+            p_dy += d_dy;
+            p_dw += d_dw;
+            p_dh += d_dh;
+        }
+    }
+
+    ddx.dt = p_dx;      //We follow the original code released from GDarknet. So in yolo_layer.c, dt, db, dl, dr are already dx, dy, dw, dh.
+    ddx.db = p_dy;
+    ddx.dl = p_dw;
+    ddx.dr = p_dh;
+
+    return ddx;
+}
+
+// Euclidean distance between a and b treated as 4-vectors (x, y, w, h):
+// the square root of the sum of squared component differences.
+float box_rmse(box a, box b)
+{
+    const float gap_x = a.x-b.x;
+    const float gap_y = a.y-b.y;
+    const float gap_w = a.w-b.w;
+    const float gap_h = a.h-b.h;
+
+    return sqrt(pow(gap_x, 2) + pow(gap_y, 2) + pow(gap_w, 2) + pow(gap_h, 2));
+}
+
+// Derivative of the intersection area of a and b with respect to a:
+// the x / w signs from derivative() are scaled by the vertical overlap, the
+// y / h signs by the horizontal overlap.
+dbox dintersect(box a, box b)
+{
+    const float span_x = overlap(a.x, a.w, b.x, b.w);
+    const float span_y = overlap(a.y, a.h, b.y, b.h);
+    const dbox sign = derivative(a, b);
+    dbox grad;
+
+    grad.dx = sign.dx*span_y;
+    grad.dw = sign.dw*span_y;
+    grad.dy = sign.dy*span_x;
+    grad.dh = sign.dh*span_x;
+
+    return grad;
+}
+
+// Derivative of the union area of a and b with respect to a, built from the
+// derivative of a's own area (a.h for dw, a.w for dh, 0 for dx / dy) minus
+// the intersection derivative.
+dbox dunion(box a, box b)
+{
+    const dbox inter_grad = dintersect(a, b);
+    dbox grad;
+
+    grad.dx = -inter_grad.dx;
+    grad.dy = -inter_grad.dy;
+    grad.dw = a.h - inter_grad.dw;
+    grad.dh = a.w - inter_grad.dh;
+
+    return grad;
+}
+
+
+// Prints the analytic union gradient from dunion() followed by a forward
+// finite-difference estimate (step .0001) for the same pair of boxes.
+void test_dunion()
+{
+    box base = {0, 0, 1, 1};
+    box step_x = {0+.0001, 0, 1, 1};
+    box step_y = {0, 0+.0001, 1, 1};
+    box step_w = {0, 0, 1+.0001, 1};
+    box step_h = {0, 0, 1, 1+.0001};
+    box other = {.5, .5, .2, .2};
+
+    dbox analytic = dunion(base, other);
+    printf("Union: %f %f %f %f\n", analytic.dx, analytic.dy, analytic.dw, analytic.dh);
+
+    float u_base = box_union(base, other);
+    float u_x = box_union(step_x, other);
+    float u_y = box_union(step_y, other);
+    float u_w = box_union(step_w, other);
+    float u_h = box_union(step_h, other);
+
+    float num_dx = (u_x - u_base)/(.0001);
+    float num_dy = (u_y - u_base)/(.0001);
+    float num_dw = (u_w - u_base)/(.0001);
+    float num_dh = (u_h - u_base)/(.0001);
+    printf("Union Manual %f %f %f %f\n", num_dx, num_dy, num_dw, num_dh);
+}
+// Prints the analytic intersection gradient from dintersect() followed by a
+// forward finite-difference estimate (step .0001) for the same pair of boxes.
+void test_dintersect()
+{
+    box base = {0, 0, 1, 1};
+    box step_x = {0+.0001, 0, 1, 1};
+    box step_y = {0, 0+.0001, 1, 1};
+    box step_w = {0, 0, 1+.0001, 1};
+    box step_h = {0, 0, 1, 1+.0001};
+    box other = {.5, .5, .2, .2};
+
+    dbox analytic = dintersect(base, other);
+    printf("Inter: %f %f %f %f\n", analytic.dx, analytic.dy, analytic.dw, analytic.dh);
+
+    float i_base = box_intersection(base, other);
+    float i_x = box_intersection(step_x, other);
+    float i_y = box_intersection(step_y, other);
+    float i_w = box_intersection(step_w, other);
+    float i_h = box_intersection(step_h, other);
+
+    float num_dx = (i_x - i_base)/(.0001);
+    float num_dy = (i_y - i_base)/(.0001);
+    float num_dw = (i_w - i_base)/(.0001);
+    float num_dh = (i_h - i_base)/(.0001);
+    printf("Inter Manual %f %f %f %f\n", num_dx, num_dy, num_dw, num_dh);
+}
+
+// Runs test_dintersect() and test_dunion(), then prints the (1 - IoU)^2 loss
+// for a fixed pair of boxes, the result of diou() for that pair, and a
+// forward finite-difference estimate (step .00001) of the loss gradient.
+void test_box()
+{
+    test_dintersect();
+    test_dunion();
+
+    box base = {0, 0, 1, 1};
+    box step_x = {0+.00001, 0, 1, 1};
+    box step_y = {0, 0+.00001, 1, 1};
+    box step_w = {0, 0, 1+.00001, 1};
+    box step_h = {0, 0, 1, 1+.00001};
+    box other = {.5, 0, .2, .2};
+
+    float loss = box_iou(base, other);
+    loss = (1-loss)*(1-loss);
+    printf("%f\n", loss);
+
+    dbox analytic = diou(base, other);
+    printf("%f %f %f %f\n", analytic.dx, analytic.dy, analytic.dw, analytic.dh);
+
+    float iou_x = box_iou(step_x, other);
+    float iou_y = box_iou(step_y, other);
+    float iou_w = box_iou(step_w, other);
+    float iou_h = box_iou(step_h, other);
+
+    float num_dx = ((1-iou_x)*(1-iou_x) - loss)/(.00001);
+    float num_dy = ((1-iou_y)*(1-iou_y) - loss)/(.00001);
+    float num_dw = ((1-iou_w)*(1-iou_w) - loss)/(.00001);
+    float num_dh = ((1-iou_h)*(1-iou_h) - loss)/(.00001);
+    printf("manual %f %f %f %f\n", num_dx, num_dy, num_dw, num_dh);
+}
+
+// Returns a delta that moves box a towards box b.
+//
+// As written, this always returns the plain component-wise difference
+// (b - a) for x, y, w and h: the condition below is OR-ed with the constant 1,
+// so the early return is taken on every call and the quotient-rule IoU
+// gradient after it is never reached.
+// NOTE(review): the `|| 1` looks like a deliberate switch-off of the IoU
+// gradient path -- confirm before removing either half.
+dbox diou(box a, box b)
+{
+    float u = box_union(a, b);
+    float i = box_intersection(a, b);
+    dbox di = dintersect(a, b);
+    dbox du = dunion(a, b);
+    dbox dd = { 0,0,0,0 };
+
+    // Always true because of `|| 1`.
+    if (i <= 0 || 1) {
+        dd.dx = b.x - a.x;
+        dd.dy = b.y - a.y;
+        dd.dw = b.w - a.w;
+        dd.dh = b.h - a.h;
+        return dd;
+    }
+
+    // Unreachable: d(i/u) by the quotient rule, component by component.
+    dd.dx = (di.dx*u - du.dx*i) / (u*u);
+    dd.dy = (di.dy*u - du.dy*i) / (u*u);
+    dd.dw = (di.dw*u - du.dw*i) / (u*u);
+    dd.dh = (di.dh*u - du.dh*i) / (u*u);
+    return dd;
+}
+
+// Sort record used by nms_comparator() and do_nms_sort_v2(): it identifies one
+// score, probs[index][class_id], without copying the score table.
+typedef struct{
+    int index;      // row of probs (and position in the boxes array) this record refers to
+    int class_id;   // column of probs currently being sorted on
+    float **probs;  // shared score table, read as probs[index][class_id]
+} sortable_bbox;
+
+// qsort comparator for sortable_bbox records: orders by descending score.
+// Both scores are read at the class_id stored in the second record (pb).
+int nms_comparator(const void *pa, const void *pb)
+{
+    const sortable_bbox *lhs = (const sortable_bbox *)pa;
+    const sortable_bbox *rhs = (const sortable_bbox *)pb;
+    const int cls = rhs->class_id;
+    const float delta = lhs->probs[lhs->index][cls] - rhs->probs[rhs->index][cls];
+
+    if (delta > 0) return -1;
+    if (delta < 0) return 1;
+    return 0;
+}
+
+// Class-wise non-maximum suppression on a boxes / probs table.
+// For each class the boxes are visited in descending score order; every
+// lower-ranked box whose IoU with a surviving box exceeds thresh gets its
+// score for that class set to 0. probs is modified in place.
+void do_nms_sort_v2(box *boxes, float **probs, int total, int classes, float thresh)
+{
+    int cls, hi, lo;
+    sortable_bbox* order = (sortable_bbox*)xcalloc(total, sizeof(sortable_bbox));
+
+    for (hi = 0; hi < total; ++hi) {
+        order[hi].index = hi;
+        order[hi].class_id = 0;
+        order[hi].probs = probs;
+    }
+
+    for (cls = 0; cls < classes; ++cls) {
+        // Tell the comparator which column to sort on.
+        for (hi = 0; hi < total; ++hi) {
+            order[hi].class_id = cls;
+        }
+        qsort(order, total, sizeof(sortable_bbox), nms_comparator);
+
+        for (hi = 0; hi < total; ++hi) {
+            const int kept = order[hi].index;
+            if (probs[kept][cls] != 0) {
+                box a = boxes[kept];
+                for (lo = hi + 1; lo < total; ++lo) {
+                    const int cand = order[lo].index;
+                    box b = boxes[cand];
+                    if (box_iou(a, b) > thresh) {
+                        probs[cand][cls] = 0;
+                    }
+                }
+            }
+        }
+    }
+    free(order);
+}
+
+// qsort comparator for detection records: descending order.
+// When the second record's sort_class is >= 0 the class score
+// prob[sort_class] is compared; otherwise objectness is compared.
+int nms_comparator_v3(const void *pa, const void *pb)
+{
+    const detection *lhs = (const detection *)pa;
+    const detection *rhs = (const detection *)pb;
+    const int cls = rhs->sort_class;
+    float delta = 0;
+
+    if (cls < 0) {
+        delta = lhs->objectness - rhs->objectness;
+    }
+    else {
+        delta = lhs->prob[cls] - rhs->prob[cls]; // there is already: prob = objectness*prob
+    }
+
+    if (delta > 0) return -1;
+    if (delta < 0) return 1;
+    return 0;
+}
+
+// Objectness-based non-maximum suppression, in place on dets.
+// Detections with objectness == 0 are first moved to the tail and excluded.
+// The rest are sorted by descending objectness; any later detection whose IoU
+// with an earlier surviving one exceeds thresh has its objectness and all
+// `classes` class scores set to 0.
+void do_nms_obj(detection *dets, int total, int classes, float thresh)
+{
+    int i, j, k;
+    int last = total - 1;
+
+    // Partition: swap zero-objectness entries to the end of the array.
+    i = 0;
+    while (i <= last) {
+        if (dets[i].objectness == 0) {
+            detection tmp = dets[i];
+            dets[i] = dets[last];
+            dets[last] = tmp;
+            --last;
+        } else {
+            ++i;
+        }
+    }
+    total = last + 1;
+
+    // sort_class < 0 makes the comparator order by objectness.
+    for (i = 0; i < total; ++i) {
+        dets[i].sort_class = -1;
+    }
+    qsort(dets, total, sizeof(detection), nms_comparator_v3);
+
+    for (i = 0; i < total; ++i) {
+        if (dets[i].objectness != 0) {
+            box a = dets[i].bbox;
+            for (j = i + 1; j < total; ++j) {
+                if (dets[j].objectness != 0) {
+                    box b = dets[j].bbox;
+                    if (box_iou(a, b) > thresh) {
+                        dets[j].objectness = 0;
+                        for (k = 0; k < classes; ++k) {
+                            dets[j].prob[k] = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Class-wise non-maximum suppression, in place on dets.
+// Detections with objectness == 0 are first moved to the tail and excluded.
+// Then, for each class, the rest are sorted by descending score for that
+// class and every later detection whose IoU with an earlier surviving one
+// exceeds thresh has its score for that class set to 0.
+void do_nms_sort(detection *dets, int total, int classes, float thresh)
+{
+    int i, j, k;
+    int last = total - 1;
+
+    // Partition: swap zero-objectness entries to the end of the array.
+    i = 0;
+    while (i <= last) {
+        if (dets[i].objectness == 0) {
+            detection tmp = dets[i];
+            dets[i] = dets[last];
+            dets[last] = tmp;
+            --last;
+        } else {
+            ++i;
+        }
+    }
+    total = last + 1;
+
+    for (k = 0; k < classes; ++k) {
+        // Tell the comparator which class score to sort on.
+        for (i = 0; i < total; ++i) {
+            dets[i].sort_class = k;
+        }
+        qsort(dets, total, sizeof(detection), nms_comparator_v3);
+
+        for (i = 0; i < total; ++i) {
+            if (dets[i].prob[k] != 0) {
+                box a = dets[i].bbox;
+                for (j = i + 1; j < total; ++j) {
+                    box b = dets[j].bbox;
+                    if (box_iou(a, b) > thresh) {
+                        dets[j].prob[k] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Unsorted pairwise non-maximum suppression on a boxes / probs table.
+// Boxes with no positive class score are skipped as the first member of a
+// pair. For every pair (i, j > i) whose IoU exceeds thresh, each class keeps
+// only the larger of the two scores (ties keep box i); the other is set to 0.
+// probs is modified in place.
+void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
+{
+    int i, j, k;
+
+    for (i = 0; i < total; ++i) {
+        int has_score = 0;
+        for (k = 0; k < classes && !has_score; ++k) {
+            if (probs[i][k] > 0) has_score = 1;
+        }
+        if (has_score) {
+            for (j = i + 1; j < total; ++j) {
+                if (box_iou(boxes[i], boxes[j]) > thresh) {
+                    for (k = 0; k < classes; ++k) {
+                        if (probs[i][k] < probs[j][k]) {
+                            probs[i][k] = 0;
+                        } else {
+                            probs[j][k] = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// https://github.com/Zzh-tju/DIoU-darknet
+// https://arxiv.org/abs/1911.08287
+// Class-wise non-maximum suppression with a selectable overlap measure,
+// in place on dets.
+//
+//   dets, total - detections to filter; entries with objectness == 0 are
+//                 moved to the tail and excluded
+//   classes     - number of class scores per detection
+//   thresh      - a lower-ranked detection is suppressed for a class when its
+//                 overlap with a surviving one exceeds this value
+//   nms_kind    - CORNERS_NMS uses box_iou, GREEDY_NMS uses box_diou,
+//                 DIOU_NMS uses box_diounms(.., beta1); for any other value
+//                 nothing is suppressed (the detections are still sorted)
+//   beta1       - exponent forwarded to box_diounms
+//
+// The measure is chosen from nms_kind before it is evaluated, so only the one
+// that can actually suppress the pair is computed.
+void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1)
+{
+    int i, j, k;
+
+    // Partition: swap zero-objectness entries to the end of the array.
+    k = total - 1;
+    i = 0;
+    while (i <= k) {
+        if (dets[i].objectness == 0) {
+            detection swap = dets[i];
+            dets[i] = dets[k];
+            dets[k] = swap;
+            --k;
+        } else {
+            ++i;
+        }
+    }
+    total = k + 1;
+
+    for (k = 0; k < classes; ++k) {
+        // Tell the comparator which class score to sort on.
+        for (i = 0; i < total; ++i) {
+            dets[i].sort_class = k;
+        }
+        qsort(dets, total, sizeof(detection), nms_comparator_v3);
+
+        for (i = 0; i < total; ++i) {
+            if (dets[i].prob[k] == 0) continue;
+            box a = dets[i].bbox;
+            for (j = i + 1; j < total; ++j) {
+                box b = dets[j].bbox;
+                float score;
+
+                if (nms_kind == CORNERS_NMS) {
+                    score = box_iou(a, b);
+                }
+                else if (nms_kind == GREEDY_NMS) {
+                    score = box_diou(a, b);
+                }
+                else if (nms_kind == DIOU_NMS) {
+                    score = box_diounms(a, b, beta1);
+                }
+                else {
+                    continue;   // no suppression rule for this kind
+                }
+
+                if (score > thresh) {
+                    dets[j].prob[k] = 0;
+                }
+            }
+        }
+    }
+}
+
+// Encodes b relative to anchor: the centre offset is divided by the anchor
+// size, and the size is stored as log2 of the ratio to the anchor size.
+// decode_box applies the inverse mapping.
+box encode_box(box b, box anchor)
+{
+    box rel;
+    const float off_x = b.x - anchor.x;
+    const float off_y = b.y - anchor.y;
+
+    rel.x = off_x / anchor.w;
+    rel.y = off_y / anchor.h;
+    rel.w = log2(b.w / anchor.w);
+    rel.h = log2(b.h / anchor.h);
+    return rel;
+}
+
+// Decodes an anchor-relative box b: the centre is scaled by the anchor size
+// and shifted by the anchor centre, and the size is 2^b.w / 2^b.h times the
+// anchor size. encode_box applies the inverse mapping.
+box decode_box(box b, box anchor)
+{
+    box abs_box;
+
+    abs_box.w = pow(2., b.w) * anchor.w;
+    abs_box.h = pow(2., b.h) * anchor.h;
+    abs_box.x = b.x * anchor.w + anchor.x;
+    abs_box.y = b.y * anchor.h + anchor.y;
+    return abs_box;
+}
diff --git a/darknet-master/src/box.h b/darknet-master/src/box.h
new file mode 100644
index 0000000..f72e26c
--- /dev/null
+++ b/darknet-master/src/box.h
@@ -0,0 +1,59 @@
+#ifndef BOX_H
+#define BOX_H
+
+#include "darknet.h"
+
+//typedef struct{
+//    float x, y, w, h;
+//} box;
+
+// Per-component delta / derivative for a box: one value each for x, y, w, h.
+// Returned by diou() (declared below).
+typedef struct{
+    float dx, dy, dw, dh;
+} dbox;
+
+//typedef struct detection {
+//    box bbox;
+//    int classes;
+//    float *prob;
+//    float *mask;
+//    float objectness;
+//    int sort_class;
+//} detection;
+
+// A detection paired with the index of its most probable class.
+// Returned (as an array) by get_actual_detections().
+typedef struct detection_with_class {
+    detection det;
+    // The most probable class id: the index of the best class in det.prob.
+    // Filled in temporarily while results are processed; otherwise it is not
+    // initialized.
+    int best_class;
+} detection_with_class;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+box float_to_box(float *f);
+box float_to_box_stride(float *f, int stride);
+float box_iou(box a, box b);
+float box_iou_kind(box a, box b, IOU_LOSS iou_kind);
+float box_rmse(box a, box b);
+dxrep dx_box_iou(box a, box b, IOU_LOSS iou_loss);
+float box_giou(box a, box b);
+float box_diou(box a, box b);
+float box_ciou(box a, box b);
+dbox diou(box a, box b);
+boxabs to_tblr(box a);
+void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
+void do_nms_sort_v2(box *boxes, float **probs, int total, int classes, float thresh);
+//LIB_API void do_nms_sort(detection *dets, int total, int classes, float thresh);
+//LIB_API void do_nms_obj(detection *dets, int total, int classes, float thresh);
+//LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1);
+box decode_box(box b, box anchor);
+box encode_box(box b, box anchor);
+
+// Creates array of detections with prob > thresh and fills best_class for them
+// Return number of selected detections in *selected_detections_num
+detection_with_class* get_actual_detections(detection *dets, int dets_num, float thresh, int* selected_detections_num, char **names);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/captcha.c b/darknet-master/src/captcha.c
new file mode 100644
index 0000000..5fd565d
--- /dev/null
+++ b/darknet-master/src/captcha.c
@@ -0,0 +1,363 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+
+// Rewrites the label matrix d.y in place, treating its columns as consecutive
+// (j, j+1) pairs.
+//
+//   mask != 0: a pair whose first entry is 0 is overwritten with SECRET_NUM in
+//              both slots; otherwise, if the second entry is non-zero, the
+//              first entry is cleared.
+//   mask == 0: the second entry is set to the logical negation of the first
+//              (0 when the first is non-zero, 1 when it is zero).
+//
+// Only complete pairs are processed: with an odd column count the trailing
+// column is left untouched instead of indexing one element past the row.
+void fix_data_captcha(data d, int mask)
+{
+    matrix labels = d.y;
+    int i, j;
+    for(i = 0; i < labels.rows; ++i){
+        float *row = labels.vals[i];
+        // j + 1 must be a valid column because both slots of the pair are accessed.
+        for(j = 0; j + 1 < labels.cols; j += 2){
+            if (mask){
+                if(!row[j]){
+                    row[j] = SECRET_NUM;
+                    row[j+1] = SECRET_NUM;
+                }else if(row[j+1]){
+                    row[j] = 0;
+                }
+            } else{
+                row[j+1] = row[j] ? 0 : 1;
+            }
+        }
+    }
+}
+
+// Trains the network described by cfgfile on the captcha image lists, forever.
+//
+//   cfgfile     network configuration; its base name is used in checkpoint names
+//   weightfile  optional weights to start from (may be NULL)
+//
+// Each iteration joins the background loader, post-processes the labels with
+// fix_data_captcha(), immediately starts loading the next batch, and trains on
+// the current one. Every 100 iterations the weights are written to
+// imagenet_backup/<base>_<iteration>.weights. The loop has no exit condition,
+// so this function never returns.
+void train_captcha(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1024;
+    // Resume the iteration counter from the number of images already seen.
+    int i = *net.seen/imgs;
+    // Hard-wired switch between the "solved" and the "raw" image list.
+    int solved = 1;
+    list *plist;
+    char** labels = get_labels("data/captcha/reimgs.labels.list");
+    if (solved){
+        plist = get_paths("data/captcha/reimgs.solved.list");
+    }else{
+        plist = get_paths("data/captcha/reimgs.raw.list");
+    }
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    clock_t time;
+    pthread_t load_thread;
+    data train;
+    data buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.classes = 26;
+    args.n = imgs;
+    args.m = plist->size;
+    args.labels = labels;
+    args.d = &buffer;
+    args.type = CLASSIFICATION_DATA;
+
+    load_thread = load_data_in_thread(args);
+    while(1){
+        ++i;
+        time=clock();
+        // Wait for the batch requested on the previous iteration.
+        pthread_join(load_thread, 0);
+        train = buffer;
+        fix_data_captcha(train, solved);
+
+        /*
+           image im = float_to_image(256, 256, 3, train.X.vals[114]);
+           show_image(im, "training");
+           cvWaitKey(0);
+         */
+
+        // Start loading the next batch while this one is being trained on.
+        load_thread = load_data_in_thread(args);
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        float loss = train_network(net, train);
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%d: %f, %f avg, %lf seconds, %" PRIu64 " images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
+        free_data(train);
+        if(i%100==0){
+            char buff[256];
+            // Bounded formatting: base comes from the caller-supplied cfg path
+            // and could otherwise overflow the fixed-size buffer.
+            snprintf(buff, sizeof(buff), "imagenet_backup/%s_%d.weights", base, i);
+            save_weights(net, buff);
+        }
+    }
+}
+
+// Runs the captcha classifier on one image or on a stream of image paths.
+//
+//   cfgfile     network configuration
+//   weightfile  optional weights to load (may be NULL)
+//   filename    image to classify; when NULL, paths are read one per line from
+//               stdin until end of input
+//
+// For every image the 26 top-ranked labels are printed on one line as
+// "<name> <score>" pairs separated by ", ".
+void test_captcha(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+    int i = 0;
+    char** names = get_labels("data/captcha/reimgs.labels.list");
+    char buff[256];
+    char *input = buff;
+    int indexes[26];
+    while(1){
+        if(filename){
+            // strncpy does not terminate the destination when the source fills
+            // it completely, so copy one byte less and terminate explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = 0;
+        }else{
+            //printf("Enter Image Path: ");
+            //fflush(stdout);
+            if(!fgets(input, sizeof(buff), stdin)) return;
+            // Cut the line at its newline; unlike strtok this also handles a
+            // line that consists of the newline only.
+            input[strcspn(input, "\n")] = 0;
+        }
+        image im = load_image_color(input, net.w, net.h);
+        float *X = im.data;
+        float *predictions = network_predict(net, X);
+        top_predictions(net, 26, indexes);
+        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        for(i = 0; i < 26; ++i){
+            int index = indexes[i];
+            if(i != 0) printf(", ");
+            printf("%s %f", names[index], predictions[index]);
+        }
+        printf("\n");
+        fflush(stdout);
+        free_image(im);
+        // A path given on the command line is classified exactly once.
+        if (filename) break;
+    }
+}
+
+// Prints the raw network outputs for the images listed in
+// data/captcha/reimgs.fg.list, one CSV-like line per image:
+// "<truth index>, <output 0>, <output 1>, ...".
+//
+//   cfgfile     network configuration
+//   weightfile  optional weights to load (may be NULL)
+//   filename    used only as a flag: when non-NULL, processing stops after the
+//               first image
+//
+// The truth index is the index of the last of the first 13 labels that occurs
+// as a substring of the image path. If none matches, the path is reported on
+// stderr and validation stops.
+void valid_captcha(char *cfgfile, char *weightfile, char *filename)
+{
+    char** labels = get_labels("data/captcha/reimgs.labels.list");
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    list* plist = get_paths("data/captcha/reimgs.fg.list");
+    char **paths = (char **)list_to_array(plist);
+    int N = plist->size;
+    int outputs = net.outputs;
+
+    set_batch_network(&net, 1);
+    srand(2222222);
+    int i, j;
+    for(i = 0; i < N; ++i){
+        if (i%100 == 0) fprintf(stderr, "%d\n", i);
+        image im = load_image_color(paths[i], net.w, net.h);
+        float *X = im.data;
+        float *predictions = network_predict(net, X);
+        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        int truth = -1;
+        // NOTE(review): only the first 13 labels are tried -- confirm this matches the label list.
+        for(j = 0; j < 13; ++j){
+            if (strstr(paths[i], labels[j])) truth = j;
+        }
+        if (truth == -1){
+            fprintf(stderr, "bad: %s\n", paths[i]);
+            // Release the image before bailing out; it was leaked on this path before.
+            free_image(im);
+            return;
+        }
+        printf("%d, ", truth);
+        for(j = 0; j < outputs; ++j){
+            if (j != 0) printf(", ");
+            printf("%f", predictions[j]);
+        }
+        printf("\n");
+        fflush(stdout);
+        free_image(im);
+        if (filename) break;
+    }
+}
+
+/*
+   void train_captcha(char *cfgfile, char *weightfile)
+   {
+   float avg_loss = -1;
+   srand(time(0));
+   char *base = basecfg(cfgfile);
+   printf("%s\n", base);
+   network net = parse_network_cfg(cfgfile);
+   if(weightfile){
+   load_weights(&net, weightfile);
+   }
+   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+   int imgs = 1024;
+   int i = net.seen/imgs;
+   list *plist = get_paths("/data/captcha/train.auto5");
+   char **paths = (char **)list_to_array(plist);
+   printf("%d\n", plist->size);
+   clock_t time;
+   while(1){
+   ++i;
+   time=clock();
+   data train = load_data_captcha(paths, imgs, plist->size, 10, 200, 60);
+   translate_data_rows(train, -128);
+   scale_data_rows(train, 1./128);
+   printf("Loaded: %lf seconds\n", sec(clock()-time));
+   time=clock();
+   float loss = train_network(net, train);
+   net.seen += imgs;
+   if(avg_loss == -1) avg_loss = loss;
+   avg_loss = avg_loss*.9 + loss*.1;
+   printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
+   free_data(train);
+   if(i%10==0){
+   char buff[256];
+   sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
+   save_weights(net, buff);
+   }
+   }
+   }
+
+   void decode_captcha(char *cfgfile, char *weightfile)
+   {
+   setbuf(stdout, NULL);
+   srand(time(0));
+   network net = parse_network_cfg(cfgfile);
+   set_batch_network(&net, 1);
+   if(weightfile){
+   load_weights(&net, weightfile);
+   }
+   char filename[256];
+   while(1){
+   printf("Enter filename: ");
+   fgets(filename, 256, stdin);
+   strtok(filename, "\n");
+   image im = load_image_color(filename, 300, 57);
+   scale_image(im, 1./255.);
+   float *X = im.data;
+   float *predictions = network_predict(net, X);
+   image out  = float_to_image(300, 57, 1, predictions);
+   show_image(out, "decoded");
+#ifdef OPENCV
+cvWaitKey(0);
+#endif
+free_image(im);
+}
+}
+
+void encode_captcha(char *cfgfile, char *weightfile)
+{
+float avg_loss = -1;
+srand(time(0));
+char *base = basecfg(cfgfile);
+printf("%s\n", base);
+network net = parse_network_cfg(cfgfile);
+if(weightfile){
+    load_weights(&net, weightfile);
+}
+printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+int imgs = 1024;
+int i = net.seen/imgs;
+list *plist = get_paths("/data/captcha/encode.list");
+char **paths = (char **)list_to_array(plist);
+printf("%d\n", plist->size);
+clock_t time;
+while(1){
+    ++i;
+    time=clock();
+    data train = load_data_captcha_encode(paths, imgs, plist->size, 300, 57);
+    scale_data_rows(train, 1./255);
+    printf("Loaded: %lf seconds\n", sec(clock()-time));
+    time=clock();
+    float loss = train_network(net, train);
+    net.seen += imgs;
+    if(avg_loss == -1) avg_loss = loss;
+    avg_loss = avg_loss*.9 + loss*.1;
+    printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
+    free_matrix(train.X);
+    if(i%100==0){
+        char buff[256];
+        sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
+        save_weights(net, buff);
+    }
+}
+}
+
+void validate_captcha(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int numchars = 37;
+    list *plist = get_paths("/data/captcha/solved.hard");
+    char **paths = (char **)list_to_array(plist);
+    int imgs = plist->size;
+    data valid = load_data_captcha(paths, imgs, 0, 10, 200, 60);
+    translate_data_rows(valid, -128);
+    scale_data_rows(valid, 1./128);
+    matrix pred = network_predict_data(net, valid);
+    int i, k;
+    int correct = 0;
+    int total = 0;
+    int accuracy = 0;
+    for(i = 0; i < imgs; ++i){
+        int allcorrect = 1;
+        for(k = 0; k < 10; ++k){
+            char truth = int_to_alphanum(max_index(valid.y.vals[i]+k*numchars, numchars));
+            char prediction = int_to_alphanum(max_index(pred.vals[i]+k*numchars, numchars));
+            if (truth != prediction) allcorrect=0;
+            if (truth != '.' && truth == prediction) ++correct;
+            if (truth != '.' || truth != prediction) ++total;
+        }
+        accuracy += allcorrect;
+    }
+    printf("Word Accuracy: %f, Char Accuracy %f\n", (float)accuracy/imgs, (float)correct/total);
+    free_data(valid);
+}
+
+void test_captcha(char *cfgfile, char *weightfile)
+{
+    setbuf(stdout, NULL);
+    srand(time(0));
+    //char *base = basecfg(cfgfile);
+    //printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    char filename[256];
+    while(1){
+        //printf("Enter filename: ");
+        fgets(filename, 256, stdin);
+        strtok(filename, "\n");
+        image im = load_image_color(filename, 200, 60);
+        translate_image(im, -128);
+        scale_image(im, 1/128.);
+        float *X = im.data;
+        float *predictions = network_predict(net, X);
+        print_letters(predictions, 10);
+        free_image(im);
+    }
+}
+    */
+// Command-line entry point for the captcha sub-commands.
+//
+// Expected arguments: argv[2] = train | test | valid, argv[3] = cfg file,
+// argv[4] = weights (optional), argv[5] = image file (optional).
+// With fewer than four arguments a usage line is written to stderr. An
+// unrecognised sub-command is silently ignored. The older encode / decode /
+// validate sub-commands are no longer dispatched.
+void run_captcha(int argc, char **argv)
+{
+    char *mode;
+    char *cfg;
+    char *weights = 0;
+    char *filename = 0;
+
+    if(argc <= 3){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    mode = argv[2];
+    cfg = argv[3];
+    if(argc >= 5) weights = argv[4];
+    if(argc >= 6) filename = argv[5];
+
+    if(!strcmp(mode, "train")){
+        train_captcha(cfg, weights);
+        return;
+    }
+    if(!strcmp(mode, "test")){
+        test_captcha(cfg, weights, filename);
+        return;
+    }
+    if(!strcmp(mode, "valid")){
+        valid_captcha(cfg, weights, filename);
+    }
+}
diff --git a/darknet-master/src/cifar.c b/darknet-master/src/cifar.c
new file mode 100644
index 0000000..1ef221e
--- /dev/null
+++ b/darknet-master/src/cifar.c
@@ -0,0 +1,271 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+#include "option_list.h"
+#include "blas.h"
+
+// Trains the network described by cfgfile on the full CIFAR-10 training set.
+//
+//   cfgfile     network configuration; its base name is used in checkpoint names
+//   weightfile  optional weights to start from (may be NULL)
+//
+// Training runs until net.max_batches is reached (or forever when it is 0).
+// Checkpoints are written below "backup/": <base>_<epoch>.weights whenever a
+// new epoch of N = 50000 images starts, <base>.backup whenever the batch
+// counter is a multiple of 100, and <base>.weights once training has finished.
+void train_cifar(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+
+    char* backup_directory = "backup/";
+    int classes = 10;
+    int N = 50000;
+
+    char **labels = get_labels("data/cifar/labels.txt");
+    int epoch = (*net.seen)/N;
+    data train = load_all_cifar10();
+    // One path buffer for every checkpoint name; all writes are bounded with
+    // snprintf because base is derived from the caller-supplied cfg path.
+    char buff[256];
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        clock_t time=clock();
+
+        float loss = train_network_sgd(net, train, 1);
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.95 + loss*.05;
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+    }
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    free_network(net);
+    free_ptrs((void**)labels, classes);
+    free(base);
+    free_data(train);
+}
+
+// Trains on CIFAR-10 against blended targets: the labels in train.y are
+// replaced by 0.9 * (rows of results/ensemble.csv) + 0.1 * (original labels).
+//
+//   cfgfile     network configuration; its base name is used in checkpoint names
+//   weightfile  optional weights to start from (may be NULL)
+//
+// Loop and checkpointing are the same as in train_cifar(): training runs until
+// net.max_batches (forever when it is 0) and writes <base>_<epoch>.weights,
+// <base>.backup and finally <base>.weights below "backup/".
+void train_cifar_distill(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+
+    char* backup_directory = "backup/";
+    int classes = 10;
+    int N = 50000;
+
+    char **labels = get_labels("data/cifar/labels.txt");
+    int epoch = (*net.seen)/N;
+
+    data train = load_all_cifar10();
+    matrix soft = csv_to_matrix("results/ensemble.csv");
+
+    float weight = .9;
+    scale_matrix(soft, weight);
+    scale_matrix(train.y, 1. - weight);
+    matrix_add_matrix(soft, train.y);
+    // The soft targets are not referenced again after being added to train.y,
+    // so release them here instead of leaking the matrix.
+    free_matrix(soft);
+
+    // One path buffer for every checkpoint name; all writes are bounded with
+    // snprintf because base is derived from the caller-supplied cfg path.
+    char buff[256];
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        clock_t time=clock();
+
+        float loss = train_network_sgd(net, train, 1);
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.95 + loss*.05;
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+    }
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    free_network(net);
+    free_ptrs((void**)labels, classes);
+    free(base);
+    free_data(train);
+}
+
+// Evaluates top-1 accuracy on the CIFAR-10 test batch, summing the network's
+// outputs for each image and for the same image after flip_image() before
+// choosing the class. The running accuracy is printed after every image.
+//
+//   filename    network configuration
+//   weightfile  optional weights to load (may be NULL)
+void test_cifar_multi(char *filename, char *weightfile)
+{
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(time(0));
+
+    float avg_acc = 0;
+    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
+
+    int i;
+    for(i = 0; i < test.X.rows; ++i){
+        // NOTE(review): im is assumed to be a view onto test.X.vals[i], not a
+        // copy (the csv tests in this file rely on flip_image() changing the
+        // data set in place) -- confirm in image.c. The row therefore stays
+        // owned by the data set and is released by free_data() below rather
+        // than by free_image() here.
+        image im = float_to_image(32, 32, 3, test.X.vals[i]);
+
+        float pred[10] = {0};
+
+        float *p = network_predict(net, im.data);
+        axpy_cpu(10, 1, p, 1, pred, 1);
+        flip_image(im);
+        p = network_predict(net, im.data);
+        axpy_cpu(10, 1, p, 1, pred, 1);
+
+        int index = max_index(pred, 10);
+        int class_id = max_index(test.y.vals[i], 10);
+        if(index == class_id) avg_acc += 1;
+        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
+    }
+    // Frees the images, the labels and the row tables in one place; previously
+    // only the individual image rows were released.
+    free_data(test);
+}
+
+// Measures accuracy on the CIFAR-10 test batch with network_accuracies() and
+// prints the first returned value as "top1", together with the elapsed time
+// and the number of test images.
+//
+//   filename    network configuration
+//   weightfile  optional weights to load (may be NULL)
+void test_cifar(char *filename, char *weightfile)
+{
+    network net = parse_network_cfg(filename);
+    if(weightfile) load_weights(&net, weightfile);
+    srand(time(0));
+
+    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
+
+    // Time only the evaluation, not the data loading.
+    clock_t started = clock();
+    float *acc = network_accuracies(net, test, 2);
+    float top1 = acc[0];
+
+    printf("top1: %f, %lf seconds, %d images\n", top1, sec(clock()-started), test.X.rows);
+    free_data(test);
+}
+
+// Writes every CIFAR-10 training and test image out as a PNG named
+// data/cifar/train/<index>_<label> or data/cifar/test/<index>_<label>, where
+// the label is the class with the largest entry in the image's label row.
+void extract_cifar()
+{
+    char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"};
+    int i;
+    data train = load_all_cifar10();
+    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
+    char buff[256];
+    for(i = 0; i < train.X.rows; ++i){
+        image im = float_to_image(32, 32, 3, train.X.vals[i]);
+        int class_id = max_index(train.y.vals[i], 10);
+        snprintf(buff, sizeof(buff), "data/cifar/train/%d_%s",i,labels[class_id]);
+        save_image_png(im, buff);
+    }
+    for(i = 0; i < test.X.rows; ++i){
+        image im = float_to_image(32, 32, 3, test.X.vals[i]);
+        int class_id = max_index(test.y.vals[i], 10);
+        snprintf(buff, sizeof(buff), "data/cifar/test/%d_%s",i,labels[class_id]);
+        save_image_png(im, buff);
+    }
+    // Both data sets were previously leaked on return.
+    free_data(train);
+    free_data(test);
+}
+
+// Predicts the CIFAR-10 test batch twice -- as loaded, and after calling
+// flip_image() on every image -- averages the two prediction matrices, emits
+// the average through matrix_to_csv() and reports top-1 accuracy on stderr.
+//
+//   filename    network configuration
+//   weightfile  optional weights to load (may be NULL)
+void test_cifar_csv(char *filename, char *weightfile)
+{
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
+
+    matrix pred = network_predict_data(net, test);
+
+    int i;
+    for(i = 0; i < test.X.rows; ++i){
+        // NOTE(review): assumes im aliases test.X.vals[i], so the flip changes
+        // the data set that is predicted again below -- confirm in image.c.
+        image im = float_to_image(32, 32, 3, test.X.vals[i]);
+        flip_image(im);
+    }
+    matrix pred2 = network_predict_data(net, test);
+    // pred = 0.5 * pred + 0.5 * pred2
+    scale_matrix(pred, .5);
+    scale_matrix(pred2, .5);
+    matrix_add_matrix(pred2, pred);
+
+    matrix_to_csv(pred);
+    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
+    // The prediction matrices were previously leaked.
+    free_matrix(pred);
+    free_matrix(pred2);
+    free_data(test);
+}
+
+// Same procedure as test_cifar_csv(), but run on the full CIFAR-10 training
+// set: predicts the set as loaded and again after flip_image() on every image,
+// averages the two prediction matrices, emits the average through
+// matrix_to_csv() and reports top-1 accuracy on stderr.
+//
+//   filename    network configuration
+//   weightfile  optional weights to load (may be NULL)
+void test_cifar_csvtrain(char *filename, char *weightfile)
+{
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    data test = load_all_cifar10();
+
+    matrix pred = network_predict_data(net, test);
+
+    int i;
+    for(i = 0; i < test.X.rows; ++i){
+        // NOTE(review): assumes im aliases test.X.vals[i], so the flip changes
+        // the data set that is predicted again below -- confirm in image.c.
+        image im = float_to_image(32, 32, 3, test.X.vals[i]);
+        flip_image(im);
+    }
+    matrix pred2 = network_predict_data(net, test);
+    // pred = 0.5 * pred + 0.5 * pred2
+    scale_matrix(pred, .5);
+    scale_matrix(pred2, .5);
+    matrix_add_matrix(pred2, pred);
+
+    matrix_to_csv(pred);
+    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
+    // The prediction matrices were previously leaked.
+    free_matrix(pred);
+    free_matrix(pred2);
+    free_data(test);
+}
+
+// Scores the predictions stored in results/combined.csv against the labels of
+// the CIFAR-10 test batch. Writes the prediction matrix dimensions and the
+// top-1 accuracy to stderr.
+void eval_cifar_csv()
+{
+    data truth_set = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
+    matrix scores = csv_to_matrix("results/combined.csv");
+    float top1;
+
+    fprintf(stderr, "%d %d\n", scores.rows, scores.cols);
+
+    top1 = matrix_topk_accuracy(truth_set.y, scores, 1);
+    fprintf(stderr, "Accuracy: %f\n", top1);
+
+    free_data(truth_set);
+    free_matrix(scores);
+}
+
+
+// Command-line entry point for the CIFAR sub-commands.
+//
+// Expected arguments: argv[2] = sub-command, argv[3] = cfg file,
+// argv[4] = weights (optional). With fewer than four arguments a usage line is
+// written to stderr -- this also applies to "extract" and "eval", which do not
+// use the cfg file. An unrecognised sub-command is silently ignored.
+void run_cifar(int argc, char **argv)
+{
+    char *mode;
+    char *cfg;
+    char *weights = 0;
+
+    if(argc <= 3){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    mode = argv[2];
+    cfg = argv[3];
+    if(argc >= 5) weights = argv[4];
+
+    if(!strcmp(mode, "train")){
+        train_cifar(cfg, weights);
+    } else if(!strcmp(mode, "extract")){
+        extract_cifar();
+    } else if(!strcmp(mode, "distill")){
+        train_cifar_distill(cfg, weights);
+    } else if(!strcmp(mode, "test")){
+        test_cifar(cfg, weights);
+    } else if(!strcmp(mode, "multi")){
+        test_cifar_multi(cfg, weights);
+    } else if(!strcmp(mode, "csv")){
+        test_cifar_csv(cfg, weights);
+    } else if(!strcmp(mode, "csvtrain")){
+        test_cifar_csvtrain(cfg, weights);
+    } else if(!strcmp(mode, "eval")){
+        eval_cifar_csv();
+    }
+}
diff --git a/darknet-master/src/classifier.c b/darknet-master/src/classifier.c
new file mode 100644
index 0000000..f013033
--- /dev/null
+++ b/darknet-master/src/classifier.c
@@ -0,0 +1,1409 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+#include "option_list.h"
+#include "blas.h"
+#include "assert.h"
+#include "classifier.h"
+#include "dark_cuda.h"
+#ifdef WIN32
+#include <time.h>
+#include "gettimeofday.h"
+#else
+#include <sys/time.h>
+#endif
+
+float validate_classifier_single(char *datacfg, char *filename, char *weightfile, network *existing_net, int topk_custom);
+
+// Splits each of the n label strings at its first space and parses the text
+// after the space as a float.
+//
+//   labels  n writable strings of the form "<name> <value>"; each one is
+//           truncated in place at its first space, leaving only "<name>"
+//   n       number of entries in labels
+//
+// Returns a newly allocated array of n floats owned by the caller. A label
+// that contains no space is left unmodified and its value stays 0 (the array
+// is zero-initialised by xcalloc) instead of dereferencing a NULL pointer.
+float *get_regression_values(char **labels, int n)
+{
+    float* v = (float*)xcalloc(n, sizeof(float));
+    int i;
+    for(i = 0; i < n; ++i){
+        char *p = strchr(labels[i], ' ');
+        if(!p) continue;
+        *p = 0;
+        v[i] = atof(p+1);
+    }
+    return v;
+}
+
+// Trains a classification network on the data set described by datacfg.
+//
+//   datacfg         data-config file; read for the "backup", "labels", "train",
+//                   "classes" and "top" options
+//   cfgfile         network configuration, parsed once per GPU
+//   weightfile      optional weights loaded into every network copy (may be NULL)
+//   gpus, ngpus     device ids (only used in GPU builds) and their count
+//   clear           when non-zero, resets the seen-images and iteration counters
+//                   after the weights are loaded
+//   dontuse_opencv  forwarded to the data loader; also disables the training chart
+//   dont_show, mjpeg_port, chart_path
+//                   forwarded to the chart drawing helpers (OPENCV builds only)
+//   calc_topk       when non-zero, accuracy is evaluated periodically during training
+//   show_imgs       forwarded to the data loader (forced to 2 when dont_show is also set)
+//
+// Runs until net.max_batches is reached (forever when it is 0). Weights are
+// written to <backup>/<base>_<iter>.weights roughly every 1000 iterations, to
+// <backup>/<base>_last.weights roughly every 100 iterations and to
+// <backup>/<base>_final.weights when training ends.
+void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dontuse_opencv, int dont_show, int mjpeg_port, int calc_topk, int show_imgs, char* chart_path)
+{
+    int i;
+
+    float avg_loss = -1;
+    float avg_contrastive_acc = 0;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network* nets = (network*)xcalloc(ngpus, sizeof(network));
+
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        // Re-seed with the same value before each parse so every copy is built
+        // from the same random state.
+        srand(seed);
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = parse_network_cfg(cfgfile);
+        if(weightfile){
+            load_weights(&nets[i], weightfile);
+        }
+        if (clear) {
+            *nets[i].seen = 0;
+            *nets[i].cur_iteration = 0;
+        }
+        // The learning rate is scaled by the number of GPUs.
+        nets[i].learning_rate *= ngpus;
+    }
+    srand(time(0));
+    // net is a struct copy of the first network; it is the one used for
+    // bookkeeping, single-GPU training and saving weights below.
+    network net = nets[0];
+
+    // Number of images requested from the loader per training iteration.
+    int imgs = net.batch * net.subdivisions * ngpus;
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    list *options = read_data_cfg(datacfg);
+
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *train_list = option_find_str(options, "train", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk_data = option_find_int(options, "top", 5);
+    // NOTE(review): 10 bytes hold "top" plus at most 6 digits -- a larger "top" value would overflow.
+    char topk_buff[10];
+    sprintf(topk_buff, "top%d", topk_data);
+    // Sanity check: a SOFTMAX or COST last layer must have exactly `classes` outputs.
+    layer l = net.layers[net.n - 1];
+    if (classes != l.outputs && (l.type == SOFTMAX || l.type == COST)) {
+        printf("\n Error: num of filters = %d in the last conv-layer in cfg-file doesn't match to classes = %d in data-file \n",
+            l.outputs, classes);
+        error("Error!", DARKNET_LOC);
+    }
+
+    char **labels = get_labels(label_list);
+    // Unsupervised networks train without labels: the loader receives NULL.
+    // Only the pointer array is freed here, not the label strings.
+    if (net.unsupervised) {
+        free(labels);
+        labels = NULL;
+    }
+    list *plist = get_paths(train_list);
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int train_images_num = plist->size;
+    clock_t time;
+
+    // Loader configuration: input geometry and augmentation settings are
+    // copied from the network.
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.c = net.c;
+    args.threads = 32;
+    // For contrastive training the thread count is capped at half the batch size.
+    if (net.contrastive && args.threads > net.batch/2) args.threads = net.batch / 2;
+    args.hierarchy = net.hierarchy;
+
+    args.contrastive = net.contrastive;
+    args.dontuse_opencv = dontuse_opencv;
+    args.min = net.min_crop;
+    args.max = net.max_crop;
+    args.flip = net.flip;
+    args.blur = net.blur;
+    args.angle = net.angle;
+    args.aspect = net.aspect;
+    args.exposure = net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+    // The larger of the network's width and height.
+    args.size = net.w > net.h ? net.w : net.h;
+
+    args.label_smooth_eps = net.label_smooth_eps;
+    args.mixup = net.mixup;
+    if (dont_show && show_imgs) show_imgs = 2;
+    args.show_imgs = show_imgs;
+
+    args.paths = paths;
+    args.classes = classes;
+    args.n = imgs;
+    args.m = train_images_num;
+    args.labels = labels;
+    args.type = CLASSIFICATION_DATA;
+
+#ifdef OPENCV
+    //args.threads = 3;
+    // Training chart set-up; skipped when dontuse_opencv is set.
+    mat_cv* img = NULL;
+    float max_img_loss = net.max_chart_loss;
+    int number_of_lines = 100;
+    int img_size = 1000;
+    char windows_name[100];
+    sprintf(windows_name, "chart_%s.png", base);
+    if (!dontuse_opencv) img = draw_train_chart(windows_name, max_img_loss, net.max_batches, number_of_lines, img_size, dont_show, chart_path);
+#endif  //OPENCV
+
+    // The loader thread fills `buffer`; the loop below takes it over as
+    // `train` after joining the thread.
+    data train;
+    data buffer;
+    pthread_t load_thread;
+    args.d = &buffer;
+    load_thread = load_data(args);
+
+    // Iteration numbers of the last numbered checkpoint, the last "_last"
+    // checkpoint and the last accuracy evaluation.
+    int iter_save = get_current_batch(net);
+    int iter_save_last = get_current_batch(net);
+    int iter_topk = get_current_batch(net);
+    float topk = 0;
+
+    int count = 0;
+    double start, time_remaining, avg_time = -1, alpha_time = 0.01;
+    start = what_time_is_it_now();
+
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        time=clock();
+
+        // Take the batch loaded in the background and immediately request the next one.
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        // (Re)initialise the running average on the first iteration or after it became NaN/Inf.
+        if(avg_loss == -1 || isnan(avg_loss) || isinf(avg_loss)) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        i = get_current_batch(net);
+
+        // Iteration of the next accuracy evaluation, clamped from below by burn_in and by 100.
+        int calc_topk_for_each = iter_topk + 2 * train_images_num / (net.batch * net.subdivisions);  // calculate TOPk for each 2 Epochs
+        calc_topk_for_each = fmax(calc_topk_for_each, net.burn_in);
+        calc_topk_for_each = fmax(calc_topk_for_each, 100);
+        if (i % 10 == 0) {
+            if (calc_topk) {
+                fprintf(stderr, "\n (next TOP%d calculation at %d iterations) ", topk_data, calc_topk_for_each);
+                if (topk > 0) fprintf(stderr, " Last accuracy TOP%d = %2.2f %% \n", topk_data, topk * 100);
+            }
+
+            if (net.cudnn_half) {
+                if (i < net.burn_in * 3) fprintf(stderr, " Tensor Cores are disabled until the first %d iterations are reached.\n", 3 * net.burn_in);
+                else fprintf(stderr, " Tensor Cores are used.\n");
+            }
+        }
+
+        int draw_precision = 0;
+        if (calc_topk && (i >= calc_topk_for_each || i == net.max_batches)) {
+            iter_topk = i;
+            if (net.contrastive && l.type != SOFTMAX && l.type != COST) {
+                // Contrastive network without a SOFTMAX/COST head: report the
+                // first CONTRASTIVE layer's loss value divided by 100.
+                // NOTE(review): if no CONTRASTIVE layer exists, k == net.n here
+                // and net.layers[k] is out of range.
+                int k;
+                for (k = 0; k < net.n; ++k) if (net.layers[k].type == CONTRASTIVE) break;
+                topk = *(net.layers[k].loss) / 100;
+                sprintf(topk_buff, "Contr");
+            }
+            else {
+                topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP-n
+                printf("\n accuracy %s = %f \n", topk_buff, topk);
+            }
+            draw_precision = 1;
+        }
+
+        // Remaining-time estimate in hours, derived from the duration of this
+        // iteration and smoothed with an exponential moving average.
+        time_remaining = ((net.max_batches - i) / ngpus) * (what_time_is_it_now() - start) / 60 / 60;
+        // set initial value, even if resume training from 10000 iteration
+        if (avg_time < 0) avg_time = time_remaining;
+        else avg_time = alpha_time * time_remaining + (1 -  alpha_time) * avg_time;
+        start = what_time_is_it_now();
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images, %f hours left\n", get_current_batch(net), (float)(*net.seen)/ train_images_num, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen, avg_time);
+#ifdef OPENCV
+        if (net.contrastive) {
+            // Uses the loss value of the last CONTRASTIVE layer in the network.
+            float cur_con_acc = -1;
+            int k;
+            for (k = 0; k < net.n; ++k)
+                if (net.layers[k].type == CONTRASTIVE) cur_con_acc = *net.layers[k].loss;
+            if (cur_con_acc >= 0) avg_contrastive_acc = avg_contrastive_acc*0.99 + cur_con_acc * 0.01;
+            printf("  avg_contrastive_acc = %f \n", avg_contrastive_acc);
+        }
+        if (!dontuse_opencv) draw_train_loss(windows_name, img, img_size, avg_loss, max_img_loss, i, net.max_batches, topk, draw_precision, topk_buff, avg_contrastive_acc / 100, dont_show, mjpeg_port, avg_time);
+#endif  // OPENCV
+
+        // Numbered checkpoint once at least 1000 iterations have passed since the previous one.
+        if (i >= (iter_save + 1000)) {
+            iter_save = i;
+#ifdef GPU
+            if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+
+        // Rolling "_last" checkpoint once at least 100 iterations have passed since the previous one.
+        if (i >= (iter_save_last + 100)) {
+            iter_save_last = i;
+#ifdef GPU
+            if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            sprintf(buff, "%s/%s_last.weights", backup_directory, base);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+#ifdef GPU
+    if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+    char buff[256];
+    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+#ifdef OPENCV
+    release_mat(&img);
+    destroy_all_windows_cv();
+#endif
+
+    // A load was started at the top of the last iteration; wait for it and
+    // discard the batch it produced.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+
+    //free_network(net);
+    for (i = 0; i < ngpus; ++i) free_network(nets[i]);
+    free(nets);
+
+    //free_ptrs((void**)labels, classes);
+    // Only the label pointer array is released here.
+    if(labels) free(labels);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+
+    free_list_contents_kvp(options);
+    free_list(options);
+
+}
+
+
+/*
+   void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int clear)
+   {
+   srand(time(0));
+   float avg_loss = -1;
+   char *base = basecfg(cfgfile);
+   printf("%s\n", base);
+   network net = parse_network_cfg(cfgfile);
+   if(weightfile){
+   load_weights(&net, weightfile);
+   }
+   if(clear) *net.seen = 0;
+
+   int imgs = net.batch * net.subdivisions;
+
+   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+   list *options = read_data_cfg(datacfg);
+
+   char *backup_directory = option_find_str(options, "backup", "/backup/");
+   char *label_list = option_find_str(options, "labels", "data/labels.list");
+   char *train_list = option_find_str(options, "train", "data/train.list");
+   int classes = option_find_int(options, "classes", 2);
+
+   char **labels = get_labels(label_list);
+   list *plist = get_paths(train_list);
+   char **paths = (char **)list_to_array(plist);
+   printf("%d\n", plist->size);
+   int N = plist->size;
+   clock_t time;
+
+   load_args args = {0};
+   args.w = net.w;
+   args.h = net.h;
+   args.threads = 8;
+
+   args.min = net.min_crop;
+   args.max = net.max_crop;
+   args.flip = net.flip;
+   args.angle = net.angle;
+   args.aspect = net.aspect;
+   args.exposure = net.exposure;
+   args.saturation = net.saturation;
+   args.hue = net.hue;
+   args.size = net.w;
+   args.hierarchy = net.hierarchy;
+
+   args.paths = paths;
+   args.classes = classes;
+   args.n = imgs;
+   args.m = N;
+   args.labels = labels;
+   args.type = CLASSIFICATION_DATA;
+
+   data train;
+   data buffer;
+   pthread_t load_thread;
+   args.d = &buffer;
+   load_thread = load_data(args);
+
+   int epoch = (*net.seen)/N;
+   while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+   time=clock();
+
+   pthread_join(load_thread, 0);
+   train = buffer;
+   load_thread = load_data(args);
+
+   printf("Loaded: %lf seconds\n", sec(clock()-time));
+   time=clock();
+
+#ifdef OPENCV
+if(0){
+int u;
+for(u = 0; u < imgs; ++u){
+    image im = float_to_image(net.w, net.h, 3, train.X.vals[u]);
+    show_image(im, "loaded");
+    cvWaitKey(0);
+}
+}
+#endif
+
+float loss = train_network(net, train);
+free_data(train);
+
+if(avg_loss == -1) avg_loss = loss;
+avg_loss = avg_loss*.9 + loss*.1;
+printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+if(*net.seen/N > epoch){
+    epoch = *net.seen/N;
+    char buff[256];
+    sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
+    save_weights(net, buff);
+}
+if(get_current_batch(net)%100 == 0){
+    char buff[256];
+    sprintf(buff, "%s/%s.backup",backup_directory,base);
+    save_weights(net, buff);
+}
+}
+char buff[256];
+sprintf(buff, "%s/%s.weights", backup_directory, base);
+save_weights(net, buff);
+
+free_network(net);
+free_ptrs((void**)labels, classes);
+free_ptrs((void**)paths, plist->size);
+free_list(plist);
+free(base);
+}
+*/
+
+/*
+ * Validate a classifier on the "valid" image list, split into m/1000 chunks.
+ * Each chunk is loaded on a background thread while the previous one is
+ * evaluated; the running top-1 / top-k averages are printed per chunk.
+ *
+ *   datacfg    - data cfg file; read for "labels", "valid", "classes", "top".
+ *   filename   - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ */
+void validate_classifier_crop(char *datacfg, char *filename, char *weightfile)
+{
+    int i = 0;
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+    if (topk > classes) topk = classes;
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    clock_t time;
+    float avg_acc = 0;
+    float avg_topk = 0;
+    // With fewer than 1000 images m/1000 is 0, which previously caused a
+    // division by zero below; always use at least one chunk.
+    int splits = m/1000;
+    if (splits < 1) splits = 1;
+    int num = (i+1)*m/splits - i*m/splits;
+
+    data val, buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+
+    args.paths = paths;
+    args.classes = classes;
+    args.n = num;
+    args.m = 0;
+    args.labels = labels;
+    args.d = &buffer;
+    args.type = OLD_CLASSIFICATION_DATA;
+
+    if(m > 0){
+        pthread_t load_thread = load_data_in_thread(args);
+        for(i = 1; i <= splits; ++i){
+            time=clock();
+
+            // Wait for the chunk queued on the previous pass.
+            pthread_join(load_thread, 0);
+            val = buffer;
+
+            if(i != splits){
+                // Queue the next chunk. Chunk sizes can differ between
+                // chunks, so the count is refreshed with the start pointer.
+                num = (i+1)*m/splits - i*m/splits;
+                args.paths = paths+(i*m/splits);
+                args.n = num;
+                load_thread = load_data_in_thread(args);
+            }
+            printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
+
+            time=clock();
+            float *acc = network_accuracies(net, val, topk);
+            avg_acc += acc[0];
+            avg_topk += acc[1];
+            printf("%d: top 1: %f, top %d: %f, %lf seconds, %d images\n", i, avg_acc/i, topk, avg_topk/i, sec(clock()-time), val.X.rows);
+            free_data(val);
+        }
+    }
+
+    // All loader threads have been joined; release what this function owns.
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+/*
+ * Validate a classifier by summing predictions over ten crops per image:
+ * five crops of the loaded image and the same five after flip_image().
+ * The expected class of an image is the first label that occurs as a
+ * substring of its path. Running top-1 / top-k accuracy is printed per image.
+ *
+ *   datacfg    - data cfg file; read for "labels", "valid", "classes", "top".
+ *   filename   - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ */
+void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
+{
+    int i, j;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+    if (topk > classes) topk = classes;
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int* indexes = (int*)xcalloc(topk, sizeof(int));
+
+    for(i = 0; i < m; ++i){
+        // class_id stays -1 when no label matches the path.
+        int class_id = -1;
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class_id = j;
+                break;
+            }
+        }
+        int w = net.w;
+        int h = net.h;
+        int shift = 32;
+        // Load slightly larger than the network input so shifted crops fit.
+        image im = load_image_color(paths[i], w+shift, h+shift);
+        image images[10];
+        images[0] = crop_image(im, -shift, -shift, w, h);
+        images[1] = crop_image(im, shift, -shift, w, h);
+        images[2] = crop_image(im, 0, 0, w, h);
+        images[3] = crop_image(im, -shift, shift, w, h);
+        images[4] = crop_image(im, shift, shift, w, h);
+        flip_image(im);
+        images[5] = crop_image(im, -shift, -shift, w, h);
+        images[6] = crop_image(im, shift, -shift, w, h);
+        images[7] = crop_image(im, 0, 0, w, h);
+        images[8] = crop_image(im, -shift, shift, w, h);
+        images[9] = crop_image(im, shift, shift, w, h);
+        // Accumulate the per-class scores of all ten crops.
+        float* pred = (float*)xcalloc(classes, sizeof(float));
+        for(j = 0; j < 10; ++j){
+            float *p = network_predict(net, images[j].data);
+            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
+            axpy_cpu(classes, 1, p, 1, pred, 1);
+            free_image(images[j]);
+        }
+        free_image(im);
+        top_k(pred, classes, topk, indexes);
+        free(pred);
+        if(indexes[0] == class_id) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class_id) avg_topk += 1;
+        }
+
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+    free(indexes);
+
+    // Release everything this function allocated (previously leaked).
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+/*
+ * Validate a classifier on whole images: each image is resized with
+ * resize_min() to net.w, the network is resized to the resulting
+ * dimensions, and one prediction is made per image. The expected class is
+ * the first label that occurs as a substring of the image path.
+ *
+ *   datacfg    - data cfg file; read for "labels", "valid", "classes", "top".
+ *   filename   - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ */
+void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
+{
+    int i, j;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+    if (topk > classes) topk = classes;
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int* indexes = (int*)xcalloc(topk, sizeof(int));
+
+    // Captured once: resize_network() below changes net.w per image.
+    int size = net.w;
+    for(i = 0; i < m; ++i){
+        // class_id stays -1 when no label matches the path.
+        int class_id = -1;
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class_id = j;
+                break;
+            }
+        }
+        image im = load_image_color(paths[i], 0, 0);
+        image resized = resize_min(im, size);
+        resize_network(&net, resized.w, resized.h);
+        //show_image(im, "orig");
+        //show_image(crop, "cropped");
+        //cvWaitKey(0);
+        float *pred = network_predict(net, resized.data);
+        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
+
+        // resize_min() may hand back the input image itself; freeing both
+        // unconditionally would then free the same buffer twice.
+        if(resized.data != im.data) free_image(resized);
+        free_image(im);
+        top_k(pred, classes, topk, indexes);
+
+        if(indexes[0] == class_id) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class_id) avg_topk += 1;
+        }
+
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+    free(indexes);
+
+    // Release everything this function allocated (previously leaked).
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+
+/*
+ * Validate a classifier with one centre crop per image and return the
+ * top-k accuracy over the "valid" list.
+ *
+ *   datacfg      - data cfg file; read for "labels", "leaves", "valid",
+ *                  "classes" and "top".
+ *   filename     - network cfg file (used only when existing_net is NULL).
+ *   weightfile   - weights to load, or NULL (used only when existing_net
+ *                  is NULL).
+ *   existing_net - network to validate in place (validation during
+ *                  training), or NULL to parse one from filename. Its batch
+ *                  size is set to 1 for the run and restored afterwards.
+ *   topk_custom  - when > 0, overrides the "top" value from datacfg.
+ *
+ * Returns avg_topk / m, or 0 when the validation list is empty.
+ */
+float validate_classifier_single(char *datacfg, char *filename, char *weightfile, network *existing_net, int topk_custom)
+{
+    int i, j;
+    network net;
+    int old_batch = -1;
+    if (existing_net) {
+        net = *existing_net;    // for validation during training
+        old_batch = net.batch;
+        set_batch_network(&net, 1);
+    }
+    else {
+        net = parse_network_cfg_custom(filename, 1, 0);
+        if (weightfile) {
+            load_weights(&net, weightfile);
+        }
+        //set_batch_network(&net, 1);
+        fuse_conv_batchnorm(net);
+        calculate_binary_weights(net);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *leaf_list = option_find_str(options, "leaves", 0);
+    if(leaf_list) change_leaves(net.hierarchy, leaf_list);
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+    if (topk_custom > 0) topk = topk_custom;    // for validation during training
+    if (topk > classes) topk = classes;
+    printf(" TOP calculation...\n");
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int* indexes = (int*)xcalloc(topk, sizeof(int));
+
+    for(i = 0; i < m; ++i){
+        // class_id stays -1 when no label matches the path.
+        int class_id = -1;
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class_id = j;
+                break;
+            }
+        }
+        image im = load_image_color(paths[i], 0, 0);
+        image resized = resize_min(im, net.w);
+        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+        //show_image(im, "orig");
+        //show_image(crop, "cropped");
+        //cvWaitKey(0);
+        float *pred = network_predict(net, crop.data);
+        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
+
+        // resize_min() may return the input image itself; avoid a double free.
+        if(resized.data != im.data) free_image(resized);
+        free_image(im);
+        free_image(crop);
+        top_k(pred, classes, topk, indexes);
+
+        if(indexes[0] == class_id) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class_id) avg_topk += 1;
+        }
+
+        // During training the progress line is overwritten in place.
+        if (existing_net) printf("\r");
+        else printf("\n");
+        printf("%d: top 1: %f, top %d: %f", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+    free(indexes);
+
+    // This function runs repeatedly during training, so the per-call
+    // allocations must be released here rather than left to process exit.
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+
+    if (existing_net) {
+        set_batch_network(&net, old_batch);
+    }
+    else {
+        // Only a network parsed here is owned by this function.
+        free_network(net);
+    }
+    // An empty list previously produced 0/0 (NaN).
+    float topk_result = (m > 0) ? avg_topk / m : 0;
+    return topk_result;
+}
+
+/*
+ * Validate a classifier at several scales: every image is resized with
+ * resize_min() to each entry of scales[], predicted both as-is and after
+ * flip_image(), and all predictions are summed before taking the top-k.
+ * The expected class is the first label that occurs as a substring of the
+ * image path.
+ *
+ *   datacfg    - data cfg file; read for "labels", "valid", "classes", "top".
+ *   filename   - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ */
+void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
+{
+    int i, j;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+    if (topk > classes) topk = classes;
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+    int scales[] = {224, 288, 320, 352, 384};
+    int nscales = sizeof(scales)/sizeof(scales[0]);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int* indexes = (int*)xcalloc(topk, sizeof(int));
+
+    for(i = 0; i < m; ++i){
+        // class_id stays -1 when no label matches the path.
+        int class_id = -1;
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class_id = j;
+                break;
+            }
+        }
+        // Sum of per-class scores over every scale and both orientations.
+        float* pred = (float*)xcalloc(classes, sizeof(float));
+        image im = load_image_color(paths[i], 0, 0);
+        for(j = 0; j < nscales; ++j){
+            image r = resize_min(im, scales[j]);
+            resize_network(&net, r.w, r.h);
+            float *p = network_predict(net, r.data);
+            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
+            axpy_cpu(classes, 1, p, 1, pred, 1);
+            flip_image(r);
+            p = network_predict(net, r.data);
+            // The flipped pass gets the same hierarchy post-processing as
+            // the unflipped one so both summands are on the same footing.
+            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
+            axpy_cpu(classes, 1, p, 1, pred, 1);
+            // resize_min() may return im itself; it is freed once below.
+            if(r.data != im.data) free_image(r);
+        }
+        free_image(im);
+        top_k(pred, classes, topk, indexes);
+        free(pred);
+        if(indexes[0] == class_id) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class_id) avg_topk += 1;
+        }
+
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+    free(indexes);
+
+    // Release everything this function allocated (previously leaked).
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+/*
+ * Debug helper: classify one image (or images typed on stdin) and dump the
+ * state of one layer. The image is resized with resize_min() to 256,
+ * cropped to 224x224 and normalized with fixed per-channel mean/std values
+ * before prediction.
+ *
+ *   datacfg    - data cfg file; read for "names" (falling back to
+ *                "labels"), "classes" and "top".
+ *   cfgfile    - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ *   filename   - image to classify; when NULL, paths are read from stdin
+ *                until end of input.
+ *   layer_num  - index into net.layers of the layer to dump; must lie in
+ *                [0, net.n).
+ */
+void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int layer_num)
+{
+    network net = parse_network_cfg_custom(cfgfile, 1, 0);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+
+    // net.layers[layer_num] is read below; reject an index outside the net.
+    if(layer_num < 0 || layer_num >= net.n){
+        fprintf(stderr, "try_classifier: layer %d is out of range, network has %d layers\n", layer_num, net.n);
+        error("Error!", DARKNET_LOC);
+    }
+
+    list *options = read_data_cfg(datacfg);
+
+    char *name_list = option_find_str(options, "names", 0);
+    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
+    int classes = option_find_int(options, "classes", 2);
+    int top = option_find_int(options, "top", 1);
+    if (top > classes) top = classes;
+
+    char **names = get_labels(name_list);
+    clock_t time;
+    int* indexes = (int*)xcalloc(top, sizeof(int));
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            // strncpy() does not terminate the copy when the source fills
+            // the buffer, so terminate it explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            buff[sizeof(buff) - 1] = 0;
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) break;
+            strtok(input, "\n");
+        }
+        image orig = load_image_color(input, 0, 0);
+        image r = resize_min(orig, 256);
+        image im = crop_image(r, (r.w - 224 - 1)/2 + 1, (r.h - 224 - 1)/2 + 1, 224, 224);
+        float mean[] = {0.48263312050943, 0.45230225481413, 0.40099074308742};
+        float std[] = {0.22590347483426, 0.22120921437787, 0.22103996251583};
+        float var[3];
+        var[0] = std[0]*std[0];
+        var[1] = std[1]*std[1];
+        var[2] = std[2]*std[2];
+
+        normalize_cpu(im.data, mean, var, 1, 3, im.w*im.h);
+
+        float *X = im.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+
+        layer l = net.layers[layer_num];
+        int i;
+        for(i = 0; i < l.c; ++i){
+            if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]);
+        }
+#ifdef GPU
+        cuda_pull_array(l.output_gpu, l.output, l.outputs);
+#endif
+        for(i = 0; i < l.outputs; ++i){
+            printf("%f\n", l.output[i]);
+        }
+        /*
+
+           printf("\n\nWeights\n");
+           for(i = 0; i < l.n*l.size*l.size*l.c; ++i){
+           printf("%f\n", l.filters[i]);
+           }
+
+           printf("\n\nBiases\n");
+           for(i = 0; i < l.n; ++i){
+           printf("%f\n", l.biases[i]);
+           }
+         */
+
+        top_predictions(net, top, indexes);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        for(i = 0; i < top; ++i){
+            int index = indexes[i];
+            printf("%s: %f\n", names[index], predictions[index]);
+        }
+        free_image(im);
+        // The source image and its resized copy were previously leaked on
+        // every iteration. resize_min() may return orig itself.
+        if(r.data != orig.data) free_image(r);
+        free_image(orig);
+        if (filename) break;
+    }
+    free(indexes);
+    if(names) free(names);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+/*
+ * Classify one image (or images typed on stdin) and print the top
+ * predictions with their scores. Each image is resized with resize_min()
+ * to net.w and centre-cropped to net.w x net.h.
+ *
+ *   datacfg    - data cfg file; read for "names" (falling back to
+ *                "labels"), "classes" and "top".
+ *   cfgfile    - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ *   filename   - image to classify; when NULL, paths are read from stdin
+ *                until end of input.
+ *   top        - number of predictions to print; 0 means "use the value
+ *                from datacfg". Clamped to classes.
+ */
+void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top)
+{
+    network net = parse_network_cfg_custom(cfgfile, 1, 0);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+
+    list *options = read_data_cfg(datacfg);
+
+    char *name_list = option_find_str(options, "names", 0);
+    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
+    int classes = option_find_int(options, "classes", 2);
+    printf(" classes = %d, output in cfg = %d \n", classes, net.layers[net.n - 1].c);
+    layer l = net.layers[net.n - 1];
+    // A class-count mismatch is fatal only for SOFTMAX / COST final layers.
+    if (classes != l.outputs && (l.type == SOFTMAX || l.type == COST)) {
+        printf("\n Error: num of filters = %d in the last conv-layer in cfg-file doesn't match to classes = %d in data-file \n",
+            l.outputs, classes);
+        error("Error!", DARKNET_LOC);
+    }
+    if (top == 0) top = option_find_int(options, "top", 1);
+    if (top > classes) top = classes;
+
+    int i = 0;
+    char **names = get_labels(name_list);
+    int* indexes = (int*)xcalloc(top, sizeof(int));
+    char buff[256];
+    char *input = buff;
+    //int size = net.w;
+    while(1){
+        if(filename){
+            // strncpy() does not terminate the copy when the source fills
+            // the buffer, so terminate it explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            buff[sizeof(buff) - 1] = 0;
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) break;
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, 0, 0);
+        image resized = resize_min(im, net.w);
+        image cropped = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+        printf("%d %d\n", cropped.w, cropped.h);
+
+        float *X = cropped.data;
+
+        double time = get_time_point();
+        float *predictions = network_predict(net, X);
+        printf("%s: Predicted in %lf milli-seconds.\n", input, ((double)get_time_point() - time) / 1000);
+
+        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0);
+        top_k(predictions, net.outputs, top, indexes);
+
+        for(i = 0; i < top; ++i){
+            int index = indexes[i];
+            if(net.hierarchy) printf("%d, %s: %f, parent: %s \n",index, names[index], predictions[index], (net.hierarchy->parent[index] >= 0) ? names[net.hierarchy->parent[index]] : "Root");
+            else printf("%s: %f\n",names[index], predictions[index]);
+        }
+
+        free_image(cropped);
+        // resize_min() may return im itself; avoid a double free.
+        if (resized.data != im.data) {
+            free_image(resized);
+        }
+        free_image(im);
+
+        if (filename) break;
+    }
+    free(indexes);
+    if(names) free(names);
+    free_network(net);
+    free_list_contents_kvp(options);
+    free_list(options);
+}
+
+
+/*
+ * Print the predicted label for every image in the "test" list, one per
+ * line. Each image is resized with resize_min() to net.w and centre-cropped
+ * to net.w x net.h; the label is the class with the highest score.
+ *
+ *   datacfg    - data cfg file; read for "names", "test" and "classes".
+ *   filename   - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ */
+void label_classifier(char *datacfg, char *filename, char *weightfile)
+{
+    int i;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "names", "data/labels.list");
+    char *test_list = option_find_str(options, "test", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(test_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    for(i = 0; i < m; ++i){
+        image im = load_image_color(paths[i], 0, 0);
+        image resized = resize_min(im, net.w);
+        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+        float *pred = network_predict(net, crop.data);
+
+        // resize_min() may return im itself; avoid a double free.
+        if(resized.data != im.data) free_image(resized);
+        free_image(im);
+        free_image(crop);
+        int ind = max_index(pred, classes);
+
+        printf("%s\n", labels[ind]);
+    }
+
+    // Release everything this function allocated (previously leaked).
+    free_ptrs((void**)paths, m);
+    if(labels) free(labels);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+
+/*
+ * Run the network over every image in the "test" list in batches of
+ * net.batch and print, for each image, its path followed by the
+ * tab-separated prediction values. The next batch is loaded on a background
+ * thread while the current one is evaluated. Timing goes to stderr.
+ *
+ *   datacfg      - data cfg file; read for "test" and "classes".
+ *   cfgfile      - network cfg file.
+ *   weightfile   - weights to load, or NULL to skip loading.
+ *   target_layer - accepted for interface compatibility; not used.
+ */
+void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_layer)
+{
+    (void)target_layer;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+
+    list *options = read_data_cfg(datacfg);
+
+    char *test_list = option_find_str(options, "test", "data/test.list");
+    int classes = option_find_int(options, "classes", 2);
+
+    list *plist = get_paths(test_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    clock_t time;
+
+    data val, buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.classes = classes;
+    // Never request more images than the list holds.
+    args.n = (m < net.batch) ? m : net.batch;
+    args.m = 0;
+    args.labels = 0;
+    args.d = &buffer;
+    args.type = OLD_CLASSIFICATION_DATA;
+
+    if(m > 0 && net.batch > 0){
+        // start: index of the first image of the batch being joined.
+        int start = 0;
+        pthread_t load_thread = load_data_in_thread(args);
+        while(start < m){
+            // args.n still holds the size of the batch being joined here.
+            int next = start + args.n;
+            time=clock();
+
+            pthread_join(load_thread, 0);
+            val = buffer;
+
+            // Queue the following batch, shortened at the end of the list.
+            // The previous loop stopped before joining the last queued
+            // batch, so its images were never evaluated.
+            if(next < m){
+                args.paths = paths + next;
+                if (next + net.batch > m) args.n = m - next;
+                load_thread = load_data_in_thread(args);
+            }
+            fprintf(stderr, "Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
+
+            time=clock();
+            matrix pred = network_predict_data(net, val);
+
+            int i, j;
+            for(i = 0; i < pred.rows; ++i){
+                printf("%s", paths[start+i]);
+                for(j = 0; j < pred.cols; ++j){
+                    printf("\t%g", pred.vals[i][j]);
+                }
+                printf("\n");
+            }
+
+            free_matrix(pred);
+
+            fprintf(stderr, "%lf seconds, %d images, %d total\n", sec(clock()-time), val.X.rows, next);
+            free_data(val);
+            start = next;
+        }
+    }
+
+    // All loader threads have been joined; release what this function owns.
+    free_ptrs((void**)paths, m);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+
+/*
+ * Live "threat meter" demo (compiled only when OPENCV is defined; otherwise
+ * the function body is empty). Frames are read from a video file or webcam,
+ * classified, and a smoothed threat value derived from the predictions is
+ * drawn onto the frame as a vertical gauge with two threshold markers.
+ *
+ *   datacfg    - data cfg file; read for "classes", "top" and "names"
+ *                (falling back to "labels").
+ *   cfgfile    - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ *   cam_index  - webcam index, used when filename is NULL.
+ *   filename   - video stream to read instead of the webcam, or NULL.
+ *
+ * The loop ends when the stream returns a frame without data.
+ */
+void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
+{
+#ifdef OPENCV
+    float threat = 0;
+    float roll = .2;
+
+    printf("Classifier Demo\n");
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    list *options = read_data_cfg(datacfg);
+
+    srand(2222222);
+    cap_cv * cap;
+
+    if (filename) {
+        //cap = cvCaptureFromFile(filename);
+        cap = get_capture_video_stream(filename);
+    }
+    else {
+        //cap = cvCaptureFromCAM(cam_index);
+        cap = get_capture_webcam(cam_index);
+    }
+
+    int classes = option_find_int(options, "classes", 2);
+    int top = option_find_int(options, "top", 1);
+    if (top > classes) top = classes;
+
+    char *name_list = option_find_str(options, "names", 0);
+    // Without this fallback a data file lacking "names" passed NULL on.
+    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
+    char **names = get_labels(name_list);
+
+    int* indexes = (int*)xcalloc(top, sizeof(int));
+
+    if(!cap) error("Couldn't connect to webcam.", DARKNET_LOC);
+    create_window_cv("Threat", 0, 512, 512);
+    float fps = 0;
+    int i;
+
+    int count = 0;
+
+    while(1){
+        ++count;
+        struct timeval tval_before, tval_after, tval_result;
+        gettimeofday(&tval_before, NULL);
+
+        //image in = get_image_from_stream(cap);
+        image in = get_image_from_stream_cpp(cap);
+        if(!in.data) break;
+        image in_s = resize_image(in, net.w, net.h);
+
+        // "out" aliases the captured frame; the gauge is drawn onto it.
+        image out = in;
+        int x1 = out.w / 20;
+        int y1 = out.h / 20;
+        int x2 = 2*x1;
+        int y2 = out.h - out.h/20;
+
+        int border = .01*out.h;
+        int h = y2 - y1 - 2*border;
+        int w = x2 - x1 - 2*border;
+
+        float *predictions = network_predict(net, in_s.data);
+        float curr_threat = 0;
+        if(1){
+            curr_threat = predictions[0] * 0 +
+                predictions[1] * .6 +
+                predictions[2];
+        } else {
+            curr_threat = predictions[218] +
+                predictions[539] +
+                predictions[540] +
+                predictions[368] +
+                predictions[369] +
+                predictions[370];
+        }
+        // Exponential moving average of the per-frame threat value.
+        threat = roll * curr_threat + (1-roll) * threat;
+
+        draw_box_width(out, x2 + border, y1 + .02*h, x2 + .5 * w, y1 + .02*h + border, border, 0,0,0);
+        if(threat > .97) {
+            draw_box_width(out,  x2 + .5 * w + border,
+                    y1 + .02*h - 2*border,
+                    x2 + .5 * w + 6*border,
+                    y1 + .02*h + 3*border, 3*border, 1,0,0);
+        }
+        draw_box_width(out,  x2 + .5 * w + border,
+                y1 + .02*h - 2*border,
+                x2 + .5 * w + 6*border,
+                y1 + .02*h + 3*border, .5*border, 0,0,0);
+        draw_box_width(out, x2 + border, y1 + .42*h, x2 + .5 * w, y1 + .42*h + border, border, 0,0,0);
+        if(threat > .57) {
+            draw_box_width(out,  x2 + .5 * w + border,
+                    y1 + .42*h - 2*border,
+                    x2 + .5 * w + 6*border,
+                    y1 + .42*h + 3*border, 3*border, 1,1,0);
+        }
+        draw_box_width(out,  x2 + .5 * w + border,
+                y1 + .42*h - 2*border,
+                x2 + .5 * w + 6*border,
+                y1 + .42*h + 3*border, .5*border, 0,0,0);
+
+        draw_box_width(out, x1, y1, x2, y2, border, 0,0,0);
+        // Fill the gauge from the bottom up to threat * h, one row at a time.
+        for(i = 0; i < threat * h ; ++i){
+            float ratio = (float) i / h;
+            float r = (ratio < .5) ? (2*(ratio)) : 1;
+            float g = (ratio < .5) ? 1 : 1 - 2*(ratio - .5);
+            draw_box_width(out, x1 + border, y2 - border - i, x2 - border, y2 - border - i, 1, r, g, 0);
+        }
+        top_predictions(net, top, indexes);
+        char buff[256];
+        sprintf(buff, "tmp/threat_%06d", count);
+        //save_image(out, buff);
+
+        printf("\033[H\033[J");
+        printf("\nFPS:%.0f\n", fps);
+
+        for(i = 0; i < top; ++i){
+            int index = indexes[i];
+            printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
+        }
+
+        if(1){
+            show_image(out, "Threat");
+            wait_key_cv(10);
+        }
+        free_image(in_s);
+        free_image(in);
+
+        gettimeofday(&tval_after, NULL);
+        timersub(&tval_after, &tval_before, &tval_result);
+        // Include whole seconds (tv_usec alone wraps for frames slower than
+        // one second) and avoid dividing by a zero interval.
+        double elapsed_us = 1000000.0*tval_result.tv_sec + tval_result.tv_usec;
+        float curr = (elapsed_us > 0) ? (float)(1000000.0/elapsed_us) : 0;
+        fps = .9*fps + .1*curr;
+    }
+
+    // Reached once the stream is exhausted.
+    free(indexes);
+    if(names) free(names);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+#endif
+}
+
+
+/*
+ * Webcam / video demo that prints "Threat Detected!" when any class listed
+ * in bad_cats scores above .01, and "Scanning..." otherwise.
+ *
+ * The whole body is guarded by OPENCV_DISABLE, so the function is empty
+ * unless that macro is defined.
+ *
+ *   datacfg    - data cfg file; read for "classes", "top" and "names".
+ *   cfgfile    - network cfg file.
+ *   weightfile - weights to load, or NULL to skip loading.
+ *   cam_index  - webcam index, used when filename is NULL.
+ *   filename   - video stream to read instead of the webcam, or NULL.
+ */
+void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
+{
+#ifdef OPENCV_DISABLE
+    // Class indices that count as a threat when predicted above .01.
+    int bad_cats[] = {218, 539, 540, 1213, 1501, 1742, 1911, 2415, 4348, 19223, 368, 369, 370, 1133, 1200, 1306, 2122, 2301, 2537, 2823, 3179, 3596, 3639, 4489, 5107, 5140, 5289, 6240, 6631, 6762, 7048, 7171, 7969, 7984, 7989, 8824, 8927, 9915, 10270, 10448, 13401, 15205, 18358, 18894, 18895, 19249, 19697};
+
+    printf("Classifier Demo\n");
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    list *options = read_data_cfg(datacfg);
+
+    srand(2222222);
+    CvCapture * cap;
+
+    if (filename) {
+        //cap = cvCaptureFromFile(filename);
+        cap = get_capture_video_stream(filename);
+    }
+    else {
+        //cap = cvCaptureFromCAM(cam_index);
+        cap = get_capture_webcam(cam_index);
+    }
+
+    int classes = option_find_int(options, "classes", 2);
+    int top = option_find_int(options, "top", 1);
+    if (top > classes) top = classes;
+
+    // NOTE(review): "names" has no default here, so name_list may be NULL
+    // when the data file omits it - confirm get_labels() tolerates that.
+    char *name_list = option_find_str(options, "names", 0);
+    char **names = get_labels(name_list);
+
+    int* indexes = (int*)xcalloc(top, sizeof(int));
+
+    if(!cap) error("Couldn't connect to webcam.", DARKNET_LOC);
+    cvNamedWindow("Threat Detection", CV_WINDOW_NORMAL);
+    cvResizeWindow("Threat Detection", 512, 512);
+    float fps = 0;
+    int i;
+
+    // No exit condition: the loop runs until the process is terminated.
+    while(1){
+        struct timeval tval_before, tval_after, tval_result;
+        gettimeofday(&tval_before, NULL);
+
+        //image in = get_image_from_stream(cap);
+        image in = get_image_from_stream_cpp(cap);
+        image in_s = resize_image(in, net.w, net.h);
+        show_image(in, "Threat Detection");
+
+        float *predictions = network_predict(net, in_s.data);
+        // indexes is filled here but not read again in this function.
+        top_predictions(net, top, indexes);
+
+        // ANSI escape sequence: cursor home, then clear the screen.
+        printf("\033[H\033[J");
+
+        // First pass: decide whether any listed class crosses the threshold.
+        int threat = 0;
+        for(i = 0; i < sizeof(bad_cats)/sizeof(bad_cats[0]); ++i){
+            int index = bad_cats[i];
+            if(predictions[index] > .01){
+                printf("Threat Detected!\n");
+                threat = 1;
+                break;
+            }
+        }
+        if(!threat) printf("Scanning...\n");
+        // Second pass: print the name of every listed class over the threshold.
+        for(i = 0; i < sizeof(bad_cats)/sizeof(bad_cats[0]); ++i){
+            int index = bad_cats[i];
+            if(predictions[index] > .01){
+                printf("%s\n", names[index]);
+            }
+        }
+
+        free_image(in_s);
+        free_image(in);
+
+        cvWaitKey(10);
+
+        // fps is a smoothed estimate from the microsecond part of the frame
+        // time; it is computed but never printed in this function.
+        gettimeofday(&tval_after, NULL);
+        timersub(&tval_after, &tval_before, &tval_result);
+        float curr = 1000000.f/((long int)tval_result.tv_usec);
+        fps = .9*fps + .1*curr;
+    }
+#endif
+}
+
+/*
+ * Live classification demo (compiled only when OPENCV is defined; otherwise
+ * the function body is empty). Frames are read from a video file or webcam,
+ * classified, and the top predictions are printed together with a smoothed
+ * FPS figure.
+ *
+ *   datacfg          - data cfg file; read for "classes", "top" and "names"
+ *                      (falling back to "labels").
+ *   cfgfile          - network cfg file.
+ *   weightfile       - weights to load, or NULL to skip loading.
+ *   cam_index        - webcam index, used when filename is NULL.
+ *   filename         - video stream to read instead of the webcam, or NULL.
+ *   benchmark        - when non-zero, no frames are captured or shown: one
+ *                      fixed net.w x net.h image is predicted repeatedly
+ *                      and only FPS figures are printed.
+ *   benchmark_layers - stored in net.benchmark_layers.
+ *
+ * Outside benchmark mode the loop ends on key code 27 or 1048603, or when
+ * the stream returns a frame without data.
+ */
+void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int benchmark, int benchmark_layers)
+{
+#ifdef OPENCV
+    printf("Classifier Demo\n");
+    network net = parse_network_cfg_custom(cfgfile, 1, 0);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    net.benchmark_layers = benchmark_layers;
+    set_batch_network(&net, 1);
+    list *options = read_data_cfg(datacfg);
+
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+
+    srand(2222222);
+    cap_cv * cap;
+
+    if(filename){
+        cap = get_capture_video_stream(filename);
+    }else{
+        cap = get_capture_webcam(cam_index);
+    }
+
+    int classes = option_find_int(options, "classes", 2);
+    int top = option_find_int(options, "top", 1);
+    if (top > classes) top = classes;
+
+    char *name_list = option_find_str(options, "names", 0);
+    // Without this fallback a data file lacking "names" passed NULL on.
+    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
+    char **names = get_labels(name_list);
+
+    int* indexes = (int*)xcalloc(top, sizeof(int));
+
+    if(!cap) error("Couldn't connect to webcam.", DARKNET_LOC);
+    if (!benchmark) create_window_cv("Classifier", 0, 512, 512);
+    float fps = 0;
+    int i;
+
+    double start_time = get_time_point();
+    float avg_fps = 0;
+    int frame_counter = 0;
+
+    while(1){
+        //image in = get_image_from_stream(cap);
+        image in_s, in;
+        if (!benchmark) {
+            in = get_image_from_stream_cpp(cap);
+            // End of stream: stop instead of resizing an empty frame.
+            if (!in.data) break;
+            in_s = resize_image(in, net.w, net.h);
+            show_image(in, "Classifier");
+        }
+        else {
+            // Benchmark mode reuses one image allocated on first use.
+            static image tmp;
+            if (!tmp.data) tmp = make_image(net.w, net.h, 3);
+            in_s = tmp;
+        }
+
+        double time = get_time_point();
+        float *predictions = network_predict(net, in_s.data);
+        double frame_time_ms = (get_time_point() - time)/1000;
+        frame_counter++;
+
+        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1);
+        top_predictions(net, top, indexes);
+
+        printf("\033[H\033[J");
+
+        if (!benchmark) {
+            printf("\rFPS: %.2f  (use -benchmark command line flag for correct measurement)\n", fps);
+            for (i = 0; i < top; ++i) {
+                int index = indexes[i];
+                printf("%.1f%%: %s\n", predictions[index] * 100, names[index]);
+            }
+            printf("\n");
+
+            free_image(in_s);
+            free_image(in);
+
+            int c = wait_key_cv(10);// cvWaitKey(10);
+            if (c == 27 || c == 1048603) break;
+        }
+        else {
+            printf("\rFPS: %.2f \t AVG_FPS = %.2f ", fps, avg_fps);
+        }
+
+        // FPS is derived from the prediction time only, smoothed with an
+        // exponential moving average after the first frame.
+        float curr = 1000.f / frame_time_ms;
+        if (fps == 0) fps = curr;
+        else fps = .9*fps + .1*curr;
+
+        // Recompute the average FPS over windows of at least 3 seconds.
+        float spent_time = (get_time_point() - start_time) / 1000000;
+        if (spent_time >= 3.0f) {
+            //printf(" spent_time = %f \n", spent_time);
+            avg_fps = frame_counter / spent_time;
+            frame_counter = 0;
+            start_time = get_time_point();
+        }
+    }
+
+    // Reached on key press or end of stream (never in benchmark mode).
+    free(indexes);
+    if(names) free(names);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+#endif
+}
+
+
+/*
+ * Command-line entry point for the "classifier" sub-command.
+ *
+ * argv[2] selects the mode (predict, try, train, demo, gun, threat, test,
+ * label, valid, validmulti, valid10, validcrop, validfull); argv[3] is the
+ * data file and argv[4] the network cfg.  Optional positional arguments
+ * follow: argv[5] weights, argv[6] filename, argv[7] layer index.
+ * Flags (-gpus, -c, -t, -clear, -benchmark, ...) are read with the
+ * find_*_arg helpers.  An unrecognised mode does nothing.
+ */
+void run_classifier(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    int mjpeg_port = find_int_arg(argc, argv, "-mjpeg_port", -1);
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int gpu = 0;
+    int ngpus = 0;
+    int gpus_allocated = 0;   // set only when `gpus` points at heap memory
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+
+        // One GPU id per comma-separated field.
+        const char *cursor;
+        ngpus = 1;
+        for(cursor = gpu_list; *cursor; ++cursor){
+            if (*cursor == ',') ++ngpus;
+        }
+        gpus = (int*)xcalloc(ngpus, sizeof(int));
+        gpus_allocated = 1;
+
+        // Walk a separate cursor so gpu_list itself stays valid, and stop at
+        // the last field instead of computing NULL + 1.
+        cursor = gpu_list;
+        int i;
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(cursor);
+            const char *comma = strchr(cursor, ',');
+            if (!comma) break;
+            cursor = comma + 1;
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+
+    int dont_show = find_arg(argc, argv, "-dont_show");
+    int benchmark = find_arg(argc, argv, "-benchmark");
+    int benchmark_layers = find_arg(argc, argv, "-benchmark_layers");
+    if (benchmark_layers) benchmark = 1;   // per-layer benchmarking implies benchmark mode
+    int dontuse_opencv = find_arg(argc, argv, "-dontuse_opencv");
+    int show_imgs = find_arg(argc, argv, "-show_imgs");
+    int calc_topk = find_arg(argc, argv, "-topk");
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int top = find_int_arg(argc, argv, "-t", 0);
+    int clear = find_arg(argc, argv, "-clear");
+    char *data = argv[3];
+    char *cfg = argv[4];
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    char *layer_s = (argc > 7) ? argv[7]: 0;
+    int layer = layer_s ? atoi(layer_s) : -1;
+    char* chart_path = find_char_arg(argc, argv, "-chart", 0);
+    if(0==strcmp(argv[2], "predict")) predict_classifier(data, cfg, weights, filename, top);
+    else if(0==strcmp(argv[2], "try")){
+        // The layer index is mandatory here; atoi(NULL) would crash.
+        if (layer_s) try_classifier(data, cfg, weights, filename, layer);
+        else fprintf(stderr, "classifier try: missing layer index argument\n");
+    }
+    else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights, gpus, ngpus, clear, dontuse_opencv, dont_show, mjpeg_port, calc_topk, show_imgs, chart_path);
+    else if(0==strcmp(argv[2], "demo")) demo_classifier(data, cfg, weights, cam_index, filename, benchmark, benchmark_layers);
+    else if(0==strcmp(argv[2], "gun")) gun_classifier(data, cfg, weights, cam_index, filename);
+    else if(0==strcmp(argv[2], "threat")) threat_classifier(data, cfg, weights, cam_index, filename);
+    else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
+    else if(0==strcmp(argv[2], "label")) label_classifier(data, cfg, weights);
+    else if(0==strcmp(argv[2], "valid")) validate_classifier_single(data, cfg, weights, NULL, -1);
+    else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
+    else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights);
+    else if(0==strcmp(argv[2], "validcrop")) validate_classifier_crop(data, cfg, weights);
+    else if(0==strcmp(argv[2], "validfull")) validate_classifier_full(data, cfg, weights);
+
+    // Release the id array whenever it came from xcalloc (including the
+    // single-id case); never free the address of the local `gpu`.
+    if (gpus_allocated) free(gpus);
+}
diff --git a/darknet-master/src/classifier.h b/darknet-master/src/classifier.h
new file mode 100644
index 0000000..d94417d
--- /dev/null
+++ b/darknet-master/src/classifier.h
@@ -0,0 +1,12 @@
+#ifndef CLASSIFIER_H
+#define CLASSIFIER_H
+
+#include "list.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Prototype only; the implementation is not part of this header.
+ * Declared with C linkage so that C++ translation units can call it.
+ * NOTE(review): presumably loads the data configuration file `filename`
+ * into a `list` -- confirm against the definition.
+ */
+list *read_data_cfg(char *filename);
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/coco.c b/darknet-master/src/coco.c
new file mode 100644
index 0000000..8ad1383
--- /dev/null
+++ b/darknet-master/src/coco.c
@@ -0,0 +1,417 @@
+#include <stdio.h>
+
+#include "network.h"
+#include "detection_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+#include "demo.h"
+
+/*
+ * Display names of the 80 COCO categories, indexed by class index.
+ * Passed (together with the count 80) to draw_detections() and demo().
+ */
+char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};
+
+/*
+ * coco_ids[j] is the value print_cocos() writes as "category_id" for class
+ * index j.  The ids are not contiguous (for example 12 and 26 are absent),
+ * so a class index must be translated through this table.
+ */
+int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
+
+/*
+ * Train the network described by `cfgfile` on the images listed in
+ * data/coco.trainval.txt.
+ *
+ * cfgfile    - network configuration file.
+ * weightfile - optional initial weights (may be NULL).
+ *
+ * Batches are prefetched on a loader thread while the previous batch is being
+ * trained on.  Weights are written to backup/: a numbered snapshot every 1000
+ * iterations (every 100 during the first 1000), a rolling ".backup" every 100
+ * iterations, and "<base>_final.weights" when max_batches is reached.
+ */
+void train_coco(char *cfgfile, char *weightfile)
+{
+    char *train_images = "data/coco.trainval.txt";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+    // Detection parameters come from the last layer of the network.
+    layer l = net.layers[net.n - 1];
+
+    int side = l.side;
+    int classes = l.classes;
+    float jitter = l.jitter;
+
+    list *plist = get_paths(train_images);
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.classes = classes;
+    args.jitter = jitter;
+    args.num_boxes = side;
+    args.d = &buffer;
+    args.type = REGION_DATA;
+
+    args.angle = net.angle;
+    args.exposure = net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    char buff[256];
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        // Take ownership of the prefetched batch, then start loading the next one.
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        if (avg_loss < 0) avg_loss = loss;   // first iteration seeds the running average
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0 || (i < 1000 && i%100 == 0)){
+            // snprintf: `base` comes from the cfg path and is not length-limited.
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%100==0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // Exactly one prefetch is still in flight and it writes into the local
+    // `buffer`; wait for it before this stack frame disappears, then drop
+    // the batch it produced.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free(base);
+}
+
+/*
+ * Append detections for one image to `fp` as JSON records.
+ *
+ * Each box is given as centre (x, y) plus width/height; it is converted to
+ * corner form, clipped to [0, w] x [0, h], and emitted as
+ * [left, top, width, height].  One record is written per (box, class) pair
+ * whose probability is non-zero; the class index is translated through
+ * coco_ids[].  Every record ends with ",\n".
+ */
+void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
+{
+    int b, c;
+    for (b = 0; b < num_boxes; ++b) {
+        const float *scores = probs[b];
+
+        float left   = boxes[b].x - boxes[b].w/2.;
+        float right  = boxes[b].x + boxes[b].w/2.;
+        float top    = boxes[b].y - boxes[b].h/2.;
+        float bottom = boxes[b].y + boxes[b].h/2.;
+
+        /* Clip to the image rectangle. */
+        left   = (left < 0)   ? 0 : left;
+        top    = (top < 0)    ? 0 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+
+        float box_w = right - left;
+        float box_h = bottom - top;
+
+        for (c = 0; c < classes; ++c) {
+            if (!scores[c]) continue;
+            fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[c], left, top, box_w, box_h, scores[c]);
+        }
+    }
+}
+
+/*
+ * Extract the numeric image id from a COCO image path.
+ *
+ * The id is the decimal number that follows the last '_' in `filename`
+ * (e.g. ".../COCO_val2014_000000000042.jpg" -> 42).  If the path contains no
+ * '_', the number is read from the start of the file name instead (after the
+ * last '/'), rather than dereferencing a NULL pointer.
+ * Returns 0 for a NULL path or when no digits are found (atoi semantics).
+ */
+int get_coco_image_id(char *filename)
+{
+    if (!filename) return 0;
+    char *p = strrchr(filename, '_');
+    if (!p) p = strrchr(filename, '/');
+    return atoi(p ? p + 1 : filename);
+}
+
+/*
+ * Run the detector over every image listed in data/coco_val_5k.list and write
+ * the detections to results/coco_results.json as a JSON array.
+ *
+ * cfgfile    - network configuration file.
+ * weightfile - optional weights (may be NULL).
+ *
+ * Images are loaded by up to 8 loader threads one group ahead of prediction.
+ * If the result file cannot be opened the function reports it on stderr and
+ * returns without running the network.
+ */
+void validate_coco(char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    srand(time(0));
+
+    char *base = "results/";
+    list *plist = get_paths("data/coco_val_5k.list");
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net.layers[net.n-1];
+    int classes = l.classes;
+    int side = l.side;
+    int total_boxes = side*side*l.n;
+
+    int j;
+    char buff[1024];
+    snprintf(buff, 1024, "%s/coco_results.json", base);
+    FILE *fp = fopen(buff, "w");
+    if (!fp) {
+        // Typically the results/ directory does not exist.
+        fprintf(stderr, "Couldn't open file: %s\n", buff);
+        return;
+    }
+    fprintf(fp, "[\n");
+    // Remember where the records start so we can tell later whether any were written.
+    long records_start = ftell(fp);
+
+    box* boxes = (box*)xcalloc(total_boxes, sizeof(box));
+    float** probs = (float**)xcalloc(total_boxes, sizeof(float*));
+    for(j = 0; j < total_boxes; ++j) probs[j] = (float*)xcalloc(classes, sizeof(float));
+
+    int m = plist->size;
+    int i=0;
+    int t;
+
+    float thresh = .01;
+    int nms = 1;
+    float iou_thresh = .5;
+
+    int nthreads = 8;
+    image* val = (image*)xcalloc(nthreads, sizeof(image));
+    image* val_resized = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf_resized = (image*)xcalloc(nthreads, sizeof(image));
+    pthread_t* thr = (pthread_t*)xcalloc(nthreads, sizeof(pthread_t));
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.type = IMAGE_DATA;
+
+    // Prime the pipeline with the first group; never index past the end of
+    // paths[] when the list holds fewer than nthreads images.
+    for(t = 0; t < nthreads && t < m; ++t){
+        args.path = paths[i+t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    time_t start = time(0);
+    for(i = nthreads; i < m+nthreads; i += nthreads){
+        fprintf(stderr, "%d\n", i);
+        // Collect the group that was loading (images i-nthreads .. i-1).
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        // Start loading the next group while this one is being evaluated.
+        for(t = 0; t < nthreads && i+t < m; ++t){
+            args.path = paths[i+t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            char *path = paths[i+t-nthreads];
+            int image_id = get_coco_image_id(path);
+            float *X = val_resized[t].data;
+            network_predict(net, X);
+            int w = val[t].w;
+            int h = val[t].h;
+            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
+            if (nms) do_nms_sort_v2(boxes, probs, total_boxes, classes, iou_thresh);
+            print_cocos(fp, image_id, boxes, probs, total_boxes, classes, w, h);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+    // Every record ends with ",\n"; step back over it so the array closes
+    // cleanly.  Skip this when no record was written, otherwise the seek
+    // would land on the opening '[' and corrupt the file.
+    if (ftell(fp) > records_start) {
+#ifdef WIN32
+        fseek(fp, -3, SEEK_CUR);
+#else
+        fseek(fp, -2, SEEK_CUR);
+#endif
+    }
+    fprintf(fp, "\n]\n");
+    fclose(fp);
+
+    if (val) free(val);
+    if (val_resized) free(val_resized);
+    if (buf) free(buf);
+    if (buf_resized) free(buf_resized);
+    if (thr) free(thr);
+
+    free(boxes);
+    for(j = 0; j < total_boxes; ++j) free(probs[j]);
+    free(probs);
+
+    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
+}
+
+/*
+ * Measure proposal recall of the detector on the images listed in
+ * data/voc/test/2007_test.txt.
+ *
+ * cfgfile    - network configuration file.
+ * weightfile - optional weights (may be NULL).
+ *
+ * For every image the ground-truth boxes are read from the matching label
+ * file.  A truth box counts as recalled when some predicted box whose first
+ * probability exceeds `thresh` overlaps it with IOU > 0.5.  A running summary
+ * (index, correct, total, proposals per image, mean best IOU, recall) is
+ * printed to stderr after each image.
+ */
+void validate_coco_recall(char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    srand(time(0));
+
+    list* plist = get_paths("data/voc/test/2007_test.txt");
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net.layers[net.n-1];
+    int classes = l.classes;
+    int side = l.side;
+    int total_boxes = side*side*l.n;
+
+    int j, k;
+    box* boxes = (box*)xcalloc(total_boxes, sizeof(box));
+    float** probs = (float**)xcalloc(total_boxes, sizeof(float*));
+    for(j = 0; j < total_boxes; ++j) {
+      probs[j] = (float*)xcalloc(classes, sizeof(float));
+    }
+
+    int m = plist->size;
+    int i=0;
+
+    float thresh = .001;
+    int nms = 0;
+    float iou_thresh = .5;
+    float nms_thresh = .5;
+
+    int total = 0;
+    int correct = 0;
+    int proposals = 0;
+    float avg_iou = 0;
+
+    for(i = 0; i < m; ++i){
+        char *path = paths[i];
+        image orig = load_image_color(path, 0, 0);
+        image sized = resize_image(orig, net.w, net.h);
+        char *id = basecfg(path);
+        network_predict(net, sized.data);
+        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1);
+        if (nms) do_nms(boxes, probs, total_boxes, 1, nms_thresh);
+
+        char labelpath[4096];
+        replace_image_to_label(path, labelpath);
+
+        int num_labels = 0;
+        box_label *truth = read_boxes(labelpath, &num_labels);
+        // Every box above the threshold counts as a proposal.
+        for(k = 0; k < total_boxes; ++k){
+            if(probs[k][0] > thresh){
+                ++proposals;
+            }
+        }
+        // Match each truth box with its best-overlapping proposal.
+        for (j = 0; j < num_labels; ++j) {
+            ++total;
+            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
+            float best_iou = 0;
+            for(k = 0; k < total_boxes; ++k){
+                float iou = box_iou(boxes[k], t);
+                if(probs[k][0] > thresh && iou > best_iou){
+                    best_iou = iou;
+                }
+            }
+            avg_iou += best_iou;
+            if(best_iou > iou_thresh){
+                ++correct;
+            }
+        }
+
+        // Until the first truth box has been seen `total` is 0; report 0
+        // instead of dividing by zero.
+        float mean_iou_pct = total ? avg_iou*100/total : 0;
+        double recall_pct = total ? 100.*correct/total : 0;
+        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), mean_iou_pct, recall_pct);
+
+        if (id) free(id);
+        free(truth);
+        free_image(orig);
+        free_image(sized);
+    }
+    free(boxes);
+    for(j = 0; j < total_boxes; ++j) {
+        free(probs[j]);
+    }
+    free(probs);
+}
+
+/*
+ * Run the detector on single images and display/save the annotated result.
+ *
+ * cfgfile    - network configuration file.
+ * weightfile - optional weights (may be NULL).
+ * filename   - image to process; when NULL, image paths are read from stdin
+ *              one per line until end of input.
+ * thresh     - detection threshold passed to get_detection_boxes() and
+ *              draw_detections().
+ *
+ * Each annotated image is saved as "prediction" and shown in a window until a
+ * key is pressed.
+ */
+void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh)
+{
+    image **alphabet = load_alphabet();
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    detection_layer l = net.layers[net.n-1];
+    set_batch_network(&net, 1);
+    srand(2222222);
+    float nms = .4;
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    int j;
+    box* boxes = (box*)xcalloc(l.side * l.side * l.n, sizeof(box));
+    float** probs = (float**)xcalloc(l.side * l.side * l.n, sizeof(float*));
+    for(j = 0; j < l.side*l.side*l.n; ++j) {
+      probs[j] = (float*)xcalloc(l.classes, sizeof(float));
+    }
+    while(1){
+        if(filename){
+            // strncpy does not terminate on truncation; do it explicitly.
+            strncpy(buff, filename, sizeof(buff) - 1);
+            buff[sizeof(buff) - 1] = '\0';
+            input = buff;
+        } else {
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(buff, sizeof(buff), stdin);
+            if(!input) break;
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input,0,0);
+        image sized = resize_image(im, net.w, net.h);
+        float *X = sized.data;
+        time=clock();
+        network_predict(net, X);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
+        if (nms) do_nms_sort_v2(boxes, probs, l.side*l.side*l.n, l.classes, nms);
+        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, coco_classes, alphabet, 80);
+        save_image(im, "prediction");
+        show_image(im, "predictions");
+        free_image(im);
+        free_image(sized);
+        wait_until_press_key_cv();
+        destroy_all_windows_cv();
+        if (filename) break;
+    }
+    // The alphabet is shared by every iteration of the interactive loop, so
+    // it must only be released once the loop is finished.
+    free_alphabet(alphabet);
+    free(boxes);
+    for(j = 0; j < l.side*l.side*l.n; ++j) {
+        free(probs[j]);
+    }
+    free(probs);
+}
+
+/*
+ * Command-line entry point for the "coco" sub-command.
+ *
+ * argv[2] selects the mode (test, train, valid, recall, demo), argv[3] is the
+ * cfg file, argv[4] the optional weights and argv[5] an optional input file.
+ * Option flags are read first; if fewer than four arguments are present a
+ * usage line is printed and nothing is run.  Unknown modes do nothing.
+ */
+void run_coco(int argc, char **argv)
+{
+    int dont_show = find_arg(argc, argv, "-dont_show");
+    int mjpeg_port = find_int_arg(argc, argv, "-mjpeg_port", -1);
+    int json_port = find_int_arg(argc, argv, "-json_port", -1);
+    char *out_filename = find_char_arg(argc, argv, "-out_filename", 0);
+    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    float thresh = find_float_arg(argc, argv, "-thresh", .2);
+    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int frame_skip = find_int_arg(argc, argv, "-s", 0);
+    int ext_output = find_arg(argc, argv, "-ext_output");
+    char *json_file_output = find_char_arg(argc, argv, "-json_file_output", 0);
+
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    char *filename = 0;
+    if (argc > 4) weights = argv[4];
+    if (argc > 5) filename = argv[5];
+
+    if (!strcmp(mode, "test")) {
+        test_coco(cfg, weights, filename, thresh);
+        return;
+    }
+    if (!strcmp(mode, "train")) {
+        train_coco(cfg, weights);
+        return;
+    }
+    if (!strcmp(mode, "valid")) {
+        validate_coco(cfg, weights);
+        return;
+    }
+    if (!strcmp(mode, "recall")) {
+        validate_coco_recall(cfg, weights);
+        return;
+    }
+    if (!strcmp(mode, "demo")) {
+        demo(cfg, weights, thresh, hier_thresh, cam_index, filename, coco_classes, 80, 1, frame_skip,
+            prefix, out_filename, mjpeg_port, 0, json_port, dont_show, ext_output, 0, 0, 0, 0, 0, json_file_output);
+    }
+}
diff --git a/darknet-master/src/col2im.c b/darknet-master/src/col2im.c
new file mode 100644
index 0000000..10138a0
--- /dev/null
+++ b/darknet-master/src/col2im.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include "col2im.h"
+/*
+ * Add `val` to one pixel of a channel-major image buffer.
+ *
+ * (row, col) are coordinates in the padded image; `pad` is subtracted to get
+ * the coordinates in `im`.  Positions that fall in the padding (outside
+ * [0, height) x [0, width)) are ignored.  `channels` is not used.
+ */
+void col2im_add_pixel(float *im, int height, int width, int channels,
+                        int row, int col, int channel, int pad, float val)
+{
+    const int y = row - pad;
+    const int x = col - pad;
+    const int inside = (y >= 0) && (x >= 0) && (y < height) && (x < width);
+
+    if (inside) {
+        im[x + width*(y + height*channel)] += val;
+    }
+}
+/*
+ * Scatter a column buffer back into an image (square kernel, single stride
+ * and padding value), accumulating overlapping contributions.
+ *
+ * data_col is read sequentially as channels*ksize*ksize rows of
+ * height_col*width_col values.  Each value is added to data_im through
+ * col2im_add_pixel(), which drops positions that fall in the padding.
+ * data_im is not cleared here.
+ */
+void col2im_cpu(float* data_col,
+         int channels,  int height,  int width,
+         int ksize,  int stride, int pad, float* data_im)
+{
+    const int out_h = (height + 2*pad - ksize) / stride + 1;
+    const int out_w = (width + 2*pad - ksize) / stride + 1;
+    const int col_rows = channels * ksize * ksize;
+    const float *src = data_col;   /* entries are visited in storage order */
+    int r, oy, ox;
+
+    for (r = 0; r < col_rows; ++r) {
+        /* Decompose the row index into (image channel, kernel y, kernel x). */
+        const int kx = r % ksize;
+        const int ky = (r / ksize) % ksize;
+        const int ch = r / ksize / ksize;
+
+        for (oy = 0; oy < out_h; ++oy) {
+            const int y = ky + oy * stride;
+            for (ox = 0; ox < out_w; ++ox) {
+                const int x = kx + ox * stride;
+                col2im_add_pixel(data_im, height, width, channels,
+                        y, x, ch, pad, *src);
+                ++src;
+            }
+        }
+    }
+}
+// ----------------------------------------
+/*
+ * Fill the first N elements of Y with alpha.
+ * A zero alpha is handled with a single memset.
+ */
+void caffe_set(const int N, const float alpha, float* Y) {
+    int k = 0;
+
+    if (alpha != 0) {
+        while (k < N) {
+            Y[k] = alpha;
+            ++k;
+        }
+        return;
+    }
+    memset(Y, 0, sizeof(float) * N);  // NOLINT(caffe/alt_fn)
+}
+
+/*
+ * Range test done with one unsigned comparison: a negative `a` converts to a
+ * large unsigned value, so for non-negative `b` the result is 1 exactly when
+ * 0 <= a < b.
+ */
+static inline int is_a_ge_zero_and_a_lt_b(int a, int b) {
+    const unsigned ua = (unsigned)a;
+    const unsigned ub = (unsigned)b;
+    return ua < ub;
+}
+
+// https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp
+/*
+ * Scatter a column buffer back into an image, with independent kernel size,
+ * padding, stride and dilation per axis.
+ *
+ * data_im (channels * height * width floats) is zeroed first and then every
+ * column value whose source position lies inside the image is added to it.
+ * data_col is consumed strictly sequentially, in the order
+ * channel -> kernel_row -> kernel_col -> output row -> output column,
+ * so the statement order below must not be changed.
+ */
+void col2im_cpu_ext(const float* data_col, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_im)
+{
+    caffe_set(height * width * channels, 0.0F, data_im);
+    // Number of kernel positions along each axis.
+    const int output_h = (height + 2 * pad_h -
+        (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    const int output_w = (width + 2 * pad_w -
+        (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+    const int channel_size = height * width;
+    int channel, kernel_row, kernel_col, output_rows, output_col;
+    // data_im advances by one channel plane per outer iteration.
+    for (channel = channels; channel--; data_im += channel_size) {
+        for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+            for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+                int input_row = -pad_h + kernel_row * dilation_h;
+                for (output_rows = output_h; output_rows; output_rows--) {
+                    if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
+                        // Whole row lies in the padding: skip its column entries.
+                        data_col += output_w;
+                    }
+                    else {
+                        int input_col = -pad_w + kernel_col * dilation_w;
+                        for (output_col = output_w; output_col; output_col--) {
+                            if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
+                                data_im[input_row * width + input_col] += *data_col;
+                            }
+                            // Consume the entry even when it falls in the padding.
+                            data_col++;
+                            input_col += stride_w;
+                        }
+                    }
+                    input_row += stride_h;
+                }
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/col2im.h b/darknet-master/src/col2im.h
new file mode 100644
index 0000000..984f7c4
--- /dev/null
+++ b/darknet-master/src/col2im.h
@@ -0,0 +1,33 @@
+#ifndef COL2IM_H
+#define COL2IM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * CPU col2im for a square kernel with one stride/pad value.
+ * Values from data_col are ADDED to data_im; data_im is not cleared first.
+ */
+void col2im_cpu(float* data_col,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_im);
+
+/*
+ * CPU col2im with per-axis kernel size, padding, stride and dilation.
+ * data_im is zeroed before the column values are accumulated into it.
+ */
+void col2im_cpu_ext(const float* data_col, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_im);
+
+#ifdef GPU
+/*
+ * GPU counterpart of col2im_cpu (square kernel).
+ * The per-pixel sum is ADDED to the existing contents of data_im.
+ */
+void col2im_ongpu(float *data_col,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float *data_im);
+
+
+/*
+ * GPU counterpart of col2im_cpu_ext (per-axis parameters and dilation).
+ * Every element of data_im is OVERWRITTEN with its per-pixel sum.
+ */
+void col2im_gpu_ext(const float* data_col, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, const int dilation_h, const int dilation_w,
+    float* data_im);
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/col2im_kernels.cu b/darknet-master/src/col2im_kernels.cu
new file mode 100644
index 0000000..ae651c4
--- /dev/null
+++ b/darknet-master/src/col2im_kernels.cu
@@ -0,0 +1,136 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "col2im.h"
+#include "dark_cuda.h"
+
+// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
+// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
+
+/*
+ * col2im kernel for a square kernel (ksize) with one stride/pad value.
+ *
+ * n is the number of image elements to process; each loop iteration handles
+ * one image element `index`, sums every column-buffer entry that maps onto
+ * it, and ADDS that sum to data_im[index].  Because each element is written
+ * by exactly one iteration, no atomics are needed.
+ */
+__global__ void col2im_gpu_kernel(const int n, const float* data_col,
+        const int height, const int width, const int ksize,
+        const int pad,
+        const int stride,
+        const int height_col, const int width_col,
+        float *data_im) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    // Stride by the total thread count so any launch size covers all n elements.
+    for(; index < n; index += blockDim.x*gridDim.x){
+        float val = 0;
+        // Coordinates of this element in the padded image.
+        int w = index % width + pad;
+        int h = (index / width) % height + pad;
+        int c = index / (width * height);
+        // compute the start and end of the output
+        int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+        int w_col_end = min(w / stride + 1, width_col);
+        int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+        int h_col_end = min(h / stride + 1, height_col);
+        // equivalent implementation
+        // The column index
+        //   ((c*ksize*ksize + kh*ksize + kw)*height_col + h_col)*width_col + w_col
+        // with kh = h - h_col*stride and kw = w - w_col*stride is linear in
+        // h_col and w_col; offset and the two coefficients are that expansion.
+        int offset =
+            (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+        int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+        int coeff_w_col = (1 - stride * height_col * width_col);
+        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+            }
+        }
+        // Accumulates into the existing contents of data_im.
+        data_im[index] += val;
+    }
+}
+
+/*
+ * Host wrapper that launches col2im_gpu_kernel on the stream returned by
+ * get_cuda_stream().  One unit of work is scheduled per image element
+ * (channels * height * width), in blocks of BLOCK threads, and the launch is
+ * checked with cudaPeekAtLastError().
+ */
+void col2im_ongpu(float *data_col,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float *data_im){
+    const int out_h = (height + 2 * pad - ksize) / stride + 1;
+    const int out_w = (width + 2 * pad - ksize) / stride + 1;
+    const int work_items = channels * height * width;
+    const int grid = (work_items + BLOCK - 1) / BLOCK;
+
+    col2im_gpu_kernel<<<grid, BLOCK, 0, get_cuda_stream() >>>(
+            work_items, data_col, height, width, ksize, pad,
+            stride, out_h, out_w, data_im);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// -----------------------------------------
+
+// CUDA: threads per block used by the *_ext launches.
+const int CAFFE_CUDA_NUM_THREADS = 512;
+
+// CUDA: number of blocks needed so that blocks * CAFFE_CUDA_NUM_THREADS >= N
+// (rounds the division up).
+inline int CAFFE_GET_BLOCKS(const int N) {
+    const int per_block = CAFFE_CUDA_NUM_THREADS;
+    return (N + per_block - 1) / per_block;
+}
+
+// CUDA: grid stride looping
+// Expands to a for-header: `i` starts at this thread's global index and
+// advances by the total number of launched threads until it reaches `n`,
+// so all n items are covered whatever the launch size.  Use as
+//   CUDA_KERNEL_LOOP(i, n) { ... }
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
+/*
+ * col2im kernel with per-axis kernel size, padding, stride and dilation.
+ *
+ * n is the number of image elements to process.  For each element `index`
+ * the kernel sums every column-buffer entry that maps onto it and then
+ * OVERWRITES data_im[index] with that sum (unlike col2im_gpu_kernel, which
+ * accumulates).  `channels` is not used in the body.
+ */
+__global__ void col2im_gpu_kernel_ext(const int n, const float* data_col,
+    const int height, const int width, const int channels,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int height_col, const int width_col,
+    float* data_im) {
+    CUDA_KERNEL_LOOP(index, n) {
+        float val = 0;
+        // Coordinates of this element in the padded image.
+        const int w_im = index % width + pad_w;
+        const int h_im = (index / width) % height + pad_h;
+        const int c_im = index / (width * height);
+        // Span covered by a dilated kernel along each axis.
+        int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+        int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
+        // compute the start and end of the output
+        const int w_col_start =
+            (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
+        const int w_col_end = min(w_im / stride_w + 1, width_col);
+        const int h_col_start =
+            (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
+        const int h_col_end = min(h_im / stride_h + 1, height_col);
+        // TODO: use LCM of stride and dilation to avoid unnecessary loops
+        for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
+            for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
+                // Offset of this element inside the kernel window at (h_col, w_col).
+                int h_k = (h_im - h_col * stride_h);
+                int w_k = (w_im - w_col * stride_w);
+                // Only offsets that land on a dilated tap contribute.
+                if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
+                    h_k /= dilation_h;
+                    w_k /= dilation_w;
+                    int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
+                        height_col + h_col) * width_col + w_col;
+                    val += data_col[data_col_index];
+                }
+            }
+        }
+        // Overwrites: previous contents of data_im are discarded.
+        data_im[index] = val;
+    }
+}
+
+/*
+ * Host wrapper that launches col2im_gpu_kernel_ext.
+ *
+ * One unit of work is scheduled per image element
+ * (channels * height * width); each element of data_im is overwritten with
+ * the sum of the column entries that map onto it.  The launch is issued on
+ * the stream returned by get_cuda_stream(), the same stream col2im_ongpu()
+ * uses, so it is ordered with the other kernels queued there.  The launch
+ * status is checked with cudaPeekAtLastError().
+ */
+void col2im_gpu_ext(const float* data_col, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, const int dilation_h, const int dilation_w,
+    float* data_im)
+{
+    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) /
+        stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) /
+        stride_w + 1;
+    int num_kernels = channels * height * width;
+    // To avoid involving atomic operations, we will launch one kernel per
+    // bottom dimension, and then in the kernel add up the top dimensions.
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    col2im_gpu_kernel_ext<<<CAFFE_GET_BLOCKS(num_kernels),
+        CAFFE_CUDA_NUM_THREADS, 0, get_cuda_stream() >>>(
+            num_kernels, data_col, height, width, channels, kernel_h, kernel_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+            height_col, width_col, data_im);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/compare.c b/darknet-master/src/compare.c
new file mode 100644
index 0000000..4bacda4
--- /dev/null
+++ b/darknet-master/src/compare.c
@@ -0,0 +1,352 @@
+#include "network.h"
+#include "detection_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+
+#include <stdio.h>
+
+/*
+ * Trains the comparison network described by cfgfile, optionally starting
+ * from the weights in weightfile (may be NULL).
+ *
+ * Batches of 1024 COMPARE_DATA samples drawn from data/compare.train.list are
+ * loaded on a background thread while the previous batch is trained on.
+ * Weights are written under backup/ every 100 iterations and whenever the
+ * epoch counter advances; on epochs divisible by 22 the learning rate is
+ * multiplied by 0.1.  The training loop has no exit condition.
+ */
+void train_compare(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    char* backup_directory = "backup/";
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1024;
+    list *plist = get_paths("data/compare.train.list");
+    char **paths = (char **)list_to_array(plist);
+    int N = plist->size;
+    printf("%d\n", N);
+    clock_t time;
+    pthread_t load_thread;
+    data train;
+    data buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.classes = 20;
+    args.n = imgs;
+    args.m = N;
+    args.d = &buffer;
+    args.type = COMPARE_DATA;
+
+    // Start loading the first batch before entering the loop.
+    load_thread = load_data_in_thread(args);
+    int epoch = *net.seen/N;
+    int i = 0;
+    while(1){
+        ++i;
+        time=clock();
+        // Take ownership of the batch that was loading, then immediately
+        // kick off the next load so it overlaps with training.
+        pthread_join(load_thread, 0);
+        train = buffer;
+
+        load_thread = load_data_in_thread(args);
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        float loss = train_network(net, train);
+        // -1 marks "no loss seen yet"; afterwards keep a 0.9/0.1 running mean.
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%.3f: %f, %f avg, %lf seconds, %" PRIu64 " images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
+        free_data(train);
+        if(i%100 == 0){
+            char buff[256];
+            // Bounded write: `base` comes from the cfg path and may be long.
+            snprintf(buff, sizeof(buff), "%s/%s_%d_minor_%d.weights",backup_directory,base, epoch, i);
+            save_weights(net, buff);
+        }
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            i = 0;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+            if(epoch%22 == 0) net.learning_rate *= .1;
+        }
+    }
+    // Not reached: the loop above never breaks.  Kept to document the
+    // intended teardown order should an exit condition be added.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+/*
+ * Evaluates the comparison network (cfg `filename`, optional `weightfile`)
+ * on data/compare.val.list.
+ *
+ * The validation set is processed in 10 splits; the next split is loaded on
+ * a background thread while the current one is scored.  For every class
+ * whose two label values differ, the prediction counts as correct when it
+ * orders the pair the same way as the labels.  Running accuracy is printed
+ * after each split.
+ */
+void validate_compare(char *filename, char *weightfile)
+{
+    int i = 0;
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *plist = get_paths("data/compare.val.list");
+    //list *plist = get_paths("data/compare.val.old");
+    char **paths = (char **)list_to_array(plist);
+    // Keep the real path count: it is needed to release `paths` at the end,
+    // after the list itself has been freed.
+    int path_count = plist->size;
+    int N = path_count/2;
+    free_list(plist);
+
+    clock_t time;
+    int correct = 0;
+    int total = 0;
+    int splits = 10;
+    int classes = 20;
+    int num = (i+1)*N/splits - i*N/splits;
+
+    data val, buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.classes = classes;
+    args.n = num;
+    args.m = 0;
+    args.d = &buffer;
+    args.type = COMPARE_DATA;
+
+    // Split 0 is requested here; split i is requested inside iteration i.
+    pthread_t load_thread = load_data_in_thread(args);
+    for(i = 1; i <= splits; ++i){
+        time=clock();
+
+        pthread_join(load_thread, 0);
+        val = buffer;
+
+        num = (i+1)*N/splits - i*N/splits;
+        // NOTE(review): the offset is expressed in units of N (= paths/2);
+        // confirm it matches how the COMPARE_DATA loader walks `paths`.
+        char **part = paths+(i*N/splits);
+        if(i != splits){
+            args.paths = part;
+            // Request exactly this split's size; previously the size of
+            // split 0 was reused for every split.
+            args.n = num;
+            load_thread = load_data_in_thread(args);
+        }
+        printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
+
+        time=clock();
+        matrix pred = network_predict_data(net, val);
+        int j,k;
+        for(j = 0; j < val.y.rows; ++j){
+            for(k = 0; k < classes; ++k){
+                // Ties in the labels carry no ordering and are skipped.
+                if(val.y.vals[j][k*2] != val.y.vals[j][k*2+1]){
+                    ++total;
+                    if((val.y.vals[j][k*2] < val.y.vals[j][k*2+1]) == (pred.vals[j][k*2] < pred.vals[j][k*2+1])){
+                        ++correct;
+                    }
+                }
+            }
+        }
+        free_matrix(pred);
+        printf("%d: Acc: %f, %lf seconds, %d images\n", i, (float)correct/total, sec(clock()-time), val.X.rows);
+        free_data(val);
+    }
+
+    // The last loader thread was joined in the final iteration, so nothing
+    // references these any more.
+    free_ptrs((void**)paths, path_count);
+    free_network(net);
+}
+
+typedef struct { // One image entry that the comparison network ranks against other entries.
+    network net; // network used for comparisons (copied in by SortMaster3000 / BattleRoyaleWithCheese)
+    char *filename; // image path handed to load_image_color
+    int class_id; // class whose prediction pair bbox_comparator inspects
+    int classes; // class count recorded by BattleRoyaleWithCheese (elos is allocated with this many entries there)
+    float elo; // single rating; set to 1500 by SortMaster3000 and not read elsewhere in this file
+    float *elos; // per-class Elo ratings, indexed by class id
+} sortable_bbox;
+// File-level state shared by the comparators below.
+int total_compares = 0; // incremented by bbox_comparator and bbox_fight on every comparison
+int current_class_id = 0; // class index elo_comparator sorts by; set before each qsort call
+
+/*
+ * qsort comparator: orders sortable_bbox entries by descending Elo rating
+ * for the class selected through the global current_class_id.
+ * Returns 0 on equal ratings, -1 when `a` rates higher, 1 otherwise.
+ */
+int elo_comparator(const void*a, const void *b)
+{
+    const sortable_bbox *lhs = (const sortable_bbox*)a;
+    const sortable_bbox *rhs = (const sortable_bbox*)b;
+    const float lhs_elo = lhs->elos[current_class_id];
+    const float rhs_elo = rhs->elos[current_class_id];
+
+    if(lhs_elo == rhs_elo) return 0;
+    return (lhs_elo > rhs_elo) ? -1 : 1;
+}
+
+/*
+ * qsort comparator that asks the network which of two images ranks higher.
+ *
+ * Both images are loaded at the network's input size, placed back to back
+ * in one input buffer and run through the network of the first entry.
+ * Returns 1 when the first prediction of the pair for the first entry's
+ * class_id is the larger one, -1 otherwise (never 0).  Every call bumps the
+ * global total_compares counter.
+ */
+int bbox_comparator(const void *a, const void *b)
+{
+    ++total_compares;
+    const sortable_bbox *first = (const sortable_bbox*)a;
+    const sortable_bbox *second = (const sortable_bbox*)b;
+    network net = first->net;
+    const int class_id = first->class_id;
+
+    image im1 = load_image_color(first->filename, net.w, net.h);
+    image im2 = load_image_color(second->filename, net.w, net.h);
+    const int first_len = im1.w*im1.h*im1.c;
+    const int second_len = im2.w*im2.h*im2.c;
+
+    // NOTE(review): the buffer holds net.w*net.h*net.c floats while both
+    // images are copied into it; presumably net.c covers the channels of
+    // both images - confirm against the network cfg.
+    float* X = (float*)xcalloc(net.w * net.h * net.c, sizeof(float));
+    memcpy(X, im1.data, first_len*sizeof(float));
+    memcpy(X + first_len, im2.data, second_len*sizeof(float));
+    float *predictions = network_predict(net, X);
+
+    free_image(im1);
+    free_image(im2);
+    free(X);
+
+    return (predictions[class_id*2] > predictions[class_id*2+1]) ? 1 : -1;
+}
+
+/*
+ * Applies one Elo update (K = 32) to the ratings of `a` and `b` for
+ * `class_id`.  A non-zero `result` scores the match as a win for `a`,
+ * zero as a win for `b`.  Both expected scores are computed from the
+ * ratings as they were before either one is modified.
+ */
+void bbox_update(sortable_bbox *a, sortable_bbox *b, int class_id, int result)
+{
+    const int k = 32;
+    float *rating_a = &a->elos[class_id];
+    float *rating_b = &b->elos[class_id];
+
+    // Expected scores from the logistic Elo curve (400-point scale).
+    const float expected_a = 1./(1+pow(10, (*rating_b - *rating_a)/400.));
+    const float expected_b = 1./(1+pow(10, (*rating_a - *rating_b)/400.));
+
+    float score_a;
+    float score_b;
+    if(result){
+        score_a = 1;
+        score_b = 0;
+    } else {
+        score_a = 0;
+        score_b = 1;
+    }
+
+    *rating_a += k*(score_a - expected_a);
+    *rating_b += k*(score_b - expected_b);
+}
+
+/*
+ * Runs one network comparison between the images of `a` and `b` and feeds
+ * the outcome into bbox_update.
+ *
+ * With class_id < 0 every class in [0, classes) is updated from the same
+ * prediction; otherwise only the class equal to class_id is.  Increments
+ * the global total_compares once per call.
+ */
+void bbox_fight(network net, sortable_bbox *a, sortable_bbox *b, int classes, int class_id)
+{
+    image im1 = load_image_color(a->filename, net.w, net.h);
+    image im2 = load_image_color(b->filename, net.w, net.h);
+    const int first_len = im1.w*im1.h*im1.c;
+    const int second_len = im2.w*im2.h*im2.c;
+
+    // NOTE(review): buffer sized from net.c while holding both images;
+    // presumably net.c spans the channels of the pair - confirm.
+    float* X = (float*)xcalloc(net.w * net.h * net.c, sizeof(float));
+    memcpy(X, im1.data, first_len*sizeof(float));
+    memcpy(X + first_len, im2.data, second_len*sizeof(float));
+    float *predictions = network_predict(net, X);
+    ++total_compares;
+
+    int cls;
+    for(cls = 0; cls < classes; ++cls){
+        // Skip classes other than the requested one (negative = all).
+        if(class_id >= 0 && class_id != cls) continue;
+        int a_wins = predictions[cls*2] > predictions[cls*2+1];
+        bbox_update(a, b, cls, a_wins);
+    }
+
+    free_image(im1);
+    free_image(im2);
+    free(X);
+}
+
+/*
+ * Sorts the images listed in data/compare.sort.list with qsort, using the
+ * comparison network (cfg `filename`, optional `weightfile`) as the
+ * ordering via bbox_comparator for class 7.  Prints the sorted file names
+ * followed by the number of comparisons and the elapsed time.
+ */
+void SortMaster3000(char *filename, char *weightfile)
+{
+    int i = 0;
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+    set_batch_network(&net, 1);
+
+    list *plist = get_paths("data/compare.sort.list");
+    //list *plist = get_paths("data/compare.val.old");
+    char **paths = (char **)list_to_array(plist);
+    int N = plist->size;
+    free_list(plist);
+    sortable_bbox* boxes = (sortable_bbox*)xcalloc(N, sizeof(sortable_bbox));
+    printf("Sorting %d boxes...\n", N);
+    for(i = 0; i < N; ++i){
+        // Entries borrow the path strings and share a shallow copy of net.
+        boxes[i].filename = paths[i];
+        boxes[i].net = net;
+        boxes[i].class_id = 7;
+        boxes[i].elo = 1500;
+    }
+    clock_t time=clock();
+    qsort(boxes, N, sizeof(sortable_bbox), bbox_comparator);
+    for(i = 0; i < N; ++i){
+        printf("%s\n", boxes[i].filename);
+    }
+    printf("Sorted in %d compares, %f secs\n", total_compares, sec(clock()-time));
+
+    // Release everything allocated above; the boxes only borrowed the path
+    // strings and the network, so each is freed exactly once.
+    free(boxes);
+    free_ptrs((void**)paths, N);
+    free_network(net);
+}
+
+/*
+ * Ranks the images in data/compare.sort.list with an Elo tournament driven
+ * by the comparison network (cfg `filename`, optional `weightfile`).
+ *
+ * Phase 1: four rounds of random pairings that update all 20 class ratings.
+ * Phase 2: per class, the field is sorted by that class's rating, cut to
+ * the top half, and 100 further rounds are played among near neighbours;
+ * during the first 20 of them the field shrinks to 90% (kept even) each
+ * round.  The surviving entries and their ratings are written to
+ * results/battle_<class>.log.
+ */
+void BattleRoyaleWithCheese(char *filename, char *weightfile)
+{
+    int classes = 20;
+    int i,j;
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+    set_batch_network(&net, 1);
+
+    list *plist = get_paths("data/compare.sort.list");
+    //list *plist = get_paths("data/compare.small.list");
+    //list *plist = get_paths("data/compare.cat.list");
+    //list *plist = get_paths("data/compare.val.old");
+    char **paths = (char **)list_to_array(plist);
+    int N = plist->size;
+    int total = N;
+    free_list(plist);
+    sortable_bbox* boxes = (sortable_bbox*)xcalloc(N, sizeof(sortable_bbox));
+    printf("Battling %d boxes...\n", N);
+    for(i = 0; i < N; ++i){
+        boxes[i].filename = paths[i];
+        boxes[i].net = net;
+        boxes[i].classes = classes;
+        boxes[i].elos = (float*)xcalloc(classes, sizeof(float));
+        // Everyone starts at the same rating in every class.
+        for(j = 0; j < classes; ++j){
+            boxes[i].elos[j] = 1500;
+        }
+    }
+    int round;
+    clock_t time=clock();
+    for(round = 1; round <= 4; ++round){
+        clock_t round_time=clock();
+        printf("Round: %d\n", round);
+        shuffle(boxes, N, sizeof(sortable_bbox));
+        for(i = 0; i < N/2; ++i){
+            // class_id -1: one prediction updates every class rating.
+            bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, -1);
+        }
+        printf("Round: %f secs, %d remaining\n", sec(clock()-round_time), N);
+    }
+
+    int class_id;
+
+    for (class_id = 0; class_id < classes; ++class_id){
+
+        // Restart from the full field, ranked for this class, and keep the
+        // top half.
+        N = total;
+        current_class_id = class_id;
+        qsort(boxes, N, sizeof(sortable_bbox), elo_comparator);
+        N /= 2;
+
+        for(round = 1; round <= 100; ++round){
+            clock_t round_time=clock();
+            printf("Round: %d\n", round);
+
+            sorta_shuffle(boxes, N, sizeof(sortable_bbox), 10);
+            for(i = 0; i < N/2; ++i){
+                bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, class_id);
+            }
+            qsort(boxes, N, sizeof(sortable_bbox), elo_comparator);
+            // Shrink to 90%, rounded down to an even count so pairs stay whole.
+            if(round <= 20) N = (N*9/10)/2*2;
+
+            printf("Round: %f secs, %d remaining\n", sec(clock()-round_time), N);
+        }
+        char buff[256];
+        sprintf(buff, "results/battle_%d.log", class_id);
+        FILE *outfp = fopen(buff, "w");
+        if(!outfp){
+            // E.g. the results/ directory is missing: report and move on
+            // instead of writing through a NULL stream.
+            perror(buff);
+            continue;
+        }
+        for(i = 0; i < N; ++i){
+            fprintf(outfp, "%s %f\n", boxes[i].filename, boxes[i].elos[class_id]);
+        }
+        fclose(outfp);
+    }
+    printf("Tournament in %d compares, %f secs\n", total_compares, sec(clock()-time));
+
+    // Release per-entry ratings, then the shared resources the entries
+    // only borrowed.
+    for(i = 0; i < total; ++i){
+        free(boxes[i].elos);
+    }
+    free(boxes);
+    free_ptrs((void**)paths, total);
+    free_network(net);
+}
+
+/*
+ * Command-line entry point for the "compare" tools.
+ *
+ * argv[2] selects the action (train, valid, sort or battle), argv[3] is the
+ * network cfg and the optional argv[4] is a weights file.  Prints a usage
+ * line to stderr when too few arguments are given or the action is unknown.
+ */
+void run_compare(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/valid/sort/battle] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    //char *filename = (argc > 5) ? argv[5]: 0;
+    if(0==strcmp(argv[2], "train")) train_compare(cfg, weights);
+    else if(0==strcmp(argv[2], "valid")) validate_compare(cfg, weights);
+    else if(0==strcmp(argv[2], "sort")) SortMaster3000(cfg, weights);
+    else if(0==strcmp(argv[2], "battle")) BattleRoyaleWithCheese(cfg, weights);
+    else {
+        // Previously an unrecognised action was ignored without any output.
+        fprintf(stderr, "unknown action '%s'\n", argv[2]);
+        fprintf(stderr, "usage: %s %s [train/valid/sort/battle] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+    }
+}
diff --git a/darknet-master/src/connected_layer.c b/darknet-master/src/connected_layer.c
new file mode 100644
index 0000000..244e82f
--- /dev/null
+++ b/darknet-master/src/connected_layer.c
@@ -0,0 +1,447 @@
+#include "connected_layer.h"
+#include "batchnorm_layer.h"
+#include "convolutional_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Returns the scratch workspace size needed by the connected layer `l`.
+ * With CUDNN enabled this defers to the convolutional layer's computation;
+ * without it no workspace is required and 0 is returned.
+ */
+size_t get_connected_workspace_size(layer l)
+{
+#ifdef CUDNN
+    return get_convolutional_workspace_size(l);
+#else
+    (void)l;    // unused without cuDNN
+    return 0;
+#endif
+}
+
+connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
+{ // Builds a CONNECTED layer: allocates host (and, under GPU, device) buffers, randomly initialises the weights and returns the layer by value.
+    int total_batch = batch*steps; // activation-sized buffers below hold batch*steps samples
+    int i;
+    connected_layer l = { (LAYER_TYPE)0 };
+    l.type = CONNECTED;
+    // Geometry: the layer is described as a 1x1 map with `inputs` channels in and `outputs` channels out.
+    l.inputs = inputs;
+    l.outputs = outputs;
+    l.batch= batch;
+    l.batch_normalize = batch_normalize;
+    l.h = 1;
+    l.w = 1;
+    l.c = inputs;
+    l.out_h = 1;
+    l.out_w = 1;
+    l.out_c = outputs;
+    l.n = l.out_c;
+    l.size = 1;
+    l.stride = l.stride_x = l.stride_y = 1;
+    l.pad = 0;
+    l.activation = activation;
+    l.learning_rate_scale = 1;
+    l.groups = 1;
+    l.dilation = 1;
+    // Host buffers: activations and their gradients, one row of `outputs` values per sample.
+    l.output = (float*)xcalloc(total_batch * outputs, sizeof(float));
+    l.delta = (float*)xcalloc(total_batch * outputs, sizeof(float));
+    // Accumulated gradients for weights and biases.
+    l.weight_updates = (float*)xcalloc(inputs * outputs, sizeof(float));
+    l.bias_updates = (float*)xcalloc(outputs, sizeof(float));
+    // Parameters: outputs*inputs weights and one bias per output.
+    l.weights = (float*)xcalloc(outputs * inputs, sizeof(float));
+    l.biases = (float*)xcalloc(outputs, sizeof(float));
+    // CPU entry points used through the generic layer interface.
+    l.forward = forward_connected_layer;
+    l.backward = backward_connected_layer;
+    l.update = update_connected_layer;
+    // Weight init: rand_uniform(-1, 1) scaled by sqrt(2/inputs).
+    //float scale = 1./sqrt(inputs);
+    float scale = sqrt(2.f/inputs);
+    for(i = 0; i < outputs*inputs; ++i){
+        l.weights[i] = scale*rand_uniform(-1, 1);
+    }
+    // Biases start at zero.
+    for(i = 0; i < outputs; ++i){
+        l.biases[i] = 0;
+    }
+    // Batch-norm state is only allocated when requested.
+    if(batch_normalize){
+        l.scales = (float*)xcalloc(outputs, sizeof(float));
+        l.scale_updates = (float*)xcalloc(outputs, sizeof(float));
+        for(i = 0; i < outputs; ++i){
+            l.scales[i] = 1; // scale factors start at 1
+        }
+        // Per-output batch statistics and their gradients.
+        l.mean = (float*)xcalloc(outputs, sizeof(float));
+        l.mean_delta = (float*)xcalloc(outputs, sizeof(float));
+        l.variance = (float*)xcalloc(outputs, sizeof(float));
+        l.variance_delta = (float*)xcalloc(outputs, sizeof(float));
+        // Running statistics, used by the forward pass when not training.
+        l.rolling_mean = (float*)xcalloc(outputs, sizeof(float));
+        l.rolling_variance = (float*)xcalloc(outputs, sizeof(float));
+        // Copies of the pre- and post-normalisation activations kept for the backward pass.
+        l.x = (float*)xcalloc(total_batch * outputs, sizeof(float));
+        l.x_norm = (float*)xcalloc(total_batch * outputs, sizeof(float));
+    }
+    // GPU build: register device entry points and create device arrays from the host buffers above.
+#ifdef GPU
+    l.forward_gpu = forward_connected_layer_gpu;
+    l.backward_gpu = backward_connected_layer_gpu;
+    l.update_gpu = update_connected_layer_gpu;
+    // Device copies of the parameters.
+    l.weights_gpu = cuda_make_array(l.weights, outputs*inputs);
+    l.biases_gpu = cuda_make_array(l.biases, outputs);
+    // Device gradient accumulators.
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, outputs*inputs);
+    l.bias_updates_gpu = cuda_make_array(l.bias_updates, outputs);
+    // Device activations and deltas.
+    l.output_gpu = cuda_make_array(l.output, outputs*total_batch);
+    l.delta_gpu = cuda_make_array(l.delta, outputs*total_batch);
+    if (batch_normalize) {
+        l.scales_gpu = cuda_make_array(l.scales, outputs);
+        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
+        // Device batch statistics.
+        l.mean_gpu = cuda_make_array(l.mean, outputs);
+        l.variance_gpu = cuda_make_array(l.variance, outputs);
+        // NOTE(review): seeded from l.mean / l.variance rather than l.rolling_mean / l.rolling_variance; presumably equivalent because all are freshly xcalloc'ed - confirm.
+        l.rolling_mean_gpu = cuda_make_array(l.mean, outputs);
+        l.rolling_variance_gpu = cuda_make_array(l.variance, outputs);
+        // NOTE(review): likewise seeded from l.mean / l.variance instead of the *_delta host arrays.
+        l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
+        l.variance_delta_gpu = cuda_make_array(l.variance, outputs);
+        // NOTE(review): seeded from l.output instead of l.x / l.x_norm; the sizes match (total_batch*outputs).
+        l.x_gpu = cuda_make_array(l.output, total_batch*outputs);
+        l.x_norm_gpu = cuda_make_array(l.output, total_batch*outputs);
+    }
+#ifdef CUDNN
+    create_convolutional_cudnn_tensors(&l); // reuses the convolutional layer's cuDNN descriptor setup
+    cudnn_convolutional_setup(&l, cudnn_fastest, 0);   // cudnn_fastest, cudnn_smallest
+    l.workspace_size = get_connected_workspace_size(l);
+#endif  // CUDNN
+#endif  // GPU
+    fprintf(stderr, "connected                            %4d  ->  %4d\n", inputs, outputs);
+    return l;
+}
+
+/*
+ * CPU parameter update for a connected layer.
+ *
+ * Biases (and batch-norm scales, when enabled) are stepped by
+ * learning_rate/batch times their accumulated updates, after which the
+ * accumulators are multiplied by `momentum`.  Weights get a decay term
+ * (-decay*batch * weights) added to their accumulator before the same
+ * step-then-momentum sequence.
+ */
+void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    const int weight_count = l.inputs*l.outputs;
+    const int output_count = l.outputs;
+    const float step = learning_rate/batch;
+
+    // Biases.
+    axpy_cpu(output_count, step, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(output_count, momentum, l.bias_updates, 1);
+
+    // Batch-norm scales.
+    if(l.batch_normalize){
+        axpy_cpu(output_count, step, l.scale_updates, 1, l.scales, 1);
+        scal_cpu(output_count, momentum, l.scale_updates, 1);
+    }
+
+    // Weights: fold in weight decay first, then step and apply momentum.
+    axpy_cpu(weight_count, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(weight_count, step, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(weight_count, momentum, l.weight_updates, 1);
+}
+
+/*
+ * CPU forward pass of a connected layer.
+ *
+ * Clears the output, accumulates the gemm product of state.input and the
+ * weights into it, optionally batch-normalises (batch statistics while
+ * training, rolling statistics otherwise), adds the biases to every sample
+ * and finally applies the activation in place.
+ */
+void forward_connected_layer(connected_layer l, network_state state)
+{
+    const int batch = l.batch;
+    const int inputs = l.inputs;
+    const int outputs = l.outputs;
+    const int total = outputs*batch;
+    int b;
+
+    fill_cpu(total, 0, l.output, 1);
+    gemm(0,1,batch,outputs,inputs,1,state.input,inputs,l.weights,inputs,1,l.output,outputs);
+
+    if(l.batch_normalize){
+        if(!state.train){
+            // Inference: normalise with the running statistics.
+            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, batch, outputs, 1);
+        } else {
+            mean_cpu(l.output, batch, outputs, 1, l.mean);
+            variance_cpu(l.output, l.mean, batch, outputs, 1, l.variance);
+
+            // Running statistics: 0.95 * old + 0.05 * current batch.
+            scal_cpu(outputs, .95f, l.rolling_mean, 1);
+            axpy_cpu(outputs, .05f, l.mean, 1, l.rolling_mean, 1);
+            scal_cpu(outputs, .95f, l.rolling_variance, 1);
+            axpy_cpu(outputs, .05f, l.variance, 1, l.rolling_variance, 1);
+
+            // Keep the raw and normalised activations for the backward pass.
+            copy_cpu(total, l.output, 1, l.x, 1);
+            normalize_cpu(l.output, l.mean, l.variance, batch, outputs, 1);
+            copy_cpu(total, l.output, 1, l.x_norm, 1);
+        }
+        scale_bias(l.output, l.scales, batch, outputs, 1);
+    }
+
+    // Add the bias vector to each sample's row of outputs.
+    for(b = 0; b < batch; ++b){
+        axpy_cpu(outputs, 1, l.biases, 1, l.output + b*outputs, 1);
+    }
+    activate_array(l.output, total, l.activation);
+}
+
+/*
+ * CPU backward pass of a connected layer.
+ *
+ * Passes l.delta through gradient_array for the layer's activation,
+ * accumulates bias updates, back-propagates through batch normalisation
+ * when enabled, accumulates weight updates from delta and state.input,
+ * and, if state.delta is non-NULL, accumulates the gradient for the
+ * previous layer into it.
+ */
+void backward_connected_layer(connected_layer l, network_state state)
+{
+    const int batch = l.batch;
+    const int inputs = l.inputs;
+    const int outputs = l.outputs;
+    int b;
+
+    gradient_array(l.output, outputs*batch, l.activation, l.delta);
+
+    // Bias gradient: sum of delta rows over the batch.
+    for(b = 0; b < batch; ++b){
+        axpy_cpu(outputs, 1, l.delta + b*outputs, 1, l.bias_updates, 1);
+    }
+
+    if(l.batch_normalize){
+        backward_scale_cpu(l.x_norm, l.delta, batch, outputs, 1, l.scale_updates);
+
+        scale_bias(l.delta, l.scales, batch, outputs, 1);
+
+        mean_delta_cpu(l.delta, l.variance, batch, outputs, 1, l.mean_delta);
+        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, batch, outputs, 1, l.variance_delta);
+        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, batch, outputs, 1, l.delta);
+    }
+
+    // Weight gradient (first operand flagged as transposed), accumulated
+    // into weight_updates.
+    gemm(1,0,outputs,inputs,batch,1,l.delta,outputs,state.input,inputs,1,l.weight_updates,inputs);
+
+    // Gradient w.r.t. the input, only when the caller provided a buffer.
+    if(state.delta){
+        gemm(0,0,batch,inputs,outputs,1,l.delta,outputs,l.weights,inputs,1,state.delta,inputs);
+    }
+}
+
+
+/*
+ * Folds the batch-norm parameters of `l` into its weights and biases.
+ *
+ * For every output, the weights are multiplied by
+ * scale / sqrt(rolling_variance + 1e-6) and the bias is reduced by
+ * rolling_mean times that factor; the batch-norm parameters are then reset
+ * to scale 1, mean 0 and variance 1.
+ */
+void denormalize_connected_layer(layer l)
+{
+    int out, in;
+    for(out = 0; out < l.outputs; ++out){
+        const float scale = l.scales[out]/sqrt(l.rolling_variance[out] + .000001f);
+        float *row = l.weights + out*l.inputs;
+
+        for(in = 0; in < l.inputs; ++in){
+            row[in] *= scale;
+        }
+        l.biases[out] -= l.rolling_mean[out] * scale;
+
+        // Reset the batch-norm parameters for this output.
+        l.scales[out] = 1;
+        l.rolling_mean[out] = 0;
+        l.rolling_variance[out] = 1;
+    }
+}
+
+
+/*
+ * Prints summary statistics (via print_statistics) for the layer's
+ * batch-norm scales (when enabled), its biases and its weights.
+ */
+void statistics_connected_layer(layer l)
+{
+    if(l.batch_normalize){
+        printf("Scales ");
+        print_statistics(l.scales, l.outputs);
+        /*
+        printf("Rolling Mean ");
+        print_statistics(l.rolling_mean, l.outputs);
+        printf("Rolling Variance ");
+        print_statistics(l.rolling_variance, l.outputs);
+        */
+    }
+    printf("Biases ");
+    print_statistics(l.biases, l.outputs);
+    printf("Weights ");
+    // The weight buffer holds inputs*outputs values; summarise all of them
+    // rather than just the first l.outputs entries.
+    print_statistics(l.weights, l.inputs*l.outputs);
+}
+
+#ifdef GPU
+
+/*
+ * Copies the layer's parameters and accumulated updates from their device
+ * arrays back into the host arrays (weights, biases, their updates and,
+ * for batch-normalised layers, scales and rolling statistics).
+ */
+void pull_connected_layer(connected_layer l)
+{
+    const int weight_count = l.inputs*l.outputs;
+    const int output_count = l.outputs;
+
+    cuda_pull_array(l.weights_gpu, l.weights, weight_count);
+    cuda_pull_array(l.biases_gpu, l.biases, output_count);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, weight_count);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, output_count);
+
+    if (l.batch_normalize){
+        cuda_pull_array(l.scales_gpu, l.scales, output_count);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, output_count);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, output_count);
+    }
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+/*
+ * Copies the layer's parameters and accumulated updates from the host
+ * arrays into their device arrays (weights, biases, their updates and,
+ * for batch-normalised layers, scales and rolling statistics).
+ */
+void push_connected_layer(connected_layer l)
+{
+    const int weight_count = l.inputs*l.outputs;
+    const int output_count = l.outputs;
+
+    cuda_push_array(l.weights_gpu, l.weights, weight_count);
+    cuda_push_array(l.biases_gpu, l.biases, output_count);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, weight_count);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, output_count);
+
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, output_count);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, output_count);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, output_count);
+    }
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+/*
+ * GPU parameter update for a connected layer.
+ *
+ * The effective learning rate is learning_rate_init times the layer's
+ * learning_rate_scale.  When loss_scale differs from 1 the accumulated
+ * updates are first divided by it.  Biases, batch-norm scales (when
+ * enabled) and weights are then stepped by learning_rate/batch, with
+ * weight decay folded into the weight updates and `momentum` applied to
+ * every accumulator afterwards.
+ */
+void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
+{
+    float learning_rate = learning_rate_init * l.learning_rate_scale;
+
+    // Loss scale for Mixed-Precision on Tensor-Cores
+    if (loss_scale != 1.0) {
+        scal_ongpu(l.inputs*l.outputs, 1.0 / loss_scale, l.weight_updates_gpu, 1);
+        scal_ongpu(l.outputs, 1.0 / loss_scale, l.bias_updates_gpu, 1);
+        // scale_updates_gpu is only allocated for batch-normalised layers,
+        // so it must not be touched otherwise.
+        if (l.batch_normalize) {
+            scal_ongpu(l.outputs, 1.0 / loss_scale, l.scale_updates_gpu, 1);
+        }
+    }
+
+    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+
+    if(l.batch_normalize){
+        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
+    }
+
+    // Weights: add the decay term to the accumulator, step, apply momentum.
+    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
+}
+
+/*
+ * GPU forward pass of a connected layer.
+ *
+ * Clears the device output, computes the layer product either through
+ * cudnnConvolutionForward (CUDNN builds) or gemm_ongpu, then runs the
+ * batch-norm forward pass when enabled or adds the biases otherwise, and
+ * finally applies the activation in place.
+ */
+void forward_connected_layer_gpu(connected_layer l, network_state state)
+{
+    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+
+#ifdef CUDNN
+    // alpha scales the convolution result, beta the existing output.
+    float alpha = 1, beta = 0;
+
+    CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
+        &alpha,
+        l.srcTensorDesc,
+        state.input,
+        l.weightDesc,
+        l.weights_gpu,
+        l.convDesc,
+        l.fw_algo,
+        state.workspace,
+        l.workspace_size,
+        &beta,
+        l.dstTensorDesc,
+        l.output_gpu));
+#else // CUDNN
+    const int batch = l.batch;
+    const int inputs = l.inputs;
+    const int outputs = l.outputs;
+
+    gemm_ongpu(0,1,batch,outputs,inputs,1,state.input,inputs,l.weights_gpu,inputs,1,l.output_gpu,outputs);
+#endif // CUDNN
+
+    if (!l.batch_normalize) {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
+    }
+    else {
+        forward_batchnorm_layer_gpu(l, state);
+    }
+
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+}
+
+void backward_connected_layer_gpu(connected_layer l, network_state state)
+{ // GPU backward pass: activation gradient, bias updates, optional batch-norm backward, then weight updates and (if requested) the input gradient.
+    int i;
+    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); // presumably limits delta magnitudes to 1 before use - confirm in constrain_ongpu
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    for(i = 0; i < l.batch; ++i){ // accumulate each sample's delta row into the bias updates
+        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
+    }
+    // Batch-norm backward is delegated to the batchnorm layer implementation.
+    if(l.batch_normalize){
+        backward_batchnorm_layer_gpu(l, state);
+    }
+    // The cuDNN path below is compiled only if CUDNN_DISABLED is defined; otherwise the gemm path after #else is used.
+#ifdef CUDNN_DISABLED
+    float one = 1;
+    // calculate conv weight updates
+    // if used: beta=1 then loss decreases faster
+    CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
+        &one,
+        l.srcTensorDesc,
+        state.input,
+        l.ddstTensorDesc,
+        l.delta_gpu,
+        l.convDesc,
+        l.bf_algo,
+        state.workspace,
+        l.workspace_size,
+        &one,
+        l.dweightDesc,
+        l.weight_updates_gpu));
+
+    if (state.delta) {
+        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
+        // calculate delta for the next layer
+
+        CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
+            &one,
+            l.weightDesc,
+            l.weights_gpu,
+            l.ddstTensorDesc,
+            l.delta_gpu,
+            l.convDesc,
+            l.bd_algo,
+            state.workspace,
+            l.workspace_size,
+            &one,
+            l.dsrcTensorDesc,
+            state.delta));
+    }
+#else // CUDNN
+    // Weight updates: gemm over delta (first operand flagged as transposed) and the layer input, accumulated into weight_updates_gpu.
+    int m = l.outputs;
+    int k = l.batch;
+    int n = l.inputs;
+    float * a = l.delta_gpu;
+    float * b = state.input;
+    float * c = l.weight_updates_gpu;
+
+    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
+    // Input gradient: gemm over delta and the weights, accumulated into state.delta.
+    m = l.batch;
+    k = l.outputs;
+    n = l.inputs;
+
+    a = l.delta_gpu;
+    b = l.weights_gpu;
+    c = state.delta;
+
+    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); // skipped when the caller passes no delta buffer
+#endif // CUDNN
+}
+#endif
diff --git a/darknet-master/src/connected_layer.h b/darknet-master/src/connected_layer.h
new file mode 100644
index 0000000..284c512
--- /dev/null
+++ b/darknet-master/src/connected_layer.h
@@ -0,0 +1,34 @@
+#ifndef CONNECTED_LAYER_H
+#define CONNECTED_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+typedef layer connected_layer; // a connected layer is stored in the generic layer struct
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize); // allocate and initialise a connected layer
+size_t get_connected_workspace_size(layer l); // workspace size: convolutional workspace size under CUDNN, 0 otherwise
+// CPU implementations.
+void forward_connected_layer(connected_layer layer, network_state state); // output = activation(batch-norm/bias applied to the gemm of state.input and the weights)
+void backward_connected_layer(connected_layer layer, network_state state); // accumulates bias/weight updates; writes the input gradient into state.delta when it is non-NULL
+void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay); // applies accumulated updates with weight decay and momentum
+void denormalize_connected_layer(layer l); // folds batch-norm scale and rolling statistics into weights and biases
+void statistics_connected_layer(layer l); // prints statistics for scales (if batch-normalised), biases and weights
+// GPU implementations, available only in GPU builds.
+#ifdef GPU
+void forward_connected_layer_gpu(connected_layer layer, network_state state); // device forward pass
+void backward_connected_layer_gpu(connected_layer layer, network_state state); // device backward pass
+void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale); // device update; divides updates by loss_scale when it is not 1
+void push_connected_layer(connected_layer layer); // host -> device copy of parameters and updates
+void pull_connected_layer(connected_layer layer); // device -> host copy of parameters and updates
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/conv_lstm_layer.c b/darknet-master/src/conv_lstm_layer.c
new file mode 100644
index 0000000..a7804e7
--- /dev/null
+++ b/darknet-master/src/conv_lstm_layer.c
@@ -0,0 +1,1497 @@
+// Page 4: https://arxiv.org/abs/1506.04214v2
+// Page 3: https://arxiv.org/pdf/1705.06368v3.pdf
+// https://wikimedia.org/api/rest_v1/media/math/render/svg/1edbece2559479959fe829e9c6657efb380debe7
+
+#include "conv_lstm_layer.h"
+#include "connected_layer.h"
+#include "convolutional_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Move a layer's per-step buffer pointers by `steps` time steps.
+ *
+ * One time step spans outputs * batch floats. Only the pointers stored in
+ * *l are changed; no memory is read or written.
+ */
+static void increment_layer(layer *l, int steps)
+{
+    const int offset = l->outputs*l->batch*steps;
+
+    l->output = l->output + offset;
+    l->delta = l->delta + offset;
+    l->x = l->x + offset;
+    l->x_norm = l->x_norm + offset;
+
+#ifdef GPU
+    // Device-side buffers advance by the same offset.
+    l->output_gpu = l->output_gpu + offset;
+    l->delta_gpu = l->delta_gpu + offset;
+    l->x_gpu = l->x_gpu + offset;
+    l->x_norm_gpu = l->x_norm_gpu + offset;
+#endif
+}
+
+
+// Build one gate convolution for a CONV_LSTM layer.
+// Heap-allocates the sub-layer, constructs it with make_convolutional_layer
+// (same stride in x and y, `c` input channels, `n` filters), overrides its
+// batch with the per-step batch and raises parent->workspace_size so that it
+// covers the new sub-layer.
+static layer *make_conv_lstm_gate(layer *parent, int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train)
+{
+    layer *gate = (layer*)xcalloc(1, sizeof(layer));
+    *gate = make_convolutional_layer(batch, steps, h, w, c, n, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
+    gate->batch = batch;
+    if (parent->workspace_size < gate->workspace_size) parent->workspace_size = gate->workspace_size;
+    return gate;
+}
+
+/*
+ * Create a convolutional LSTM layer.
+ *
+ * batch           total batch; it is divided by `steps` to get the per-step batch
+ * h, w, c         input height, width and channels
+ * output_filters  number of filters of every gate convolution
+ * steps           number of time steps processed per call
+ * peephole        when non-zero, the V (cell-state) convolutions are built
+ * bottleneck      when non-zero, a single W convolution over 2*output_filters
+ *                 channels is built instead of four separate W gates
+ *
+ * The remaining arguments are forwarded to make_convolutional_layer.
+ * Returns the initialised layer by value; sub-layers and buffers are heap
+ * (and, with GPU, device) allocations referenced from it.
+ */
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor, int bottleneck, int train)
+{
+    fprintf(stderr, "CONV_LSTM Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters);
+
+    batch = batch / steps;  // per-time-step batch used for every buffer below
+    layer l = { (LAYER_TYPE)0 };
+    l.train = train;
+    l.batch = batch;
+    l.type = CONV_LSTM;
+    l.bottleneck = bottleneck;
+    l.steps = steps;
+    l.size = size;
+    l.stride = stride;
+    l.dilation = dilation;
+    l.pad = pad;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.groups = groups;
+    l.out_c = output_filters;
+    l.inputs = h * w * c;
+    l.xnor = xnor;
+    l.peephole = peephole;
+
+    // U: gates fed by the layer input (c channels).
+    // The construction order of the sub-layers is kept as uf, ui, ug, uo,
+    // wf, wi, wg, wo, vf, vi, vo.
+    l.uf = make_conv_lstm_gate(&l, batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+    l.ui = make_conv_lstm_gate(&l, batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+    l.ug = make_conv_lstm_gate(&l, batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+    l.uo = make_conv_lstm_gate(&l, batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+
+    if (l.bottleneck) {
+        // bottleneck-conv with 2x channels: only wf is a real convolution,
+        // wi/wg/wo stay zero-initialised placeholders.
+        l.wf = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters*2, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.wi = (layer*)xcalloc(1, sizeof(layer));
+        l.wg = (layer*)xcalloc(1, sizeof(layer));
+        l.wo = (layer*)xcalloc(1, sizeof(layer));
+    }
+    else {
+        // W: gates fed by a tensor with output_filters channels.
+        l.wf = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.wi = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.wg = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.wo = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+    }
+
+    // V: peephole gates; without peephole they are zeroed placeholders so
+    // that the pointers are always valid.
+    if (l.peephole) {
+        l.vf = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.vi = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+        l.vo = make_conv_lstm_gate(&l, batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, xnor, train);
+    }
+    else {
+        l.vf = (layer*)xcalloc(1, sizeof(layer));
+        l.vi = (layer*)xcalloc(1, sizeof(layer));
+        l.vo = (layer*)xcalloc(1, sizeof(layer));
+    }
+
+    l.batch_normalize = batch_normalize;
+
+    // The output geometry is taken from uo, which exists in every mode.
+    l.out_h = l.uo->out_h;
+    l.out_w = l.uo->out_w;
+    l.outputs = l.uo->outputs;
+    int outputs = l.outputs;
+
+    if (!l.bottleneck) assert(l.wo->outputs == l.uo->outputs);
+    assert(l.wf->outputs == l.uf->outputs);
+
+    // Per-sequence buffers hold `steps` time steps, state buffers hold one.
+    l.output = (float*)xcalloc(outputs * batch * steps, sizeof(float));
+
+    l.forward = forward_conv_lstm_layer;
+    l.update = update_conv_lstm_layer;
+    l.backward = backward_conv_lstm_layer;
+
+    l.prev_state_cpu =  (float*)xcalloc(batch*outputs, sizeof(float));
+    l.prev_cell_cpu =   (float*)xcalloc(batch*outputs, sizeof(float));
+    l.cell_cpu =        (float*)xcalloc(batch*outputs*steps, sizeof(float));
+
+    l.f_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.i_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.g_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.o_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.c_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.stored_c_cpu =    (float*)xcalloc(batch*outputs, sizeof(float));
+    l.h_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.stored_h_cpu =    (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp_cpu =        (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp2_cpu =       (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp3_cpu =       (float*)xcalloc(batch*outputs, sizeof(float));
+    l.dc_cpu =          (float*)xcalloc(batch*outputs, sizeof(float));
+    l.dh_cpu =          (float*)xcalloc(batch*outputs, sizeof(float));
+
+#ifdef GPU
+    l.forward_gpu = forward_conv_lstm_layer_gpu;
+    l.backward_gpu = backward_conv_lstm_layer_gpu;
+    l.update_gpu = update_conv_lstm_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
+
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
+    l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
+
+    l.f_gpu = cuda_make_array(0, batch*outputs);
+    l.i_gpu = cuda_make_array(0, batch*outputs);
+    l.g_gpu = cuda_make_array(0, batch*outputs);
+    l.o_gpu = cuda_make_array(0, batch*outputs);
+    l.c_gpu = cuda_make_array(0, batch*outputs);
+    if (l.bottleneck) {
+        // Twice the state size, matching the 2x-channel bottleneck input.
+        l.bottelneck_hi_gpu = cuda_make_array(0, batch*outputs * 2);
+        l.bottelneck_delta_gpu = cuda_make_array(0, batch*outputs * 2);
+    }
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+    l.stored_c_gpu = cuda_make_array(0, batch*outputs);
+    l.stored_h_gpu = cuda_make_array(0, batch*outputs);
+    l.temp_gpu =  cuda_make_array(0, batch*outputs);
+    l.temp2_gpu = cuda_make_array(0, batch*outputs);
+    l.temp3_gpu = cuda_make_array(0, batch*outputs);
+    l.dc_gpu = cuda_make_array(0, batch*outputs);
+    l.dh_gpu = cuda_make_array(0, batch*outputs);
+    l.last_prev_state_gpu = cuda_make_array(0, l.batch*l.outputs);
+    l.last_prev_cell_gpu = cuda_make_array(0, l.batch*l.outputs);
+#endif
+
+    // Placeholder sub-layers are zeroed, so they contribute 0 here.
+    l.bflops = l.uf->bflops + l.ui->bflops + l.ug->bflops + l.uo->bflops +
+        l.wf->bflops + l.wi->bflops + l.wg->bflops + l.wo->bflops +
+        l.vf->bflops + l.vi->bflops + l.vo->bflops;
+
+    // Element-wise gate arithmetic; computed in floating point so that the
+    // product cannot overflow int before the division.
+    if(l.peephole) l.bflops += 12. * l.outputs*l.batch / 1000000000.;
+    else l.bflops += 9. * l.outputs*l.batch / 1000000000.;
+
+    return l;
+}
+
+/*
+ * Create a HISTORY layer.
+ *
+ * The layer keeps the input geometry (h x w) and multiplies the channel
+ * count by history_size, so one output sample holds history_size input
+ * frames. Output, delta and previous-state buffers are sized
+ * batch * outputs floats (plus device copies when GPU is defined).
+ */
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train)
+{
+    const int frame = h * w * c;   // floats in one input sample
+    int total;
+    layer l = { (LAYER_TYPE)0 };
+
+    l.type = HISTORY;
+    l.train = train;
+    l.batch = batch;
+    l.steps = steps;
+    l.history_size = history_size;
+
+    // Input geometry.
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.inputs = frame;
+
+    // Output geometry: same spatial size, history_size times the channels.
+    l.out_h = h;
+    l.out_w = w;
+    l.out_c = c * history_size;
+    l.outputs = frame * history_size;
+
+    l.forward = forward_history_layer;
+    l.backward = backward_history_layer;
+
+    fprintf(stderr, "HISTORY b = %d, s = %2d, steps = %2d   %4d x%4d x%4d -> %4d x%4d x%4d \n", l.batch / l.steps, l.history_size, l.steps, w, h, c, l.out_w, l.out_h, l.out_c);
+
+    total = l.batch * l.outputs;
+    l.output = (float*)xcalloc(total, sizeof(float));
+    l.delta = (float*)xcalloc(total, sizeof(float));
+    l.prev_state_cpu = (float*)xcalloc(total, sizeof(float));
+
+#ifdef GPU
+    l.forward_gpu = forward_history_layer_gpu;
+    l.backward_gpu = backward_history_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, total);
+    l.delta_gpu = cuda_make_array(0, total);
+    l.prev_state_gpu = cuda_make_array(0, total);
+#endif  // GPU
+
+    return l;
+}
+
+/*
+ * CPU forward pass of the HISTORY layer.
+ *
+ * With a single step the input is copied to the output unchanged.
+ * Otherwise, for every time step and sample, the output is the current
+ * input frame followed by the first (history_size - 1) frames of the
+ * previous step's output; the very first step reads l.prev_state_cpu.
+ * The last step's output is saved back into l.prev_state_cpu.
+ */
+void forward_history_layer(layer l, network_state state)
+{
+    if (l.steps == 1) {
+        copy_cpu(l.inputs*l.batch, state.input, 1, l.output, 1);
+        return;
+    }
+
+    const int mini_batch = l.batch / l.steps;
+    const int in_step = l.inputs * mini_batch;     // input floats per time step
+    const int out_step = l.outputs * mini_batch;   // output floats per time step
+    const int carried = l.inputs * (l.history_size - 1);
+
+    float *history = l.prev_state_cpu;
+    int t, b;
+
+    for (t = 0; t < l.steps; ++t) {
+        for (b = 0; b < mini_batch; ++b) {
+            float *src = state.input + b*l.inputs + t*in_step;
+            float *dst = l.output + b*l.outputs + t*out_step;
+
+            // Older frames move back by one slot, the new frame goes first.
+            copy_cpu(carried, history + b*l.outputs, 1, dst + l.inputs, 1);
+            copy_cpu(l.inputs, src, 1, dst, 1);
+        }
+        // The step just written is the history of the next one.
+        history = l.output + t*out_step;
+    }
+
+    copy_cpu(out_step, l.output + (l.steps-1)*out_step, 1, l.prev_state_cpu, 1);
+}
+
+/*
+ * CPU backward pass of the HISTORY layer.
+ *
+ * Adds the gradient of the newest frame of every output sample (the first
+ * l.inputs floats of that sample in l.delta) to the matching input sample
+ * in state.delta. Does nothing when state.delta is NULL.
+ */
+void backward_history_layer(layer l, network_state state)
+{
+    // No gradient buffer to accumulate into; other backward passes in this
+    // code base guard state.delta the same way.
+    if (!state.delta) return;
+
+    if (l.steps == 1) {
+        axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1);
+        return;
+    }
+
+    const int batch = l.batch / l.steps;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta + output_start;
+
+            // Accumulate rather than overwrite.
+            axpy_cpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+
+#ifdef GPU
+/*
+ * GPU forward pass of the HISTORY layer.
+ *
+ * With a single step the input is copied to l.output_gpu unchanged.
+ * Otherwise, for every time step and sample, the output is the current
+ * input frame followed by the first (history_size - 1) frames of the
+ * previous step's output; the very first step reads l.prev_state_gpu.
+ * The last step's output is saved back into l.prev_state_gpu.
+ */
+void forward_history_layer_gpu(const layer l, network_state state)
+{
+    if (l.steps == 1) {
+        simple_copy_ongpu(l.inputs*l.batch, state.input, l.output_gpu);
+        return;
+    }
+
+    const int batch = l.batch / l.steps;
+
+    // Loop invariants: how many floats are carried over from the previous
+    // step and where they land inside an output sample.
+    const int shift_size = l.inputs * (l.history_size - 1);
+    const int output_shift = l.inputs;
+
+    float *prev_output = l.prev_state_gpu;
+
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *input = state.input + input_start;
+            float *output = l.output_gpu + output_start;
+
+            // Older frames move back by one slot, the new frame goes first.
+            simple_copy_ongpu(shift_size, prev_output + b*l.outputs, output + output_shift);
+            simple_copy_ongpu(l.inputs, input, output);
+        }
+        // The step just written is the history of the next one.
+        prev_output = l.output_gpu + i*l.outputs*batch;
+    }
+
+    int output_start = (l.steps - 1)*l.outputs*batch;
+    simple_copy_ongpu(batch*l.outputs, l.output_gpu + output_start, l.prev_state_gpu);
+}
+
+/*
+ * GPU backward pass of the HISTORY layer.
+ *
+ * Adds the gradient of the newest frame of every output sample (the first
+ * l.inputs floats of that sample in l.delta_gpu) to the matching input
+ * sample in state.delta. Does nothing when state.delta is NULL.
+ */
+void backward_history_layer_gpu(const layer l, network_state state)
+{
+    // No gradient buffer to accumulate into; other backward passes in this
+    // code base guard state.delta the same way.
+    if (!state.delta) return;
+
+    if (l.steps == 1) {
+        axpy_ongpu(l.inputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
+        return;
+    }
+
+    const int batch = l.batch / l.steps;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta_gpu + output_start;
+
+            // Accumulate rather than overwrite.
+            axpy_ongpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+#endif
+
+
+/*
+ * Apply the CPU weight update to every gate convolution of the layer.
+ *
+ * The V gates are updated only when peephole is enabled; the W and U gates
+ * are always updated. Update order: vf, vi, vo, wf, wi, wg, wo, uf, ui,
+ * ug, uo.
+ */
+void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    layer *gates[11];
+    int count = 0;
+    int k;
+
+    if (l.peephole) {
+        gates[count++] = l.vf;
+        gates[count++] = l.vi;
+        gates[count++] = l.vo;
+    }
+    gates[count++] = l.wf;
+    gates[count++] = l.wi;
+    gates[count++] = l.wg;
+    gates[count++] = l.wo;
+    gates[count++] = l.uf;
+    gates[count++] = l.ui;
+    gates[count++] = l.ug;
+    gates[count++] = l.uo;
+
+    for (k = 0; k < count; ++k) {
+        update_convolutional_layer(*gates[k], batch, learning_rate, momentum, decay);
+    }
+}
+
+// Resize one gate convolution and raise the parent's workspace_size so that
+// it covers the resized sub-layer.
+static void resize_conv_lstm_gate(layer *parent, layer *gate, int w, int h)
+{
+    resize_convolutional_layer(gate, w, h);
+    if (parent->workspace_size < gate->workspace_size) parent->workspace_size = gate->workspace_size;
+}
+
+#ifdef GPU
+// Release a device buffer (if any) and allocate a new one of n floats.
+static float *remake_conv_lstm_gpu_array(float *old, int n)
+{
+    if (old) cudaFree(old);
+    return cuda_make_array(0, n);
+}
+#endif
+
+/*
+ * Resize a convolutional LSTM layer to a new input width and height.
+ *
+ * Resizes every gate convolution that the constructor actually built,
+ * refreshes the layer geometry from uo (as the constructor does) and
+ * reallocates all host buffers and, with GPU, all device buffers.
+ *
+ * In bottleneck mode wi/wg/wo are zero-initialised placeholders that were
+ * never constructed, so they are neither resized nor used for geometry.
+ */
+void resize_conv_lstm_layer(layer *l, int w, int h)
+{
+    if (l->peephole) {
+        resize_conv_lstm_gate(l, l->vf, w, h);
+        resize_conv_lstm_gate(l, l->vi, w, h);
+        resize_conv_lstm_gate(l, l->vo, w, h);
+    }
+
+    resize_conv_lstm_gate(l, l->wf, w, h);
+    if (!l->bottleneck) {
+        resize_conv_lstm_gate(l, l->wi, w, h);
+        resize_conv_lstm_gate(l, l->wg, w, h);
+        resize_conv_lstm_gate(l, l->wo, w, h);
+    }
+
+    resize_conv_lstm_gate(l, l->uf, w, h);
+    resize_conv_lstm_gate(l, l->ui, w, h);
+    resize_conv_lstm_gate(l, l->ug, w, h);
+    resize_conv_lstm_gate(l, l->uo, w, h);
+
+    l->w = w;
+    l->h = h;
+    // uo exists in every mode; wo is only a placeholder with bottleneck.
+    l->out_h = l->uo->out_h;
+    l->out_w = l->uo->out_w;
+    l->outputs = l->uo->outputs;
+    int outputs = l->outputs;
+    l->inputs = w*h*l->c;
+    int steps = l->steps;
+    int batch = l->batch;
+
+    if (!l->bottleneck) assert(l->wo->outputs == l->uo->outputs);
+    assert(l->wf->outputs == l->uf->outputs);
+
+    // Per-sequence buffers hold `steps` time steps, state buffers hold one.
+    l->output = (float*)xrealloc(l->output, outputs * batch * steps * sizeof(float));
+
+    l->prev_state_cpu = (float*)xrealloc(l->prev_state_cpu, batch*outputs * sizeof(float));
+    l->prev_cell_cpu = (float*)xrealloc(l->prev_cell_cpu, batch*outputs * sizeof(float));
+    l->cell_cpu = (float*)xrealloc(l->cell_cpu, batch*outputs*steps * sizeof(float));
+
+    l->f_cpu = (float*)xrealloc(l->f_cpu, batch*outputs * sizeof(float));
+    l->i_cpu = (float*)xrealloc(l->i_cpu, batch*outputs * sizeof(float));
+    l->g_cpu = (float*)xrealloc(l->g_cpu, batch*outputs * sizeof(float));
+    l->o_cpu = (float*)xrealloc(l->o_cpu, batch*outputs * sizeof(float));
+    l->c_cpu = (float*)xrealloc(l->c_cpu, batch*outputs * sizeof(float));
+    l->h_cpu = (float*)xrealloc(l->h_cpu, batch*outputs * sizeof(float));
+    l->temp_cpu = (float*)xrealloc(l->temp_cpu, batch*outputs * sizeof(float));
+    l->temp2_cpu = (float*)xrealloc(l->temp2_cpu, batch*outputs * sizeof(float));
+    l->temp3_cpu = (float*)xrealloc(l->temp3_cpu, batch*outputs * sizeof(float));
+    l->dc_cpu = (float*)xrealloc(l->dc_cpu, batch*outputs * sizeof(float));
+    l->dh_cpu = (float*)xrealloc(l->dh_cpu, batch*outputs * sizeof(float));
+    l->stored_c_cpu = (float*)xrealloc(l->stored_c_cpu, batch*outputs * sizeof(float));
+    l->stored_h_cpu = (float*)xrealloc(l->stored_h_cpu, batch*outputs * sizeof(float));
+
+#ifdef GPU
+    l->output_gpu = remake_conv_lstm_gpu_array(l->output_gpu, batch*outputs*steps);
+    l->delta_gpu = remake_conv_lstm_gpu_array(l->delta_gpu, batch*outputs*steps);
+
+    l->prev_state_gpu = remake_conv_lstm_gpu_array(l->prev_state_gpu, batch*outputs);
+    l->prev_cell_gpu = remake_conv_lstm_gpu_array(l->prev_cell_gpu, batch*outputs);
+    l->cell_gpu = remake_conv_lstm_gpu_array(l->cell_gpu, batch*outputs*steps);
+
+    l->f_gpu = remake_conv_lstm_gpu_array(l->f_gpu, batch*outputs);
+    l->i_gpu = remake_conv_lstm_gpu_array(l->i_gpu, batch*outputs);
+    l->g_gpu = remake_conv_lstm_gpu_array(l->g_gpu, batch*outputs);
+    l->o_gpu = remake_conv_lstm_gpu_array(l->o_gpu, batch*outputs);
+    l->c_gpu = remake_conv_lstm_gpu_array(l->c_gpu, batch*outputs);
+    l->h_gpu = remake_conv_lstm_gpu_array(l->h_gpu, batch*outputs);
+
+    if (l->bottleneck) {
+        // Same 2x sizing as in the constructor; without this the buffers
+        // would keep their pre-resize size.
+        l->bottelneck_hi_gpu = remake_conv_lstm_gpu_array(l->bottelneck_hi_gpu, batch*outputs * 2);
+        l->bottelneck_delta_gpu = remake_conv_lstm_gpu_array(l->bottelneck_delta_gpu, batch*outputs * 2);
+    }
+
+    l->temp_gpu = remake_conv_lstm_gpu_array(l->temp_gpu, batch*outputs);
+    l->temp2_gpu = remake_conv_lstm_gpu_array(l->temp2_gpu, batch*outputs);
+    l->temp3_gpu = remake_conv_lstm_gpu_array(l->temp3_gpu, batch*outputs);
+    l->dc_gpu = remake_conv_lstm_gpu_array(l->dc_gpu, batch*outputs);
+    l->dh_gpu = remake_conv_lstm_gpu_array(l->dh_gpu, batch*outputs);
+
+    l->stored_c_gpu = remake_conv_lstm_gpu_array(l->stored_c_gpu, batch*outputs);
+    l->stored_h_gpu = remake_conv_lstm_gpu_array(l->stored_h_gpu, batch*outputs);
+
+    l->last_prev_state_gpu = remake_conv_lstm_gpu_array(l->last_prev_state_gpu, batch*outputs);
+    l->last_prev_cell_gpu = remake_conv_lstm_gpu_array(l->last_prev_cell_gpu, batch*outputs);
+#endif
+}
+
+/*
+ * Reset the hidden state (h) and cell state (c) of the layer to zero on
+ * the host and, with GPU, push the zeroed buffers to the device.
+ */
+void free_state_conv_lstm(layer l)
+{
+    const int n = l.outputs * l.batch;
+    int k;
+
+    for (k = 0; k < n; ++k) {
+        l.h_cpu[k] = 0;
+        l.c_cpu[k] = 0;
+    }
+
+#ifdef GPU
+    cuda_push_array(l.h_gpu, l.h_cpu, n);
+    cuda_push_array(l.c_gpu, l.c_cpu, n);
+
+    // NOTE: dc_gpu and dh_gpu are not cleared here; the original author
+    // marked doing so as "dont use".
+#endif  // GPU
+}
+
+/*
+ * Fill the hidden state (h) and cell state (c) with rand_uniform(-1, 1)
+ * values and, with GPU, push both buffers to the device.
+ */
+void randomize_state_conv_lstm(layer l)
+{
+    const int n = l.outputs * l.batch;
+    int k;
+
+    // Two separate passes: all of h is drawn before any of c, which keeps
+    // the order of the random draws.
+    k = 0;
+    while (k < n) {
+        l.h_cpu[k] = rand_uniform(-1, 1);
+        ++k;
+    }
+    k = 0;
+    while (k < n) {
+        l.c_cpu[k] = rand_uniform(-1, 1);
+        ++k;
+    }
+
+#ifdef GPU
+    cuda_push_array(l.h_gpu, l.h_cpu, n);
+    cuda_push_array(l.c_gpu, l.c_cpu, n);
+#endif  // GPU
+}
+
+
+/*
+ * Save the current cell state (c) and hidden state (h) into the stored_*
+ * buffers, on the host and, with GPU, on the device.
+ */
+void remember_state_conv_lstm(layer l)
+{
+    const int n = l.outputs * l.batch;
+
+    memcpy(l.stored_c_cpu, l.c_cpu, n * sizeof(float));
+    memcpy(l.stored_h_cpu, l.h_cpu, n * sizeof(float));
+
+#ifdef GPU
+    copy_ongpu(n, l.c_gpu, 1, l.stored_c_gpu, 1);
+    copy_ongpu(n, l.h_gpu, 1, l.stored_h_gpu, 1);
+#endif  // GPU
+}
+
+/*
+ * Copy the stored_* buffers back into the current cell state (c) and
+ * hidden state (h), on the host and, with GPU, on the device.
+ */
+void restore_state_conv_lstm(layer l)
+{
+    const int n = l.outputs * l.batch;
+
+    memcpy(l.c_cpu, l.stored_c_cpu, n * sizeof(float));
+    memcpy(l.h_cpu, l.stored_h_cpu, n * sizeof(float));
+
+#ifdef GPU
+    copy_ongpu(n, l.stored_c_gpu, 1, l.c_gpu, 1);
+    copy_ongpu(n, l.stored_h_gpu, 1, l.h_gpu, 1);
+#endif  // GPU
+}
+
+/*
+ * CPU forward pass of the convolutional LSTM over l.steps time steps.
+ *
+ * When state.train is set, the delta buffers of the gate convolutions and
+ * of the layer itself are cleared first. For every step the gate
+ * convolutions are evaluated, the gates are combined into the new cell
+ * state (l.c_cpu) and hidden state (l.h_cpu), and both are appended to
+ * l.cell_cpu / l.output. All pointer advances happen on by-value copies,
+ * so the layer owned by the caller is not modified.
+ */
+void forward_conv_lstm_layer(layer l, network_state state)
+{
+    const int n = l.outputs*l.batch;   // floats in one time step
+    const int total = n * l.steps;     // floats in the whole sequence
+    int t;
+
+    // State handed to the gate convolutions; only the input changes.
+    network_state sub = { 0 };
+    sub.train = state.train;
+    sub.workspace = state.workspace;
+    sub.net = state.net;
+
+    // Local copies, so that increment_layer() moves only these pointers.
+    layer vf = *(l.vf);
+    layer vi = *(l.vi);
+    layer vo = *(l.vo);
+
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+
+    if (state.train) {
+        if (l.peephole) {
+            fill_cpu(total, 0, vf.delta, 1);
+            fill_cpu(total, 0, vi.delta, 1);
+            fill_cpu(total, 0, vo.delta, 1);
+        }
+
+        fill_cpu(total, 0, wf.delta, 1);
+        fill_cpu(total, 0, wi.delta, 1);
+        fill_cpu(total, 0, wg.delta, 1);
+        fill_cpu(total, 0, wo.delta, 1);
+
+        fill_cpu(total, 0, uf.delta, 1);
+        fill_cpu(total, 0, ui.delta, 1);
+        fill_cpu(total, 0, ug.delta, 1);
+        fill_cpu(total, 0, uo.delta, 1);
+
+        fill_cpu(total, 0, l.delta, 1);
+    }
+
+    for (t = 0; t < l.steps; ++t)
+    {
+        // V gates for f and i read the cell state of the previous step;
+        // vo is evaluated further down, on the updated cell state.
+        if (l.peephole) {
+            assert(l.outputs == vf.out_w * vf.out_h * vf.out_c);
+            sub.input = l.c_cpu;
+            forward_convolutional_layer(vf, sub);
+            forward_convolutional_layer(vi, sub);
+        }
+
+        // W gates read the hidden state of the previous step.
+        assert(l.outputs == wf.out_w * wf.out_h * wf.out_c);
+        assert(wf.c == l.out_c && wi.c == l.out_c && wg.c == l.out_c && wo.c == l.out_c);
+
+        sub.input = l.h_cpu;
+        forward_convolutional_layer(wf, sub);
+        forward_convolutional_layer(wi, sub);
+        forward_convolutional_layer(wg, sub);
+        forward_convolutional_layer(wo, sub);
+
+        // U gates read the layer input of this step.
+        assert(l.inputs == uf.w * uf.h * uf.c);
+        assert(uf.c == l.c && ui.c == l.c && ug.c == l.c && uo.c == l.c);
+
+        sub.input = state.input;
+        forward_convolutional_layer(uf, sub);
+        forward_convolutional_layer(ui, sub);
+        forward_convolutional_layer(ug, sub);
+        forward_convolutional_layer(uo, sub);
+
+        // f = logistic(wf + uf + vf)
+        copy_cpu(n, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(n, 1, uf.output, 1, l.f_cpu, 1);
+        if (l.peephole) axpy_cpu(n, 1, vf.output, 1, l.f_cpu, 1);
+        activate_array(l.f_cpu, n, LOGISTIC);
+
+        // i = logistic(wi + ui + vi)
+        copy_cpu(n, wi.output, 1, l.i_cpu, 1);
+        axpy_cpu(n, 1, ui.output, 1, l.i_cpu, 1);
+        if (l.peephole) axpy_cpu(n, 1, vi.output, 1, l.i_cpu, 1);
+        activate_array(l.i_cpu, n, LOGISTIC);
+
+        // g = tanh(wg + ug)
+        copy_cpu(n, wg.output, 1, l.g_cpu, 1);
+        axpy_cpu(n, 1, ug.output, 1, l.g_cpu, 1);
+        activate_array(l.g_cpu, n, TANH);
+
+        // c = f*c + i*g   (temp holds i*g)
+        copy_cpu(n, l.i_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(n, l.g_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(n, l.f_cpu, 1, l.c_cpu, 1);
+        axpy_cpu(n, 1, l.temp_cpu, 1, l.c_cpu, 1);
+
+        // o = logistic(wo + uo + vo(c_new))
+        if (l.peephole) {
+            sub.input = l.c_cpu;
+            forward_convolutional_layer(vo, sub);
+        }
+        copy_cpu(n, wo.output, 1, l.o_cpu, 1);
+        axpy_cpu(n, 1, uo.output, 1, l.o_cpu, 1);
+        if (l.peephole) axpy_cpu(n, 1, vo.output, 1, l.o_cpu, 1);
+        activate_array(l.o_cpu, n, LOGISTIC);
+
+        // h = o * tanh(c)
+        copy_cpu(n, l.c_cpu, 1, l.h_cpu, 1);
+        activate_array(l.h_cpu, n, TANH);
+        mul_cpu(n, l.o_cpu, 1, l.h_cpu, 1);
+
+        // Optional clamp of the cell state, then NaN/Inf clean-up.
+        if (l.state_constrain) constrain_cpu(n, l.state_constrain, l.c_cpu);
+        fix_nan_and_inf_cpu(l.c_cpu, n);
+        fix_nan_and_inf_cpu(l.h_cpu, n);
+
+        // Record this step's cell and hidden state in the sequence buffers.
+        copy_cpu(n, l.c_cpu, 1, l.cell_cpu, 1);
+        copy_cpu(n, l.h_cpu, 1, l.output, 1);
+
+        // Advance every per-step pointer to the next time step.
+        state.input += l.inputs*l.batch;
+        l.output    += n;
+        l.cell_cpu  += n;
+
+        if (l.peephole) {
+            increment_layer(&vf, 1);
+            increment_layer(&vi, 1);
+            increment_layer(&vo, 1);
+        }
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_conv_lstm_layer(layer l, network_state state)
+{   // CPU backward pass: visits the l.steps time steps from last to first and back-propagates every gate.
+    network_state s = { 0 };   // state passed to the sub-layer backward calls; s.input / s.delta are set per call below
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer vf = *(l.vf);   // v*: local copies of the convolutions whose backward input is the cell state (used only when l.peephole)
+    layer vi = *(l.vi);
+    layer vo = *(l.vo);
+    // w*: local copies of the convolutions whose backward input is l.prev_state_cpu
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // u*: local copies of the convolutions whose backward input is state.input
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Step every copy to the last time step. increment_layer() is defined outside this view; presumably it offsets the per-step buffers.
+    if (l.peephole) {
+        increment_layer(&vf, l.steps - 1);
+        increment_layer(&vi, l.steps - 1);
+        increment_layer(&vo, l.steps - 1);
+    }
+
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    // Point the input, output, cell and delta buffers at the last time step too.
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output += l.outputs*l.batch*(l.steps - 1);
+    l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta += l.outputs*l.batch*(l.steps - 1);
+
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1);   // c(t-1); prev_cell_cpu is left as it was when i == 0
+        copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);   // c(t)
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1);   // h(t-1); prev_state_cpu is left as it was when i == 0
+        copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);   // h(t)
+
+        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;   // delta slot of step t-1; every use of dh_cpu further down is commented out
+
+        // f = wf + uf (+ vf with peephole): the gate pre-activations are re-summed from the stored sub-layer outputs
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
+        if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vf.output, 1, l.f_cpu, 1);
+
+        // i = wi + ui (+ vi with peephole)
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
+        if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vi.output, 1, l.i_cpu, 1);
+
+        // g = wg + ug (no peephole term)
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
+
+        // o = wo + uo (+ vo with peephole)
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
+        if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vo.output, 1, l.o_cpu, 1);
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
+
+        copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
+
+        copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);
+
+        gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
+        axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);   // dc_cpu is not reset in this function: the first iteration adds whatever it held on entry
+        // temp  = tanh(c)
+        // temp2 = delta * o * grad_tanh(tanh(c)) + dc   (dc = l.dc_cpu, written at the end of the previously processed, i.e. later, step)
+        // temp3 = delta
+
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
+        mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        // delta for o(w,u,v):       temp  = delta * tanh(c) * grad_logistic(o)
+        // delta for c,f,i,g(w,u,v): temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // delta for output:         temp3 = delta
+
+        // o: temp is copied into the delta of each output-gate convolution, which is then back-propagated
+        // delta for O(w,u,v):     temp  = delta * tanh(c) * grad_logistic(o)
+        if (l.peephole) {
+            copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vo.delta, 1);
+            s.input = l.cell_cpu;
+            //s.delta = l.dc_cpu;   // disabled. NOTE(review): s.delta is not reset here, so it keeps its last value (0 on the first iteration, the state.delta pointer of the previous iteration afterwards) - confirm this is intended
+            backward_convolutional_layer(vo, s);
+        }
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
+        s.input = l.prev_state_cpu;
+        //s.delta = l.dh_cpu;   // disabled; s.delta keeps its previous value (see the note in the vo branch above)
+        backward_convolutional_layer(wo, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer(uo, s);
+
+        // g
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);
+        // delta for g(w,u): temp = (delta * o * grad_tanh(tanh(c)) + dc) * i * grad_tanh(g)
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
+        s.input = l.prev_state_cpu;
+        //s.delta = l.dh_cpu;   // disabled; s.delta still equals state.delta from the uo call above - NOTE(review): confirm intended
+        backward_convolutional_layer(wg, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer(ug, s);
+
+        // i
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        // delta for i(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * g * grad_logistic(i)
+
+        if (l.peephole) {
+            copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vi.delta, 1);
+            s.input = l.prev_cell_cpu;
+            //s.delta = l.dc_cpu;   // disabled; s.delta still equals state.delta from the ug call above
+            backward_convolutional_layer(vi, s);
+        }
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
+        s.input = l.prev_state_cpu;
+        //s.delta = l.dh_cpu;   // disabled; s.delta still equals state.delta from the ug call above
+        backward_convolutional_layer(wi, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer(ui, s);
+
+        // f
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        // delta for f(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * c(t-1) * grad_logistic(f)
+
+        if (l.peephole) {
+            copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vf.delta, 1);
+            s.input = l.prev_cell_cpu;
+            //s.delta = l.dc_cpu;   // disabled; s.delta still equals state.delta from the ui call above
+            backward_convolutional_layer(vf, s);
+        }
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
+        s.input = l.prev_state_cpu;
+        //s.delta = l.dh_cpu;   // disabled; s.delta still equals state.delta from the ui call above
+        backward_convolutional_layer(wf, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer(uf, s);
+        // cell gradient handed to the next iteration (the earlier time step): dc = temp2 * f
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);
+        // move every buffer pointer and sub-layer copy back by one time step
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;
+        l.output -= l.outputs*l.batch;
+        l.cell_cpu -= l.outputs*l.batch;
+        l.delta -= l.outputs*l.batch;
+
+        if (l.peephole) {
+            increment_layer(&vf, -1);
+            increment_layer(&vi, -1);
+            increment_layer(&vo, -1);
+        }
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+
+#ifdef GPU
+void pull_conv_lstm_layer(layer l)
+{
+    // Calls pull_convolutional_layer() on every active sub-convolution, in the order V (peephole only), W, U.
+    layer *active[11];
+    int count = 0;
+    int k;
+    if (l.peephole) {
+        active[count++] = l.vf;  active[count++] = l.vi;  active[count++] = l.vo;
+    }
+    active[count++] = l.wf;      // wf is handled in both bottleneck and regular mode
+    if (!l.bottleneck) {
+        active[count++] = l.wi;  active[count++] = l.wg;  active[count++] = l.wo;
+    }
+    active[count++] = l.uf;  active[count++] = l.ui;
+    active[count++] = l.ug;  active[count++] = l.uo;
+
+    for (k = 0; k < count; ++k) pull_convolutional_layer(*active[k]);
+}
+
+void push_conv_lstm_layer(layer l)
+{
+    // Calls push_convolutional_layer() on every active sub-convolution, in the order V (peephole only), W, U.
+    layer *active[11];
+    int count = 0;
+    int k;
+    if (l.peephole) {
+        active[count++] = l.vf;  active[count++] = l.vi;  active[count++] = l.vo;
+    }
+    active[count++] = l.wf;      // wf is handled in both bottleneck and regular mode
+    if (!l.bottleneck) {
+        active[count++] = l.wi;  active[count++] = l.wg;  active[count++] = l.wo;
+    }
+    active[count++] = l.uf;  active[count++] = l.ui;
+    active[count++] = l.ug;  active[count++] = l.uo;
+
+    for (k = 0; k < count; ++k) push_convolutional_layer(*active[k]);
+}
+
+void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
+{
+    // Forwards the same update arguments to every active sub-convolution, in the order V (peephole only), W, U.
+    layer *active[11];
+    int count = 0;
+    int k;
+    if (l.peephole) {
+        active[count++] = l.vf;  active[count++] = l.vi;  active[count++] = l.vo;
+    }
+    active[count++] = l.wf;      // wf is handled in both bottleneck and regular mode
+    if (!l.bottleneck) {
+        active[count++] = l.wi;  active[count++] = l.wg;  active[count++] = l.wo;
+    }
+    active[count++] = l.uf;  active[count++] = l.ui;
+    active[count++] = l.ug;  active[count++] = l.uo;
+    for (k = 0; k < count; ++k) {
+        update_convolutional_layer_gpu(*active[k], batch, learning_rate, momentum, decay, loss_scale);
+    }
+}
+
+void forward_conv_lstm_layer_gpu(layer l, network_state state)
+{   // GPU forward pass: runs the l.steps time steps in order, updating l.c_gpu / l.h_gpu and writing one output slice per step.
+    network_state s = { 0 };   // state passed to the sub-layer forward calls; s.input is set per call below
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+    int i;
+    layer vf = *(l.vf);   // v*: local copies of the convolutions fed with the cell state (run only when l.peephole)
+    layer vi = *(l.vi);
+    layer vo = *(l.vo);
+    // w*: local copies of the convolutions fed with l.h_gpu (in bottleneck mode only wf runs, on [h | input])
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // u*: local copies of the convolutions fed with state.input (or with the wf output in bottleneck mode)
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Training only: fill the delta buffers of the sub-layers in use, and this layer's delta, with 0 for all steps.
+    if (state.train) {
+        if (l.peephole) {
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, vf.delta_gpu, 1);
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, vi.delta_gpu, 1);
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, vo.delta_gpu, 1);
+        }
+
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
+        if (!l.bottleneck) {
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
+            fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
+        }
+
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
+
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+    }
+
+    for (i = 0; i < l.steps; ++i)
+    {
+        if (l.peephole) {
+            assert(l.outputs == vf.out_w * vf.out_h * vf.out_c);
+            s.input = l.c_gpu;   // c has not been updated yet for this step, so vf / vi see the cell state left by the previous step
+            forward_convolutional_layer_gpu(vf, s);
+            forward_convolutional_layer_gpu(vi, s);
+            // vo is run further down, after c has been updated
+        }
+
+        if (l.bottleneck) {
+            // l.bottelneck_hi_gpu size is 2x: h and the step input are stored back to back and fed to wf together
+            simple_copy_ongpu(l.outputs*l.batch, l.h_gpu, l.bottelneck_hi_gpu);
+            simple_copy_ongpu(l.outputs*l.batch, state.input, l.bottelneck_hi_gpu + l.outputs*l.batch);
+            s.input = l.bottelneck_hi_gpu;
+            forward_convolutional_layer_gpu(wf, s); // 2x input channels
+            activate_array_ongpu(wf.output_gpu, l.outputs*l.batch, l.lstm_activation);
+            s.input = wf.output_gpu;   // the u* convolutions below consume the activated wf output instead of state.input
+        }
+        else {
+            assert(l.outputs == wf.out_w * wf.out_h * wf.out_c);
+            assert(wf.c == l.out_c && wi.c == l.out_c && wg.c == l.out_c && wo.c == l.out_c);
+
+            s.input = l.h_gpu;
+            forward_convolutional_layer_gpu(wf, s);
+            forward_convolutional_layer_gpu(wi, s);
+            forward_convolutional_layer_gpu(wg, s);
+            forward_convolutional_layer_gpu(wo, s);
+
+            s.input = state.input;
+        }
+
+        assert(l.inputs == uf.w * uf.h * uf.c);
+        assert(uf.c == l.c && ui.c == l.c && ug.c == l.c && uo.c == l.c);
+
+        forward_convolutional_layer_gpu(uf, s);
+        forward_convolutional_layer_gpu(ui, s);
+        forward_convolutional_layer_gpu(ug, s);
+        forward_convolutional_layer_gpu(uo, s);
+
+        // f = logistic(wf + uf + vf); NULL is passed for the w term in bottleneck mode and for the v term without peephole (presumably skipped by the helper)
+        add_3_arrays_activate((l.bottleneck)?NULL:wf.output_gpu, uf.output_gpu, (l.peephole)?vf.output_gpu:NULL, l.outputs*l.batch, LOGISTIC, l.f_gpu);
+        //copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1);
+        //activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
+
+        // i = logistic(wi + ui + vi)
+        add_3_arrays_activate((l.bottleneck)?NULL:wi.output_gpu, ui.output_gpu, (l.peephole) ? vi.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.i_gpu);
+        //copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1);
+        //activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
+
+        // g = lstm_activation(wg + ug)
+        add_3_arrays_activate((l.bottleneck)?NULL:wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, l.lstm_activation, l.g_gpu);
+        //copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+        //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+
+        // c = f*c + i*g
+        sum_of_mults(l.f_gpu, l.c_gpu, l.i_gpu, l.g_gpu, l.outputs*l.batch, l.c_gpu);   // decreases mAP???
+        //copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        //mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        //mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
+
+        // o = logistic(wo + uo + vo(c_new)): the peephole convolution vo reads the cell state updated just above
+        if (l.peephole) {
+            s.input = l.c_gpu;
+            forward_convolutional_layer_gpu(vo, s);
+        }
+        add_3_arrays_activate((l.bottleneck)?NULL:wo.output_gpu, uo.output_gpu, (l.peephole) ? vo.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.o_gpu);
+        //copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1);
+        //activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+
+        // h = o * lstm_activation(c)
+        activate_and_mult(l.c_gpu, l.o_gpu, l.outputs*l.batch, l.lstm_activation, l.h_gpu);
+        //simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.h_gpu);
+        //activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
+        //mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
+
+        fix_nan_and_inf(l.c_gpu, l.outputs*l.batch);    // should be fix_nan_and_inf()
+        fix_nan_and_inf(l.h_gpu, l.outputs*l.batch);    // should be fix_nan_and_inf()
+        if (l.state_constrain) constrain_ongpu(l.outputs*l.batch, l.state_constrain, l.c_gpu, 1);
+
+        if(state.train) simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.cell_gpu);   // per-step cell state, read back by backward_conv_lstm_layer_gpu
+        simple_copy_ongpu(l.outputs*l.batch, l.h_gpu, l.output_gpu); // is required for both Detection and Training
+
+        if (l.shortcut) {
+            // partial residual connection: only the first l.outputs*l.batch/2 values of the wf output are added, and only in bottleneck mode
+            if (l.bottleneck) axpy_ongpu(l.outputs*l.batch/2, 1, wf.output_gpu, 1, l.output_gpu, 1);
+            //else axpy_ongpu(l.outputs*l.batch, 1, l.f_gpu, 1, l.output_gpu, 1);
+        }
+        // advance the input, output and cell pointers and every sub-layer copy to the next time step
+        state.input += l.inputs*l.batch;
+        l.output_gpu    += l.outputs*l.batch;
+        l.cell_gpu      += l.outputs*l.batch;
+
+        if (l.peephole) {
+            increment_layer(&vf, 1);
+            increment_layer(&vi, 1);
+            increment_layer(&vo, 1);
+        }
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_conv_lstm_layer_gpu(layer l, network_state state)
+{   // GPU backward pass: visits the l.steps time steps from last to first, then saves the last step's output/cell for the next call.
+    float *last_output = l.output_gpu + l.outputs*l.batch*(l.steps - 1);   // taken before the pointers are moved; copied to l.last_prev_state_gpu at the end
+    float *last_cell = l.cell_gpu + l.outputs*l.batch*(l.steps - 1);       // copied to l.last_prev_cell_gpu at the end
+
+    network_state s = { 0 };   // state passed to the sub-layer backward calls; s.input / s.delta are set per call below
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    int i;
+    layer vf = *(l.vf);   // v*: local copies of the convolutions whose backward input is the cell state (used only when l.peephole)
+    layer vi = *(l.vi);
+    layer vo = *(l.vo);
+    // w*: local copies of the convolutions whose backward input is l.prev_state_gpu (only wf is used in bottleneck mode)
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // u*: local copies of the convolutions whose backward input is state.input (or the wf output in bottleneck mode)
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Step every copy to the last time step. increment_layer() is defined outside this view; presumably it offsets the per-step buffers.
+    if (l.peephole) {
+        increment_layer(&vf, l.steps - 1);
+        increment_layer(&vi, l.steps - 1);
+        increment_layer(&vo, l.steps - 1);
+    }
+
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    // Point the input, output, cell and delta buffers at the last time step too. state.delta may be NULL and is guarded wherever it is touched.
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
+
+    //fill_ongpu(l.outputs * l.batch, 0, l.dc_gpu, 1);   //  dont use
+    const int sequence = get_sequence_value(state.net);
+
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) simple_copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, l.prev_cell_gpu);   // c(t-1) from this call's stored cells
+        //else fill_ongpu(l.outputs * l.batch, 0, l.prev_cell_gpu, 1);   //  dont use
+        else if (state.net.current_subdivision % sequence != 0) simple_copy_ongpu(l.outputs*l.batch, l.last_prev_cell_gpu, l.prev_cell_gpu);   // first step: use the cell saved at the end of the previous call
+
+        simple_copy_ongpu(l.outputs*l.batch, l.cell_gpu, l.c_gpu);
+
+        if (i != 0) simple_copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, l.prev_state_gpu);   // h(t-1) from this call's stored outputs
+        //else fill_ongpu(l.outputs * l.batch, 0, l.prev_state_gpu, 1);   //  dont use
+        else if (state.net.current_subdivision % sequence != 0) simple_copy_ongpu(l.outputs*l.batch, l.last_prev_state_gpu, l.prev_state_gpu);   // first step: use the output saved at the end of the previous call
+
+        simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.h_gpu);
+
+        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;   // delta slot of step t-1 (NULL at the first step); only the bottleneck path below adds to it
+
+        // f = logistic(wf + uf + vf), rebuilt from the stored sub-layer outputs (w term NULL in bottleneck mode, v term NULL without peephole)
+        add_3_arrays_activate((l.bottleneck) ? NULL : wf.output_gpu, uf.output_gpu, (l.peephole) ? vf.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.f_gpu);
+        //copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1);
+        //activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
+
+        // i = logistic(wi + ui + vi)
+        add_3_arrays_activate((l.bottleneck) ? NULL : wi.output_gpu, ui.output_gpu, (l.peephole) ? vi.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.i_gpu);
+        //copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1);
+        //activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
+
+        // g = lstm_activation(wg + ug)
+        add_3_arrays_activate((l.bottleneck) ? NULL : wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, l.lstm_activation, l.g_gpu);   // TANH
+        //copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+        //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, l.lstm_activation);
+
+        // o = logistic(wo + uo + vo)
+        add_3_arrays_activate((l.bottleneck) ? NULL : wo.output_gpu, uo.output_gpu, (l.peephole) ? vo.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.o_gpu);
+        //copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1);
+        //activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+
+
+        simple_copy_ongpu(l.outputs*l.batch, l.delta_gpu, l.temp3_gpu);  // temp3 = delta
+
+        simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.temp_gpu);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation);  // temp  = tanh(c)
+
+        simple_copy_ongpu(l.outputs*l.batch, l.temp3_gpu, l.temp2_gpu);
+        mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);   // temp2 = delta * o
+
+        gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation, l.temp2_gpu); // temp2 = delta * o * grad_tanh(tanh(c))
+        // add the cell gradient dc written at the end of the previously processed (later) step; l.dc_gpu is not zeroed on entry (see the disabled fill_ongpu above)
+        axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);          // temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // temp  = tanh(c)
+        // temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // temp3 = delta
+
+        simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.temp_gpu);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation);    // temp  = tanh(c)
+
+        mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);  // temp  = delta * tanh(c)
+        gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);  // temp  = delta * tanh(c) * grad_logistic(o)
+        // delta for o(w,u,v):       temp  = delta * tanh(c) * grad_logistic(o)
+        // delta for c,f,i,g(w,u,v): temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // delta for output:         temp3 = delta
+
+        // o: temp is copied into the delta of each output-gate convolution, which is then back-propagated
+        // delta for O(w,u,v):     temp  = delta * tanh(c) * grad_logistic(o)
+        if (l.peephole) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, vo.delta_gpu);
+            s.input = l.cell_gpu;
+            //s.delta = l.dc_gpu;   // disabled. NOTE(review): s.delta is not reset here, so it keeps the value of its last assignment (0 on the first iteration) - confirm intended
+            backward_convolutional_layer_gpu(vo, s);
+        }
+
+        if (!l.bottleneck) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wo.delta_gpu);
+            s.input = l.prev_state_gpu;
+            s.delta = l.temp3_gpu;// s.delta = l.dh_gpu; (disabled: the w* input gradients are sent to temp3 instead of dh)
+            fill_ongpu(l.outputs * l.batch, 0, l.temp3_gpu, 1);   // temp3 (the delta copy) is not read again in this iteration; from here on it only serves as s.delta for wo, wg, wi and wf
+            backward_convolutional_layer_gpu(wo, s);
+        }
+
+        simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, uo.delta_gpu);
+        if (l.bottleneck) {
+            s.input = wf.output_gpu;   // bottleneck: u* were fed with the wf output, so their input gradient goes to wf.delta_gpu
+            s.delta = wf.delta_gpu;
+        }
+        else {
+            s.input = state.input;
+            s.delta = state.delta;
+        }
+        backward_convolutional_layer_gpu(uo, s);
+
+        // g
+        simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
+        mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, l.lstm_activation, l.temp_gpu);
+        // delta for c,f,i,g(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * i * grad_tanh(g)
+
+        if (!l.bottleneck) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wg.delta_gpu);
+            s.input = l.prev_state_gpu;
+            s.delta = l.temp3_gpu;// s.delta = l.dh_gpu;   // comment this
+            backward_convolutional_layer_gpu(wg, s);  // lead to nan
+        }
+
+        simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, ug.delta_gpu);
+        if (l.bottleneck) {
+            s.input = wf.output_gpu;
+            s.delta = wf.delta_gpu;
+        }
+        else {
+            s.input = state.input;
+            s.delta = state.delta;
+        }
+        backward_convolutional_layer_gpu(ug, s);
+
+        // i
+        simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
+        mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        // delta for c,f,i,g(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * g * grad_logistic(i)
+
+        if (l.peephole) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, vi.delta_gpu);
+            s.input = l.prev_cell_gpu;
+            //s.delta = l.dc_gpu;   // disabled; s.delta still holds the value set for the ug call above
+            backward_convolutional_layer_gpu(vi, s);
+        }
+
+        if (!l.bottleneck) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wi.delta_gpu);
+            s.input = l.prev_state_gpu;
+            s.delta = l.temp3_gpu;// s.delta = l.dh_gpu;   // comment this
+            backward_convolutional_layer_gpu(wi, s);  // lead to nan (after 1000 it)
+        }
+
+        simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, ui.delta_gpu);
+        if (l.bottleneck) {
+            s.input = wf.output_gpu;
+            s.delta = wf.delta_gpu;
+        }
+        else {
+            s.input = state.input;
+            s.delta = state.delta;
+        }
+        backward_convolutional_layer_gpu(ui, s);
+
+        // f
+        simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
+        mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        // delta for c,f,i,g(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * c(t-1) * grad_logistic(f)
+
+        if (l.peephole) {
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, vf.delta_gpu);
+            s.input = l.prev_cell_gpu;
+            //s.delta = l.dc_gpu;   // disabled; s.delta still holds the value set for the ui call above
+            backward_convolutional_layer_gpu(vf, s);
+        }
+
+        simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, uf.delta_gpu);
+        if (l.bottleneck) {
+            s.input = wf.output_gpu;
+            s.delta = wf.delta_gpu;
+        }
+        else {
+            s.input = state.input;
+            s.delta = state.delta;
+        }
+        backward_convolutional_layer_gpu(uf, s);
+
+        // wf: in bottleneck mode its delta was accumulated by the four u* calls above; otherwise it receives the forget-gate delta in temp
+        if (l.bottleneck) {
+            // l.bottelneck_hi_gpu size is 2x: rebuild the [h(t-1) | input] pair that wf saw in the forward pass
+            simple_copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, l.bottelneck_hi_gpu);
+            simple_copy_ongpu(l.outputs*l.batch, state.input, l.bottelneck_hi_gpu + l.outputs*l.batch);
+            fill_ongpu(l.outputs * l.batch * 2, 0, l.bottelneck_delta_gpu, 1);
+            s.input = l.bottelneck_hi_gpu;
+            s.delta = l.bottelneck_delta_gpu;
+            if (l.shortcut) axpy_ongpu(l.outputs*l.batch/2, 1, l.delta_gpu, 1, wf.delta_gpu, 1);    // partial residual connection
+            gradient_array_ongpu(wf.output_gpu, l.outputs*l.batch, l.lstm_activation, wf.delta_gpu);
+
+            reset_nan_and_inf(wf.delta_gpu, l.outputs*l.batch);
+            constrain_ongpu(l.outputs*l.batch, 1, wf.delta_gpu, 1);
+        }
+        else {
+            s.input = l.prev_state_gpu;
+            simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wf.delta_gpu);
+            s.delta = l.temp3_gpu;// s.delta = l.dh_gpu;
+        }
+
+        // WF
+        backward_convolutional_layer_gpu(wf, s);
+
+        if (l.bottleneck) {
+            reset_nan_and_inf(l.bottelneck_delta_gpu, l.outputs*l.batch*2);
+            //constrain_ongpu(l.outputs*l.batch*2, 1, l.bottelneck_delta_gpu, 1);
+            if (l.dh_gpu) axpy_ongpu(l.outputs*l.batch, l.time_normalizer, l.bottelneck_delta_gpu, 1, l.dh_gpu, 1);   // first half: gradient w.r.t. h(t-1)
+            if (state.delta) axpy_ongpu(l.outputs*l.batch, 1, l.bottelneck_delta_gpu + l.outputs*l.batch, 1, state.delta, 1);    // second half: gradient w.r.t. the input; skipped when state.delta is NULL (was unguarded). lead to nan
+        }
+        else {
+            // if (l.dh_gpu) axpy_ongpu(l.outputs*l.batch, l.time_normalizer, l.temp3_gpu, 1, l.dh_gpu, 1);
+        }
+
+        // c: cell gradient handed to the next iteration (the earlier time step), dc = temp2 * f
+        simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
+        mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
+        simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, l.dc_gpu);
+        reset_nan_and_inf(l.dc_gpu, l.outputs*l.batch);
+        if (i != 0) reset_nan_and_inf(l.dh_gpu, l.outputs*l.batch);
+        // delta for c,f,i,g(w,u,v): delta_c = temp = (delta * o * grad_tanh(tanh(c)) + dc) * f    // (grad_linear(c)==1)
+
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;   // new delta: state.delta = prev_layer.delta_gpu;
+        l.output_gpu -= l.outputs*l.batch;
+        l.cell_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+
+        if (l.peephole) {
+            increment_layer(&vf, -1);
+            increment_layer(&vi, -1);
+            increment_layer(&vo, -1);
+        }
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+    // keep the last step's output and cell so the next call can use them as h(t-1) / c(t-1) for its first step
+    simple_copy_ongpu(l.outputs*l.batch, last_output, l.last_prev_state_gpu);
+    simple_copy_ongpu(l.outputs*l.batch, last_cell, l.last_prev_cell_gpu);
+
+    // free state after each 100 iterations
+    //if (get_current_batch(state.net) % 100) free_state_conv_lstm(l);  // dont use
+}
+#endif
diff --git a/darknet-master/src/conv_lstm_layer.h b/darknet-master/src/conv_lstm_layer.h
new file mode 100644
index 0000000..fae59f1
--- /dev/null
+++ b/darknet-master/src/conv_lstm_layer.h
@@ -0,0 +1,40 @@
+#ifndef CONV_LSTM_LAYER_H
+#define CONV_LSTM_LAYER_H
+// Declarations for the convolutional LSTM layer and the history layer.
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+#define USET   // NOTE(review): USET is not referenced anywhere in this header - confirm where it is consumed
+// C linkage so the declarations below can be used from C++ translation units.
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor, int bottleneck, int train);
+void resize_conv_lstm_layer(layer *l, int w, int h);
+void free_state_conv_lstm(layer l);
+void randomize_state_conv_lstm(layer l);
+void remember_state_conv_lstm(layer l);
+void restore_state_conv_lstm(layer l);
+// Conv-LSTM entry points that are declared regardless of GPU.
+void forward_conv_lstm_layer(layer l, network_state state);
+void backward_conv_lstm_layer(layer l, network_state state);
+void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+// History layer: make_history_layer returns the layer; forward/backward take it by value.
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train);
+void forward_history_layer(layer l, network_state state);
+void backward_history_layer(layer l, network_state state);
+// The *_gpu variants are declared only when GPU is defined.
+#ifdef GPU
+void forward_conv_lstm_layer_gpu(layer l, network_state state);
+void backward_conv_lstm_layer_gpu(layer l, network_state state);
+void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+
+void forward_history_layer_gpu(const layer l, network_state state);
+void backward_history_layer_gpu(const layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // CONV_LSTM_LAYER_H
diff --git a/darknet-master/src/convolutional_kernels.cu b/darknet-master/src/convolutional_kernels.cu
new file mode 100644
index 0000000..debd6bf
--- /dev/null
+++ b/darknet-master/src/convolutional_kernels.cu
@@ -0,0 +1,1436 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "convolutional_layer.h"
+#include "batchnorm_layer.h"
+#include "gemm.h"
+#include "blas.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "box.h"
+
+
+// Sign-binarizes the first n elements of x: binary[i] is +1 when x[i] >= 0 and -1 otherwise.
+// One thread handles one element; the flat index is built from a 2D grid of 1D blocks.
+__global__ void binarize_kernel(float *x, int n, float *binary)
+{
+    const int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        if (x[idx] >= 0) binary[idx] = 1;
+        else binary[idx] = -1;
+    }
+}
+
+// Host launcher for binarize_kernel over n elements.
+// Uses cuda_gridsize(n) blocks of BLOCK threads on the stream returned by get_cuda_stream(),
+// then passes the result of cudaPeekAtLastError() to CHECK_CUDA.
+void binarize_gpu(float *x, int n, float *binary)
+{
+    binarize_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, binary);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// For each position s in [0, size): averages |input[i*size + s]| over i in [0, n),
+// then writes +mean where the input is > 0 and -mean elsewhere (zero maps to -mean).
+// One thread handles one position s.
+__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
+{
+    const int pos = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (pos >= size) return;
+
+    float mean = 0;
+    for (int ch = 0; ch < n; ++ch) {
+        mean += fabs(input[ch*size + pos]);
+    }
+    mean = mean / n;
+
+    for (int ch = 0; ch < n; ++ch) {
+        const int at = ch*size + pos;
+        if (input[at] > 0) binary[at] = mean;
+        else binary[at] = -mean;
+    }
+}
+
+// Host launcher for binarize_input_kernel: one thread per position (cuda_gridsize(size) blocks of BLOCK threads)
+// on the stream returned by get_cuda_stream(); the launch status is passed to CHECK_CUDA.
+void binarize_input_gpu(float *input, int n, int size, float *binary)
+{
+    binarize_input_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>>(input, n, size, binary);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// For each filter f in [0, n): averages |weights[f*size + i]| over its `size` weights,
+// then writes +mean where the weight is > 0 and -mean elsewhere (zero maps to -mean).
+// One thread handles one whole filter.
+__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
+{
+    const int filter = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (filter >= n) return;
+
+    const int base = filter*size;
+    float mean = 0;
+    for (int w = 0; w < size; ++w) {
+        mean += fabs(weights[base + w]);
+    }
+    mean = mean / size;
+
+    for (int w = 0; w < size; ++w) {
+        if (weights[base + w] > 0) binary[base + w] = mean;
+        else binary[base + w] = -mean;
+    }
+}
+
+// Host launcher for binarize_weights_kernel: one thread per filter (cuda_gridsize(n) blocks of BLOCK threads)
+// on the stream returned by get_cuda_stream(); the launch status is passed to CHECK_CUDA.
+void binarize_weights_gpu(float *weights, int n, int size, float *binary)
+{
+    binarize_weights_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(weights, n, size, binary);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Clears the first `size` floats of src; one thread per element in a 1D grid.
+__global__ void set_zero_kernel(float *src, int size)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+    src[idx] = 0;
+}
+
+// Sums `val` across the lanes of a warp with XOR shuffles, halving the lane mask from
+// WARP_SIZE / 2 down to 1. The *_sync variant (full 0xffffffff mask) is used from CUDA 9.0 on.
+__inline__ __device__
+float warpAllReduceSum(float val) {
+    int lane_mask = WARP_SIZE / 2;
+    while (lane_mask > 0) {
+#if CUDART_VERSION >= 9000
+        val += __shfl_xor_sync(0xffffffff, val, lane_mask);
+#else
+        val += __shfl_xor(val, lane_mask);
+#endif
+        lane_mask /= 2;
+    }
+    return val;
+}
+
+// Accumulates mean(|weights|) per filter into mean_arr_gpu[f]; the array must be zeroed beforehand
+// because results are added with atomicAdd.
+// Only valid if (size % 32 == 0): each group of 32 consecutive elements then belongs to a single filter.
+__global__ void reduce_kernel(float *weights, int n, int size, float *mean_arr_gpu)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int filter = idx / size;
+    if (filter >= n) return;
+
+    const float warp_mean = warpAllReduceSum(fabs(weights[idx]));
+    // Only the first element of each group of 32 publishes the group's contribution.
+    if (idx % 32 != 0) return;
+    atomicAdd(&mean_arr_gpu[filter], warp_mean / size);
+}
+
+// Element-wise binarization using a precomputed per-filter mean:
+// binary[i] = +mean_arr_gpu[f] when weights[i] > 0, else -mean_arr_gpu[f], with f = i / size.
+__global__ void binarize_weights_mean_kernel(float *weights, int n, int size, float *binary, float *mean_arr_gpu)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int filter = idx / size;
+    if (filter >= n) return;
+
+    const float filter_mean = mean_arr_gpu[filter];
+    if (weights[idx] > 0) binary[idx] = filter_mean;
+    else binary[idx] = -filter_mean;
+}
+
+// Binarizes n filters of `size` weights each into `binary`.
+// When size is a multiple of 32, runs three kernels on the current stream:
+// zero mean_arr_gpu, accumulate per-filter mean(|w|), then write +/-mean per element.
+// Otherwise it falls back to binarize_weights_gpu (mean_arr_gpu is left untouched in that case).
+void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, float *mean_arr_gpu)
+{
+    if (size % 32 != 0) {
+        binarize_weights_gpu(weights, n, size, binary);
+        return;
+    }
+
+    const size_t total = n * size;
+    const int blocks = get_number_of_blocks(total, BLOCK);
+
+    set_zero_kernel <<<(n/BLOCK + 1), BLOCK, 0, get_cuda_stream() >>> (mean_arr_gpu, n);
+    reduce_kernel <<<blocks, BLOCK, 0, get_cuda_stream() >>> (weights, n, size, mean_arr_gpu);
+    binarize_weights_mean_kernel <<<blocks, BLOCK, 0, get_cuda_stream() >>> (weights, n, size, binary, mean_arr_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// Converts `size` floats from input_f32 into half precision in output_f16, one element per thread.
+// The index is computed in size_t: the element count is a size_t, and a 32-bit int index
+// would wrap for buffers with more than INT_MAX elements, silently leaving the tail unconverted.
+__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
+{
+    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
+    // Note: __float2half_rn (and the _ru/_rd/_rz variants) can't be compiled on Linux without casting.
+}
+
+// Host launcher for cuda_f32_to_f16 over `size` elements on the stream returned by get_cuda_stream().
+// output_f16 is declared float* but is reinterpreted as half* for the kernel,
+// so it only needs to hold `size` half values.
+void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) {
+    cuda_f32_to_f16 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Converts `size` half values from input_f16 into floats in output_f32, one element per thread.
+// The index is computed in size_t: the element count is a size_t, and a 32-bit int index
+// would wrap for buffers with more than INT_MAX elements, silently leaving the tail unconverted.
+__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32)
+{
+    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
+}
+
+// Host launcher for cuda_f16_to_f32 over `size` elements on the stream returned by get_cuda_stream().
+// input_f16 is declared float* but is reinterpreted as half* for the kernel.
+void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) {
+    cuda_f16_to_f32 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Allocates a device buffer of n half values and, when src is non-NULL, fills it with the
+// half-precision conversion of the n floats at src. With src == NULL the buffer is left uninitialised.
+// Returns the device pointer; calls error() if the allocation yielded a NULL pointer.
+half *cuda_make_f16_from_f32_array(float *src, size_t n)
+{
+    half *dst16 = NULL;   // defined value even if cudaMalloc does not write the pointer
+    const size_t size = sizeof(half)*n;
+    CHECK_CUDA(cudaMalloc((void **)&dst16, size));
+    // Validate the allocation before the buffer is handed to the conversion kernel.
+    if (!dst16) error("Cuda malloc failed", DARKNET_LOC);
+    if (src) {
+        assert(n > 0);
+        cuda_convert_f32_to_f16(src, n, (float *)dst16);
+    }
+    return dst16;
+}
+
+// Applies the activation selected by l->activation to l->output_gpu in place.
+// When leaky_fused is non-zero, LEAKY is skipped: the binary GEMM call receives
+// `l.activation == LEAKY` as a flag, so that path must not apply LEAKY a second time.
+static void activate_conv_output_gpu(const convolutional_layer *l, int leaky_fused)
+{
+    const int total = l->outputs*l->batch;
+    const int spatial = l->out_w*l->out_h;
+
+    if (l->activation == SWISH) activate_array_swish_ongpu(l->output_gpu, total, l->activation_input_gpu, l->output_gpu);
+    else if (l->activation == MISH) activate_array_mish_ongpu(l->output_gpu, total, l->activation_input_gpu, l->output_gpu);
+    else if (l->activation == HARD_MISH) activate_array_hard_mish_ongpu(l->output_gpu, total, l->activation_input_gpu, l->output_gpu);
+    else if (l->activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l->output_gpu, total, l->batch, l->out_c, spatial, l->output_gpu);
+    else if (l->activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l->output_gpu, total, l->batch, l->out_c, spatial, l->output_gpu, 0);
+    else if (l->activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l->output_gpu, total, l->batch, l->out_c, spatial, l->output_gpu, 1);
+    else if (l->activation == LEAKY && leaky_fused) return;
+    else if (l->activation != LINEAR) activate_array_ongpu(l->output_gpu, total, l->activation);
+}
+
+// GPU forward pass of a convolutional layer. Reads state.input and writes l.output_gpu.
+//
+// Paths, in order:
+//   1. XNOR bit-packed inference (l.xnor, aligned bit weights present, not training, l.c >= 32,
+//      equal strides): bit-packed im2col + binary GEMM, then activation, then early return.
+//   2. cuDNN (when built with CUDNN): FP16 convolution when the mixed-precision gate holds,
+//      otherwise FP32 convolution; followed by batch-norm or bias.
+//   3. Without cuDNN: im2col + GEMM per batch item and group; followed by batch-norm or bias.
+// Paths 2 and 3 then apply the activation and the optional NaN fix, assisted excitation,
+// antialiasing and coordconv steps.
+//
+// `l` and `state` are passed by value, so the local modifications below (state.train, state.input,
+// swap_binary(&l)) do not affect the caller's copies.
+void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+{
+    if (l.train == 0) state.train = 0;
+
+    if (l.stream >= 0) {
+        switch_stream(l.stream);
+    }
+
+    if (l.wait_stream_id >= 0) {
+        wait_stream(l.wait_stream_id);
+    }
+
+    if(l.binary){
+        binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
+        swap_binary(&l);
+    }
+
+    if(l.xnor){
+        // Re-binarize the weights while training, or when no pre-packed bit weights exist.
+        if (!l.align_bit_weights_gpu || state.train) {
+            fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
+        }
+
+        if (l.align_bit_weights_gpu && !state.train && l.c >= 32 && l.stride_x == l.stride_y)
+        {
+            int m = l.n / l.groups;
+            int k = l.size*l.size*l.c / l.groups;
+            int n = l.out_w*l.out_h;
+
+            // Leading dimension of the bit matrices: k rounded up to the next multiple of lda_align
+            // (a full extra block is added when k is already aligned).
+            int ldb_align = l.lda_align;
+            size_t new_ldb = k + (ldb_align - k%ldb_align);
+
+            if (l.c % 32 == 0)
+            {
+                // Channel count divisible by 32: pack 32 channels into one uint32_t first,
+                // then run im2col on the packed data (new_c packed channels).
+                const int new_c = l.c / 32;
+
+                repack_input_gpu_bin(state.input, (uint32_t *)l.align_workspace_gpu, l.w, l.h, l.c);
+
+                im2col_ongpu(l.align_workspace_gpu, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+
+                int new_k = l.size*l.size*l.c / 32;
+
+                transpose_uint32_gpu((uint32_t *)state.workspace, (uint32_t *)l.transposed_align_workspace_gpu, new_k, n, n, new_ldb);
+
+                gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
+                    (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
+                    new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                    l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
+            }
+            else
+            {
+                // Generic XNOR path: float im2col, then pack to bits, then transpose.
+                // The input offset is fixed at 0, i.e. only the first image of the batch is read here.
+                im2col_align_ongpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
+
+                float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
+
+                transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
+
+                gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
+                    (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
+                    new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                    l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
+            }
+
+            // Biases are passed to the GEMM above, so no separate add_bias here.
+            activate_conv_output_gpu(&l, 1);
+            return;
+        }
+    }
+
+    if (l.xnor) {
+        // XNOR layer outside the bit-packed path: use binarized weights and a sign-binarized input.
+        swap_binary(&l);
+        binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
+        state.input = l.binary_input_gpu;
+    }
+
+#ifdef CUDNN
+    float alpha = 1, beta = 0;  // alpha[0], beta[0] are float for both HALF and FLOAT
+
+    int iteration_num = get_current_iteration(state.net);
+    // Mixed-precision gate: never for layer index 0 or XNOR layers; during training only after
+    // 3 * burn_in iterations and when loss scaling is enabled; channels and filters multiples of 8.
+    if (state.index != 0 && state.net.cudnn_half && !l.xnor &&
+        (!state.train || ((iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1)) &&
+        (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size > 1)
+    {
+        // Note: For improved performance it is advised to use beta[0] = 0.0.
+        // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
+        // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
+        // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
+        // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
+
+        const size_t input16_size = l.batch*l.c*l.w*l.h;
+        const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w;
+
+        // The FP16 staging buffers are shared through state.net and only ever grow.
+        if (*state.net.max_input16_size < input16_size) {
+            *state.net.max_input16_size = input16_size;
+            if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
+            assert(*state.net.max_input16_size > 0);
+            *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
+        }
+        float *input16 = *state.net.input16_gpu;
+
+        if (*state.net.max_output16_size < output16_size) {
+            *state.net.max_output16_size = output16_size;
+            if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
+            assert(*state.net.max_output16_size > 0);
+            *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
+        }
+        float *output16 = *state.net.output16_gpu;
+
+        assert(input16_size > 0);
+        cuda_convert_f32_to_f16(state.input, input16_size, input16);
+
+        CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
+            &alpha,
+            l.srcTensorDesc16,
+            input16,
+            l.weightDesc16,
+            l.weights_gpu16,
+            l.convDesc,
+            l.fw_algo16,
+            state.workspace,
+            l.workspace_size,
+            &beta,
+            l.dstTensorDesc16,
+            output16));
+
+
+        if (l.batch_normalize)
+        {
+            if (state.train && !state.net.adversarial) // Training
+            {
+                // output16 holds half values, so copying count/2 floats moves the whole buffer.
+                simple_copy_ongpu(l.outputs*l.batch / 2, output16, l.x_gpu);
+                float one = 1.0f;
+                float zero = 0.0f;
+                // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
+                // compared to FP32, it's just that the statistics and value adjustment should be done in FP32.
+                CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle(),
+                    CUDNN_BATCHNORM_SPATIAL,
+                    &one,
+                    &zero,
+                    l.normDstTensorDescF16,
+                    l.x_gpu,            // input
+                    l.normDstTensorDescF16,
+                    output16,            // output
+                    l.normTensorDesc,
+                    l.scales_gpu,       // input
+                    l.biases_gpu,       // input
+                    .01,
+                    l.rolling_mean_gpu,        // input/output (should be FP32)
+                    l.rolling_variance_gpu,    // input/output (should be FP32)
+                    .00001,
+                    l.mean_gpu,            // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()
+                    l.variance_gpu));    // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()
+
+                cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
+            }
+            else // Detection
+            {
+                cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
+                normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+                scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+                add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+            }
+        }
+        else // BIAS only
+        {
+            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
+            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+        }
+    }
+    else {
+        // FP32 cuDNN convolution.
+        CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
+            &alpha,
+            l.srcTensorDesc,
+            state.input,
+            l.weightDesc,
+            l.weights_gpu,
+            l.convDesc,
+            l.fw_algo,
+            state.workspace,
+            l.workspace_size,
+            &beta,
+            l.dstTensorDesc,
+            l.output_gpu));
+
+        if (l.batch_normalize) {
+            forward_batchnorm_layer_gpu(l, state);
+        }
+        else {
+            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+        }
+    }
+
+
+#else
+    // No cuDNN: the GEMM below accumulates into the output, so clear it first.
+    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+
+    int i, j;
+    int m = l.n / l.groups;
+    int k = l.size*l.size*l.c / l.groups;
+    int n = l.out_w*l.out_h;
+    for(i = 0; i < l.batch; ++i){
+        for (j = 0; j < l.groups; ++j) {
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+            float *a = l.weights_gpu + j*l.nweights / l.groups;
+            float *b = state.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+            if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
+                // 1x1, stride 1, no dilation: the image already has the im2col layout.
+                b = im;
+            }
+            else {
+                im2col_gpu_ext(im,          // input
+                    l.c / l.groups,         // input channels
+                    l.h, l.w,               // input size (h, w)
+                    l.size, l.size,         // kernel size (h, w)
+                    l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
+                    l.stride_y, l.stride_x,     // stride (h, w)
+                    l.dilation, l.dilation, // dilation (h, w)
+                    state.workspace);       // output
+
+            }
+            gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
+        }
+    }
+
+    if (l.batch_normalize) {
+        forward_batchnorm_layer_gpu(l, state);
+    }
+    else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+    }
+#endif
+
+    activate_conv_output_gpu(&l, 0);
+    if(l.binary || l.xnor) swap_binary(&l);
+
+    if (state.net.try_fix_nan) {
+        fix_nan_and_inf(l.output_gpu, l.outputs*l.batch);
+    }
+
+    if(l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state);
+
+    if (l.antialiasing) {
+        // Run the attached input_layer on this layer's output, keep the pre-filter output in
+        // input_antialiasing_gpu, and replace this layer's output by the input_layer result.
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+        s.input = l.output_gpu;
+        forward_convolutional_layer_gpu(*(l.input_layer), s);
+        simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.input_antialiasing_gpu);
+        simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.input_layer->output_gpu, l.output_gpu);
+    }
+
+    if (l.coordconv) {
+        coord_conv_gpu(l.output_gpu, l.outputs*l.batch, l.out_w, l.out_h, l.out_c, l.batch, 0);
+    }
+}
+
+void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+{
+    if (l.coordconv) {
+        coord_conv_gpu(l.delta_gpu, l.outputs*l.batch, l.out_w, l.out_h, l.out_c, l.batch, 1);
+    }
+
+    if (l.antialiasing) {
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        s.delta = l.delta_gpu;  // s.delta will be returned to l.delta_gpu
+        s.input = l.input_antialiasing_gpu;
+        //if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+        simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu);
+        backward_convolutional_layer_gpu(*(l.input_layer), s);
+
+        simple_copy_ongpu(l.outputs*l.batch, l.input_antialiasing_gpu, l.output_gpu);
+    }
+
+    if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+
+    if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
+    else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
+    else if (l.activation == HARD_MISH) gradient_array_hard_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
+    else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
+    else if (l.activation == NORM_CHAN) gradient_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
+    else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+
+    if (!l.batch_normalize)
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+
+//#ifndef CUDNN_HALF
+    //if(l.batch_normalize){
+    //    backward_batchnorm_layer_gpu(l, state);
+    //} else {
+    //    //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+    //}
+//#endif // no CUDNN_HALF
+    float *original_input = state.input;
+
+    if(l.xnor) state.input = l.binary_input_gpu;
+#ifdef CUDNN
+    float one = 1.f;
+    float alpha = 1, beta = 0;
+
+//#ifdef CUDNN_HALF
+    int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) &&
+        (l.c / l.groups) % 8 == 0 && l.n % 8 == 0  && l.groups <= 1 && l.size > 1)
+    {
+        const size_t input16_size = l.batch*l.c*l.w*l.h;
+        const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
+
+        if (*state.net.max_input16_size < input16_size) {
+            *state.net.max_input16_size = input16_size;
+            if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
+            assert(*state.net.max_input16_size > 0);
+            *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
+        }
+        float *input16 = *state.net.input16_gpu;
+
+        if (*state.net.max_output16_size < delta16_size) {
+            *state.net.max_output16_size = delta16_size;
+            if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
+            assert(*state.net.max_output16_size > 0);
+            *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
+        }
+        float *delta16 = *state.net.output16_gpu;
+
+        assert(input16_size > 0);
+        assert(delta16_size > 0);
+        cuda_convert_f32_to_f16(state.input, input16_size, input16);
+        cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);
+
+        if (l.batch_normalize) {
+            //if (!state.train) {
+            //    l.mean_gpu = l.rolling_mean_gpu;
+            //    l.variance_gpu = l.rolling_variance_gpu;
+            //}
+            float one = 1.0f;
+            float zero = 0.0f;
+            CHECK_CUDNN(cudnnBatchNormalizationBackward(cudnn_handle(),
+                CUDNN_BATCHNORM_SPATIAL,
+                &one,
+                &zero,
+                &one,
+                &one,
+                l.normDstTensorDescF16,
+                l.x_gpu,                // input (input in BN-forward-inference)
+                l.normDstTensorDescF16,
+                delta16,                // input
+                l.normDstTensorDescF16,
+                l.output_gpu, //l.x_norm_gpu,            // output (new delta)
+                l.normTensorDesc,
+                l.scales_gpu,            // input (should be FP32)
+                l.scale_updates_gpu,    // output (should be FP32)
+                l.bias_updates_gpu,        // output (should be FP32)
+                .00001,
+                l.mean_gpu,                // input (should be FP32)
+                l.variance_gpu));        // input (should be FP32)
+
+            simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16);
+            //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
+            //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream());
+        }
+        else
+        {
+            //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+        }
+
+        // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
+        // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)
+
+        // calculate conv weight updates
+        // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
+        //   so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
+        assert((l.nweights) > 0);
+        cuda_convert_f32_to_f16(l.weight_updates_gpu, l.nweights, l.weight_updates_gpu16);
+
+        if (!state.net.adversarial && !l.train_only_bn) {
+            CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
+                &one,
+                l.srcTensorDesc16,
+                input16, //state.input,
+                l.ddstTensorDesc16,
+                delta16, //l.delta_gpu,
+                l.convDesc,
+                l.bf_algo16,
+                state.workspace,
+                l.workspace_size,
+                &one,
+                l.dweightDesc16,
+                l.weight_updates_gpu16));    // l.weight_updates_gpu);
+
+            cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.nweights, l.weight_updates_gpu);
+        }
+
+        if (state.delta) {
+            if (l.binary || l.xnor) swap_binary(&l);
+
+            // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
+            // calculate delta for the next layer
+            // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
+            // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)
+            CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
+                &alpha,
+                l.weightDesc16,
+                l.weights_gpu16, //l.weights_gpu,
+                l.ddstTensorDesc16,
+                delta16, //l.delta_gpu,
+                l.convDesc,
+                l.bd_algo16,
+                state.workspace,
+                l.workspace_size,
+                &beta,
+                l.dsrcTensorDesc16,
+                input16));    // state.delta);
+
+            cuda_convert_f16_to_f32(input16, input16_size, state.delta);
+
+            if (l.binary || l.xnor) swap_binary(&l);
+            if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
+        }
+    }
+    else {
+        //#else    // CUDNN_HALF
+
+        if(l.batch_normalize){
+            backward_batchnorm_layer_gpu(l, state);
+        }
+
+        if (!state.net.adversarial && !l.train_only_bn) {
+
+            float *old_input = state.input;
+
+            /*
+            if (l.reverse) {
+                if (*state.net.max_output16_size < l.inputs*l.batch) {
+                    *state.net.max_output16_size = l.inputs*l.batch;
+                    if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
+                    assert(*state.net.max_output16_size > 0);
+                    *state.net.output16_gpu = cuda_make_array(NULL, *state.net.max_output16_size);
+                }
+                float clip = 0.0;
+                float divider = 1.0;
+                float abs_add = 1.0;
+                mult_inverse_array_gpu(state.input, *state.net.output16_gpu, l.inputs*l.batch, l.reverse, divider, clip, abs_add);
+                state.input = *state.net.output16_gpu;
+            }
+            */
+
+            // calculate conv weight updates
+            // if used: beta=1 then loss decreases faster
+            CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
+                &one,
+                l.srcTensorDesc,
+                state.input,
+                l.ddstTensorDesc,
+                l.delta_gpu,
+                l.convDesc,
+                l.bf_algo,
+                state.workspace,
+                l.workspace_size,
+                &one,
+                l.dweightDesc,
+                l.weight_updates_gpu));
+
+            state.input = old_input;
+        }
+
+
+        if (state.delta) {
+            if (l.binary || l.xnor) swap_binary(&l);
+
+            float *old_weights = l.weights_gpu;
+
+            /*
+            if (l.reverse) {
+                if (*state.net.max_output16_size < l.nweights) {
+                    *state.net.max_output16_size = l.nweights;
+                    if (*state.net.output16_gpu && *state.net.max_output16_size > 0) cuda_free(*state.net.output16_gpu);
+                    assert(*state.net.max_output16_size > 0);
+                    *state.net.output16_gpu = cuda_make_array(NULL, l.nweights);
+                }
+                float clip = 0.0;
+                float divider = 1.0;
+                float abs_add = 1.0;
+                mult_inverse_array_gpu(l.weights_gpu, *state.net.output16_gpu, l.nweights, l.reverse, divider, clip, abs_add);
+                l.weights_gpu = *state.net.output16_gpu;
+            }
+            */
+
+            // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
+            // calculate delta for the next layer
+            CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
+                &one,
+                l.weightDesc,
+                l.weights_gpu,
+                l.ddstTensorDesc,
+                l.delta_gpu,
+                l.convDesc,
+                l.bd_algo,
+                state.workspace,
+                l.workspace_size,
+                &one,
+                l.dsrcTensorDesc,
+                state.delta));
+
+            l.weights_gpu = old_weights;
+
+            if (l.binary || l.xnor) swap_binary(&l);
+            if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
+        }
+    }
+
+//#endif    // CUDNN_HALF
+
+#else    // CUDNN
+    if (l.batch_normalize) {
+        backward_batchnorm_layer_gpu(l, state);
+    }
+
+    int m = l.n / l.groups;
+    int n = l.size*l.size*l.c / l.groups;
+    int k = l.out_w*l.out_h;
+
+    int i, j;
+    for(i = 0; i < l.batch; ++i){
+        for (j = 0; j < l.groups; ++j) {
+            float * a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float * b = state.workspace;
+            float * c = l.weight_updates_gpu + j*l.nweights / l.groups;
+
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+            if (!state.net.adversarial && !l.train_only_bn) {
+                //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+                im2col_gpu_ext(im,          // input
+                    l.c / l.groups,         // input channels
+                    l.h, l.w,               // input size (h, w)
+                    l.size, l.size,         // kernel size (h, w)
+                    l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
+                    l.stride_y, l.stride_x,     // stride (h, w)
+                    l.dilation, l.dilation, // dilation (h, w)
+                    state.workspace);       // output
+                //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+                gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
+            }
+
+            if (state.delta) {
+                if (l.binary || l.xnor) swap_binary(&l);
+                float * a = l.weights_gpu + j*l.nweights / l.groups;
+                float * b = l.delta_gpu + (i*l.groups + j)*m*k;
+                float * c = state.workspace;
+
+                //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+                gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
+
+
+                float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+                //col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta);
+                col2im_gpu_ext(
+                    state.workspace,        // input
+                    l.c / l.groups,         // input channels
+                    l.h, l.w,               // input size (h, w)
+                    l.size, l.size,         // kernel size (h, w)
+                    l.pad * l.dilation, l.pad * l.dilation,   // padding size (h, w)
+                    l.stride_y, l.stride_x,     // stride size (h, w)
+                    l.dilation, l.dilation, // dilation size (h, w)
+                    delta);                 // output (delta)
+
+                if (l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
+                if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
+            }
+        }
+    }
+#endif
+    if (state.net.try_fix_nan) {
+        if (state.delta) {
+            reset_nan_and_inf(state.delta, l.inputs * l.batch);
+        }
+        int size = l.nweights;
+        reset_nan_and_inf(l.weight_updates_gpu, size);
+        fix_nan_and_inf(l.weights_gpu, size);
+    }
+
+
+}
+
+// Channel mean per (batch, position): each thread owns one element dst[idx] with
+// idx in [0, size*batches) and averages the `channels` values of src found at
+// src[pos + size*(ch + channels*batch)].
+__global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int channels, int batches)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size*batches) return;    // surplus threads of the last block
+
+    const int pos = idx % size;
+    const int batch = idx / size;
+
+    // Accumulate in a local and store once; the float arithmetic is unchanged.
+    float sum = 0;
+    for (int ch = 0; ch < channels; ++ch) {
+        sum += src[pos + size*(ch + channels*batch)];
+    }
+    dst[idx] = sum / channels;
+}
+
+// Host launcher for calc_avg_activation_kernel: one thread per element of dst
+// (size*batches of them), BLOCK threads per block, on the stream from get_cuda_stream().
+void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches)
+{
+    const int total_threads = size*batches;
+    const int grid = get_number_of_blocks(total_threads, BLOCK);
+
+    calc_avg_activation_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (src, dst, size, channels, batches);
+}
+
+
+// Adds alpha * gt_gpu[idx] * a_avg_gpu[idx] to every channel of `output` at the
+// (batch, position) pair owned by this thread. gt_gpu and a_avg_gpu are indexed
+// per (batch, position); output is indexed as pos + size*(ch + channels*batch).
+__global__ void assisted_activation_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int pos = idx % size;
+    const int batch = idx / size;
+
+    if (batch >= batches) return;   // surplus threads of the last block
+
+    for (int ch = 0; ch < channels; ++ch) {
+        output[pos + size*(ch + channels*batch)] += alpha * gt_gpu[idx] * a_avg_gpu[idx];
+    }
+}
+
+// Host launcher for assisted_activation_kernel: one thread per (batch, position)
+// pair, BLOCK threads per block, on the stream from get_cuda_stream().
+void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
+{
+    const int total_threads = size*batches;
+    const int grid = get_number_of_blocks(total_threads, BLOCK);
+
+    assisted_activation_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches);
+}
+
+
+// Variant of the assisted activation: wherever gt_gpu[idx] equals 0, every channel
+// of `output` at that (batch, position) is multiplied by (1 - alpha). Positions
+// with a non-zero gt value are left untouched. a_avg_gpu is accepted but not read.
+__global__ void assisted_activation2_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int pos = idx % size;
+    const int batch = idx / size;
+    const float keep = 1 - alpha;
+
+    if (batch >= batches) return;   // surplus threads of the last block
+
+    for (int ch = 0; ch < channels; ++ch) {
+        if (gt_gpu[idx] == 0) {
+            output[pos + size*(ch + channels*batch)] *= keep;
+        }
+    }
+}
+
+// Host launcher for assisted_activation2_kernel: one thread per (batch, position)
+// pair, BLOCK threads per block, on the stream from get_cuda_stream().
+void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
+{
+    const int total_threads = size*batches;
+    const int grid = get_number_of_blocks(total_threads, BLOCK);
+
+    assisted_activation2_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches);
+}
+
+// Assisted-excitation forward step on the GPU.
+//
+// Builds a per-pixel ground-truth mask (1 inside any truth box, 0 elsewhere) on the
+// host from state.truth, uploads it to l.gt_gpu, computes the per-pixel channel mean
+// of l.output_gpu into l.a_avg_gpu, and finally adds
+//     alpha * mask(b, y, x) * channel_mean(b, y, x)
+// to every channel of l.output_gpu. alpha follows a cosine schedule of the current
+// iteration; outside the active iteration window the function returns without
+// touching anything.
+//
+//   l     - convolutional layer (passed by value; the max_boxes/truths fields set
+//           below only affect this local copy)
+//   state - network state; reads state.net (iteration, max_batches, burn_in,
+//           num_boxes) and state.truth (device pointer to the truth boxes)
+void assisted_excitation_forward_gpu(convolutional_layer l, network_state state)
+{
+    const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
+
+    // Cosine schedule over the whole run: 1 at iteration 0, 0 at max_batches.
+    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2;
+
+    if (l.assisted_excitation == 1) {
+        // Mode 1: only active during the first half of the training run.
+        if (iteration_num > state.net.max_batches / 2) return;
+    }
+    else {
+        // Otherwise l.assisted_excitation is the last active iteration, and the
+        // excitation starts once burn_in iterations have passed.
+        if (iteration_num < state.net.burn_in) return;
+        if (iteration_num > l.assisted_excitation) return;
+        alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0
+    }
+
+    //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
+    //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);
+
+    // Host-side ground-truth mask, one float per (batch, y, x); zero-initialized.
+    float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float));
+
+    int b;
+    int w, h;
+
+    l.max_boxes = state.net.num_boxes;
+    l.truths = l.max_boxes*(4 + 1);
+
+    // Bring the truth boxes (5 floats per box) back to the host.
+    int num_truth = l.batch*l.truths;
+    float *truth_cpu = (float *)calloc(num_truth, sizeof(float));
+    cuda_pull_array(state.truth, truth_cpu, num_truth);
+    //cudaStreamSynchronize(get_cuda_stream());
+    //CHECK_CUDA(cudaPeekAtLastError());
+
+    for (b = 0; b < l.batch; ++b)
+    {
+        // calculate G
+        int t;
+        for (t = 0; t < state.net.num_boxes; ++t) {
+            box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1);
+            if (!truth.x) break;  // a zero x marks the end of the box list for this image
+            float beta = 0;
+            //float beta = 1 - alpha; // from 0 to 1
+            float dw = (1 - truth.w) * beta;
+            float dh = (1 - truth.h) * beta;
+            //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w);
+
+            // Box extent in feature-map cells (truth coordinates are multiplied by the map size).
+            int left = floorf((truth.x - (dw + truth.w) / 2) * l.out_w);
+            int right = ceilf((truth.x + (dw + truth.w) / 2) * l.out_w);
+            int top = floorf((truth.y - (dh + truth.h) / 2) * l.out_h);
+            int bottom = ceilf((truth.y + (dh + truth.h) / 2) * l.out_h);
+            if (left < 0) left = 0;
+            if (top < 0) top = 0;
+            // The column loop below is inclusive of `right`, so the last valid column is
+            // out_w - 1. Clamping to out_w (as before) let w == out_w spill into column 0
+            // of the next row and, on the last row of the last image, one float past the
+            // end of `gt`.
+            if (right > l.out_w - 1) right = l.out_w - 1;
+            if (bottom > l.out_h) bottom = l.out_h;
+
+            // NOTE(review): columns are inclusive of `right` while rows are exclusive of
+            // `bottom`; the asymmetry is kept as in the original code.
+            for (w = left; w <= right; w++) {
+                for (h = top; h < bottom; h++) {
+                    gt[w + l.out_w * h + l.out_w*l.out_h*b] = 1;
+                }
+            }
+        }
+    }
+
+    cuda_push_array(l.gt_gpu, gt, l.out_w * l.out_h * l.batch);
+    //cudaStreamSynchronize(get_cuda_stream());
+    //CHECK_CUDA(cudaPeekAtLastError());
+
+    // calc avg_output on GPU - for whole batch
+    calc_avg_activation_gpu(l.output_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
+    //cudaStreamSynchronize(get_cuda_stream());
+    //CHECK_CUDA(cudaPeekAtLastError());
+
+    // calc new output: output(c, y, x) += alpha * gt(y, x) * a_avg(y, x) for every channel c
+    //assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);  // AE3: gt increases (beta = 1 - alpha = 0)
+    //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
+    assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
+    //cudaStreamSynchronize(get_cuda_stream());
+    //CHECK_CUDA(cudaPeekAtLastError());
+
+    if (0)   // visualize ground truth (debug only, compiled out of the normal flow)
+    {
+#ifdef OPENCV
+        cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch);
+        cudaStreamSynchronize(get_cuda_stream());
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        for (b = 0; b < l.batch; ++b)
+        {
+            printf(" Assisted Excitation alpha = %f \n", alpha);
+            image img = float_to_image(l.out_w, l.out_h, 1, &gt[l.out_w*l.out_h*b]);
+            char buff[100];
+            sprintf(buff, "a_excitation_gt_%d", b);
+            show_image_cv(img, buff);
+
+            //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            char buff2[100];
+            sprintf(buff2, "a_excitation_output_%d", b);
+            show_image_cv(img2, buff2);
+
+            /*
+            int c = l.out_c;
+            if (c > 4) c = 4;
+            image img3 = float_to_image(l.out_w, l.out_h, c, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            image dc = collapse_image_layers(img3, 1);
+            char buff3[100];
+            sprintf(buff3, "a_excitation_act_collapsed_%d", b);
+            show_image_cv(dc, buff3);
+            */
+
+            wait_key_cv(5);
+        }
+        wait_until_press_key_cv();
+#endif // OPENCV
+    }
+
+    free(truth_cpu);
+    free(gt);
+}
+
+// Copies the layer's GPU-side parameters back into their host buffers: weights and
+// biases always; weight/bias updates when their GPU buffers exist; scales and rolling
+// mean/variance for batch-normalized layers; the m/v buffers when l.adam is set.
+// All transfers go through cuda_pull_array_async and the function returns only after
+// the stream from get_cuda_stream() has been synchronized.
+void pull_convolutional_layer(convolutional_layer l)
+{
+    cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+    cuda_pull_array_async(l.biases_gpu, l.biases, l.n);
+    if (l.weight_updates_gpu) cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    if (l.bias_updates_gpu) cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array_async(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
+    if (l.adam){
+        cuda_pull_array_async(l.m_gpu, l.m, l.nweights);
+        cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
+    }
+    CHECK_CUDA(cudaPeekAtLastError());
+    // A failure of the queued copies is reported by the synchronization call itself,
+    // so its status must be checked too; otherwise the host buffers could be consumed
+    // although the transfer never completed.
+    CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+}
+
+// Uploads the layer's host-side parameters to their GPU buffers. With CUDNN_HALF the
+// freshly uploaded weights are additionally converted into l.weights_gpu16. Update
+// buffers are uploaded only when l.train is set, batch-norm statistics only for
+// batch-normalized layers, and the m/v buffers only when l.adam is set.
+void push_convolutional_layer(convolutional_layer l)
+{
+    // Weights first: the half-precision copy below is derived from weights_gpu.
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+#ifdef CUDNN_HALF
+    assert(l.nweights > 0);
+    cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
+#endif
+
+    // Per-filter parameters.
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+
+    // Pending gradient accumulators.
+    if (l.train) {
+        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    }
+
+    // Batch-norm scale and running statistics.
+    if (l.batch_normalize) {
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
+
+    // Adam buffers.
+    if (l.adam) {
+        cuda_push_array(l.m_gpu, l.m, l.nweights);
+        cuda_push_array(l.v_gpu, l.v, l.nweights);
+    }
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
+{
+
+        /*
+        Applies one optimizer step to the GPU copies of this layer's weights, biases and
+        (when l.scales_gpu is set) scales, consuming the accumulated *_updates_gpu buffers.
+
+        Parameters:
+          l                  - layer, passed by value: the pointer swap done in the
+                               l.reverse branch below stays local to this call.
+          batch              - divides the learning rate and multiplies the weight-decay term.
+          learning_rate_init - base learning rate; multiplied by l.learning_rate_scale and
+                               divided by loss_scale before use.
+          momentum           - factor applied to the update buffers after the step
+                               (only used on the non-Adam path).
+          decay              - weight-decay coefficient; on the Adam path it is forwarded
+                               unchanged to every adam_update_gpu call.
+          loss_scale         - mixed-precision loss scale; divides the learning rate and
+                               multiplies the decay term on the non-Adam path.
+
+        Order of operations: optional deformation of the weight updates (l.deform),
+        NaN/Inf handling, optional gradient centralization, Adam or momentum step,
+        optional deformation of the weights (l.deform), optional clipping (l.clip).
+
+        Disabled debug snippet (visualizes the deformed weights for every angle):
+
+        for (int angle = 0; angle < 360; angle++) {
+            printf(" angle = %d \n", angle);
+            smooth_rotate_weights_kernel(l.weights_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, angle, 0);
+
+            cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights);
+            visualize_convolutional_layer(l, "weights", NULL);
+            wait_key_cv(10);
+        }
+        */
+
+    if (l.deform) {   // pre-step: transform weight_updates_gpu into weight_deform_gpu (last argument 1)
+
+        //for (l.angle = 0; l.angle < 360; l.angle += 1)
+        //{
+            //stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle/180, 1);
+            //else simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu);
+
+            if (l.rotate) rotate_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 1);
+            else if (l.sway) sway_and_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);
+            else if (l.stretch) stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 0, 1);
+            else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);
+
+            //simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu);
+
+            reduce_and_expand_array_gpu(l.weight_deform_gpu, l.weight_updates_gpu, l.nweights, 4);   // result goes back into weight_updates_gpu
+
+            //printf(" angle = %f \n", l.angle);
+            //cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights);
+            //visualize_convolutional_layer(l, "weights", NULL);
+            //wait_key_cv(10);
+        //}
+
+    }
+
+    // Loss scale for Mixed-Precision on Tensor-Cores
+    float learning_rate = learning_rate_init*l.learning_rate_scale / loss_scale;
+    //float momentum = a.momentum;
+    //float decay = a.decay;
+    //int batch = a.batch;
+
+
+    reset_nan_and_inf(l.weight_updates_gpu, l.nweights);   // presumably clears NaN/Inf entries of the updates - confirm in the helper
+    fix_nan_and_inf(l.weights_gpu, l.nweights);            // presumably repairs NaN/Inf entries of the weights - confirm in the helper
+
+    // Gradient Centralization
+    if (l.grad_centr && l.batch_normalize) {
+        // weights[filters][channels][height][width]
+        // for(filters) w[f] = w[f] - mean(w[c][h][w])
+        gradient_centralization_gpu(l.size, l.size, l.c / l.groups, l.n, l.weight_updates_gpu);
+    }
+
+
+    if (l.adam) {   // Adam path: weights, biases and (if present) scales, all with the same hyper-parameters
+        //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t);
+
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
+        if (l.scales_gpu) {
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
+        }
+    }
+    else {   // momentum path
+        //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
+
+        float *old_weight_updates_gpu = l.weight_updates_gpu;   // remembered so the l.reverse swap can be undone below
+
+
+        if (l.reverse) {   // the step then reads its updates from l.output_gpu instead of weight_updates_gpu
+            float clip = 0.0;
+            float divider = 1.0;
+            float abs_add = 1.0;
+            mult_inverse_array_gpu(l.weight_updates_gpu, l.output_gpu, l.inputs*l.batch, l.reverse, divider, clip, abs_add);   // NOTE(review): element count is l.inputs*l.batch, not l.nweights - confirm this is intended
+            l.weight_updates_gpu = l.output_gpu;
+        }
+
+
+        axpy_ongpu(l.nweights, -decay*batch*loss_scale, l.weights_gpu, 1, l.weight_updates_gpu, 1);   // wu = wu - w*decay*batch*loss_scale
+        axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);     // w = w + wu*lr/batch
+
+        l.weight_updates_gpu = old_weight_updates_gpu;   // momentum scaling below always acts on the real update buffer
+
+        scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);   // wu = wu*momentum
+
+        axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);   // biases: same step, no decay term
+        scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
+
+        if (l.scales_gpu) {
+            axpy_ongpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);   // scales: same step, no decay term
+            scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
+        }
+    }
+
+    if (l.deform) {   // post-step: expand weights into weight_deform_gpu, then transform back into weights_gpu (last argument 0)
+        //for (l.angle = 0; l.angle < 360; l.angle += 4)
+        //{
+            expand_array_gpu(l.weights_gpu, l.weight_deform_gpu, l.nweights, 4);
+
+            //simple_copy_ongpu(l.nweights, l.weight_deform_gpu, l.weights_gpu);
+
+            if (l.rotate) rotate_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0);
+            else if (l.sway) sway_and_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);
+            else if (l.stretch) stretch_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0, 0);
+            else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);
+
+            //printf(" angle = %f, reverse = %d \n", l.angle, 0);
+            //cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
+            //visualize_convolutional_layer(l, "weights", NULL);
+            //wait_key_cv(10);
+        //}
+    }
+
+    if (l.clip) {   // optional constraint on the weights, with l.clip passed as the bound
+        constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1);
+    }
+}
+
+
+
+/*
+void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay)
+{
+    int size = layer.size*layer.size*layer.c*layer.n;
+    axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
+    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
+
+    if(layer.scales_gpu){
+        axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1);
+        scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1);
+    }
+
+    if(layer.adam){
+        scal_ongpu(size, layer.B1, layer.m_gpu, 1);
+        scal_ongpu(size, layer.B2, layer.v_gpu, 1);
+
+        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+
+        axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1);
+        mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1);
+        axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1);
+
+        adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
+        fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
+    }else{
+        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);  // wu = wu - w*decay*batch
+        axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); // w = w + wu*lr/batch
+        scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);    // wu = wu*momentum // wu = (wu - w*decay*batch)*momentum
+        // w = w + (wu - w*decay*batch)*lr/batch = w + wu*lr/batch - w*decay*lr = w*(1-decay*lr) + wu*lr/batch
+        //wu_prev = (wu_old - w_old*decay*batch)*momentum
+
+
+        //weights_update = weights_update_new + (weights_update_old - weights_old*decay*batch)*momentum - weights_new*decay*batch =
+        // = weights_update_new + weights_update_old*momentum - weights_old*decay*batch*momentum - weights_new*decay*batch
+        // = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch
+
+        //------------- RESULT --------------
+        // weights_update = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch
+        //-----------------------------------
+
+        // weights_newest = weights_new + (weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch)*lr/batch
+        // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*batch*lr/batch - weights_new*decay*batch*lr/batch
+        // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*lr - weights_new*decay*lr
+        // = weights_new*(1 - decay*lr) - weights_old*momentum*decay*lr + (weights_update_new + weights_update_old*momentum)*lr/batch
+
+        //------------- RESULT --------------
+        // weights_newest = weights_new*(1 - decay*lr) - weights_old*momentum*(decay*lr) + (weights_update_new + weights_update_old*momentum)*lr/batch =
+        // = weights_new - (weights_new + weights_old*momentum)*decay*lr + (weights_update_new + weights_update_old*momentum)*lr / batch
+        //-----------------------------------
+    }
+}
+*/
diff --git a/darknet-master/src/convolutional_layer.c b/darknet-master/src/convolutional_layer.c
new file mode 100644
index 0000000..18f9e8b
--- /dev/null
+++ b/darknet-master/src/convolutional_layer.c
@@ -0,0 +1,1683 @@
+#include "convolutional_layer.h"
+#include "utils.h"
+#include "batchnorm_layer.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "blas.h"
+#include "gemm.h"
+#include "box.h"
+#include <stdio.h>
+#include <time.h>
+
+#ifdef AI2
+#include "xnor_layer.h"
+#endif
+
+#ifdef __cplusplus
+#define PUT_IN_REGISTER
+#else
+#define PUT_IN_REGISTER register
+#endif
+
+#ifndef AI2
+#define AI2 0
+void forward_xnor_layer(layer l, network_state state);
+#endif
+
+// Exchanges the layer's `weights` pointer with its `binary_weights` pointer
+// (and the matching *_gpu pointers when built with GPU). Only the pointers
+// move; no weight values are copied, so a second call undoes the first.
+void swap_binary(convolutional_layer *l)
+{
+    float *const host_weights = l->weights;
+    l->weights = l->binary_weights;
+    l->binary_weights = host_weights;
+
+#ifdef GPU
+    {
+        float *const gpu_weights = l->weights_gpu;
+        l->weights_gpu = l->binary_weights_gpu;
+        l->binary_weights_gpu = gpu_weights;
+    }
+#endif
+}
+
+// Binarizes `n` filters of `size` weights each.
+//   weights - input, n*size floats, filter f occupies [f*size, (f+1)*size)
+//   binary  - output, same layout as `weights`
+// For every filter the mean absolute weight is computed; each output element
+// is +mean when the source weight is strictly positive and -mean otherwise
+// (so a weight of exactly 0 maps to -mean).
+void binarize_weights(float *weights, int n, int size, float *binary)
+{
+    int filter;
+    for (filter = 0; filter < n; ++filter) {
+        float *src = weights + filter*size;
+        float *dst = binary + filter*size;
+        float mean = 0;
+        int k;
+
+        // First pass: accumulate |w| over the filter.
+        for (k = 0; k < size; ++k) {
+            mean += fabs(src[k]);
+        }
+        mean = mean / size;
+
+        // Second pass: keep only the sign, scaled by the filter's mean magnitude.
+        for (k = 0; k < size; ++k) {
+            if (src[k] > 0) {
+                dst[k] = mean;
+            } else {
+                dst[k] = -mean;
+            }
+        }
+    }
+}
+
+// Writes the sign of each of the `n` values in `input` into `binary`:
+// +1 for strictly positive values, -1 for everything else (including 0).
+void binarize_cpu(float *input, int n, float *binary)
+{
+    int remaining = n;
+    float *src = input;
+    float *dst = binary;
+    while (remaining > 0) {
+        if (*src > 0) *dst = 1;
+        else          *dst = -1;
+        ++src;
+        ++dst;
+        --remaining;
+    }
+}
+
+// Binarizes `input`, addressed as input[i*size + s] with i in [0, n) and
+// s in [0, size). For each position s the mean absolute value over the n
+// entries is computed; each output entry is +mean when the source value is
+// strictly positive and -mean otherwise. `binary` uses the same layout.
+void binarize_input(float *input, int n, int size, float *binary)
+{
+    int pos;
+    for (pos = 0; pos < size; ++pos) {
+        float mean = 0;
+        int row;
+
+        // Mean magnitude across the n entries that share this position.
+        for (row = 0; row < n; ++row) {
+            mean += fabs(input[row*size + pos]);
+        }
+        mean = mean / n;
+
+        for (row = 0; row < n; ++row) {
+            const int idx = row*size + pos;
+            if (input[idx] > 0) {
+                binary[idx] = mean;
+            } else {
+                binary[idx] = -mean;
+            }
+        }
+    }
+}
+
+// Output height of the convolution: ((h + 2*pad - size) / stride_y) + 1,
+// using integer division. Dilation is not part of this formula.
+int convolutional_out_height(convolutional_layer l)
+{
+    const int padded_h = l.h + 2*l.pad;
+    const int span = padded_h - l.size;
+    return span / l.stride_y + 1;
+}
+
+// Output width of the convolution: ((w + 2*pad - size) / stride_x) + 1,
+// using integer division. Dilation is not part of this formula.
+int convolutional_out_width(convolutional_layer l)
+{
+    const int padded_w = l.w + 2*l.pad;
+    const int span = padded_w - l.size;
+    return span / l.stride_x + 1;
+}
+
+// Wraps the layer's `output` buffer in an image whose dimensions are the
+// computed output width/height and whose channel count is the filter count.
+image get_convolutional_image(convolutional_layer l)
+{
+    const int out_height = convolutional_out_height(l);
+    const int out_width = convolutional_out_width(l);
+    const int channels = l.n;
+    return float_to_image(out_width, out_height, channels, l.output);
+}
+
+// Wraps the layer's `delta` buffer in an image whose dimensions are the
+// computed output width/height and whose channel count is the filter count.
+image get_convolutional_delta(convolutional_layer l)
+{
+    const int out_height = convolutional_out_height(l);
+    const int out_width = convolutional_out_width(l);
+    const int channels = l.n;
+    return float_to_image(out_width, out_height, channels, l.delta);
+}
+
+// Returns the scratch-workspace size in bytes needed by this layer for the
+// FP32 path.
+//  - Built with CUDNN and gpu_index >= 0: the largest of the cuDNN forward,
+//    backward-filter and backward-data workspace sizes for the layer's
+//    selected algorithms (the two backward sizes only count when l.train is set).
+//  - XNOR layers: the larger of the bit-aligned im2col buffer and the
+//    re-packed input buffer.
+//  - Otherwise: the im2col buffer, out_h*out_w*size*size*(c/groups) floats.
+size_t get_workspace_size32(layer l){
+#ifdef CUDNN
+    if(gpu_index >= 0){
+        size_t most = 0;
+        size_t s = 0;
+        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.weightDesc,
+                l.convDesc,
+                l.dstTensorDesc,
+                l.fw_algo,
+                &s));
+        if (s > most) most = s;
+        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dweightDesc,
+                l.bf_algo,
+                &s));
+        // Backward workspaces are only relevant when the layer is trained.
+        if (s > most && l.train) most = s;
+        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
+                l.weightDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dsrcTensorDesc,
+                l.bd_algo,
+                &s));
+        if (s > most && l.train) most = s;
+        return most;
+    }
+    #endif
+    if (l.xnor) {
+        // Widen to size_t before multiplying: the int product c*w*h could
+        // overflow for large inputs before being converted. This matches the
+        // other size computations in this function.
+        size_t re_packed_input_size = (size_t)l.c * l.w * l.h * sizeof(float);
+        size_t workspace_size = (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
+        if (workspace_size < re_packed_input_size) workspace_size = re_packed_input_size;
+        return workspace_size;
+    }
+    return (size_t)l.out_h*l.out_w*l.size*l.size*(l.c / l.groups)*sizeof(float);
+}
+
+// FP16 counterpart of get_workspace_size32(): when built with both CUDNN and
+// CUDNN_HALF and gpu_index >= 0, returns the largest cuDNN workspace size (in
+// bytes) over the forward, backward-filter and backward-data passes using the
+// half-precision descriptors and *_algo16 algorithms. The backward sizes are
+// only taken into account when l.train is set. In every other configuration
+// the result is 0.
+size_t get_workspace_size16(layer l) {
+#if defined(CUDNN) && defined(CUDNN_HALF)
+    if (gpu_index >= 0) {
+        size_t largest = 0;
+        size_t needed = 0;
+
+        // Forward pass.
+        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
+            l.srcTensorDesc16, l.weightDesc16, l.convDesc, l.dstTensorDesc16,
+            l.fw_algo16, &needed));
+        if (largest < needed) largest = needed;
+
+        // Backward pass w.r.t. the filters.
+        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
+            l.srcTensorDesc16, l.ddstTensorDesc16, l.convDesc, l.dweightDesc16,
+            l.bf_algo16, &needed));
+        if (l.train && largest < needed) largest = needed;
+
+        // Backward pass w.r.t. the input data.
+        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
+            l.weightDesc16, l.ddstTensorDesc16, l.convDesc, l.dsrcTensorDesc16,
+            l.bd_algo16, &needed));
+        if (l.train && largest < needed) largest = needed;
+
+        return largest;
+    }
+#endif
+    // No FP16 cuDNN path available: no extra workspace is requested here.
+    return 0;
+}
+
+// Workspace size in bytes for the layer: the larger of the FP32 and FP16
+// requirements.
+size_t get_convolutional_workspace_size(layer l) {
+    const size_t bytes32 = get_workspace_size32(l);
+    const size_t bytes16 = get_workspace_size16(l);
+    return (bytes16 > bytes32) ? bytes16 : bytes32;
+}
+#ifdef GPU
+#ifdef CUDNN
+// Creates (but does not configure) every cuDNN descriptor the layer owns:
+// the FP32 and FP16 tensor descriptors, the FP32 and FP16 filter descriptors,
+// and the convolution descriptor. Each creation is checked with CHECK_CUDNN.
+void create_convolutional_cudnn_tensors(layer *l)
+{
+    cudnnTensorDescriptor_t *const tensor_descs[] = {
+        // batch-norm descriptors
+        &l->normTensorDesc, &l->normDstTensorDesc, &l->normDstTensorDescF16,
+        // FP32 forward / backward activations
+        &l->srcTensorDesc, &l->dstTensorDesc,
+        &l->dsrcTensorDesc, &l->ddstTensorDesc,
+        // FP16 forward / backward activations
+        &l->srcTensorDesc16, &l->dstTensorDesc16,
+        &l->dsrcTensorDesc16, &l->ddstTensorDesc16
+    };
+    cudnnFilterDescriptor_t *const filter_descs[] = {
+        &l->weightDesc, &l->dweightDesc,
+        &l->weightDesc16, &l->dweightDesc16
+    };
+    const size_t tensor_count = sizeof(tensor_descs) / sizeof(tensor_descs[0]);
+    const size_t filter_count = sizeof(filter_descs) / sizeof(filter_descs[0]);
+    size_t k;
+
+    for (k = 0; k < tensor_count; ++k) {
+        CHECK_CUDNN(cudnnCreateTensorDescriptor(tensor_descs[k]));
+    }
+    for (k = 0; k < filter_count; ++k) {
+        CHECK_CUDNN(cudnnCreateFilterDescriptor(filter_descs[k]));
+    }
+
+    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&l->convDesc));
+}
+
+// Configures the layer's cuDNN descriptors and selects its convolution algorithms.
+//
+//   l                      - layer whose descriptors were already created; its
+//                            groups / stride_x / stride_y fields are raised to 1
+//                            if smaller (cuDNN >= 7 builds only).
+//   cudnn_preference       - compared against cudnn_fastest, cudnn_smallest and
+//                            cudnn_specify to steer algorithm selection.
+//   workspace_size_specify - upper bound in bytes on an algorithm's workspace
+//                            when the preference is not cudnn_fastest.
+//
+// Steps, in order:
+//   1. Set group count and tensor-op math on the convolution descriptor (cuDNN >= 7).
+//   2. Describe the FP32 and FP16 input/output/filter tensors (NCHW) and the
+//      batch-norm tensors.
+//   3. Describe the convolution itself (padding = pad*dilation, strides, dilation).
+//   4. Pick forward / backward-data / backward-filter algorithms for FP32
+//      into l->fw_algo, l->bd_algo, l->bf_algo.
+//   5. Hard-code the FP16 algorithms into l->fw_algo16, l->bd_algo16, l->bf_algo16.
+void cudnn_convolutional_setup(layer *l, int cudnn_preference, size_t workspace_size_specify)
+{
+
+// CUDNN_HALF
+    // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
+    //   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+    // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
+
+    // The FP32 descriptors below always use float; the *16 descriptors use CUDNN_DATA_HALF explicitly.
+    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+
+#if(CUDNN_MAJOR >= 7)
+    // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+    // For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
+    // otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
+    // Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+    // 1. Accumulation into FP32
+    // 2. Loss Scaling - required only for: activation gradients. We do not use.
+    // 3. FP32 Master Copy of Weights
+    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
+    // Sanitize fields that are used as divisors / strides below.
+    if (l->groups < 1) l->groups = 1;
+    if (l->stride_x < 1) l->stride_x = 1;
+    if (l->stride_y < 1) l->stride_y = 1;
+    CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups));
+    CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
+#if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)   // cuDNN >= 7.2
+    //CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); // reduces the speed of regular and group convolution
+#endif
+#else   //if(CUDNN_MAJOR >= 7)
+    // Grouped convolution cannot be expressed with this cuDNN version.
+    if (l->groups > 1) {
+        error("CUDNN < 7 doesn't support groups, please upgrade!", DARKNET_LOC);
+    }
+#endif
+
+    // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
+    //   on architectures with DP4A support (compute capability 6.1 and later).
+    //cudnnDataType_t data_type = CUDNN_DATA_INT8;
+
+    // backward delta
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
+    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));
+
+    // forward
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
+    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));
+
+    // backward delta (FP16 descriptors)
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
+    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));
+
+    // forward (FP16 descriptors)
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
+    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));
+
+    // batch norm (FP16 output descriptor)
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
+
+    // batch norm (FP32): a 1 x out_c x 1 x 1 descriptor plus the full output descriptor
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
+
+    //printf("\n l->dilation = %d, l->pad = %d, l->size = %d, l->stride = %d, l->stride_x = %d, l->stride_y = %d, l->groups = %d, l->w = %d, l->h = %d, l->c = %d, l->n = %d, l->out_w = %d, l->out_h = %d, l->out_c = %d, l->batch = %d, data_type = %d \n",
+    //    l->dilation, l->pad, l->size, l->stride, l->stride_x, l->stride_y, l->groups, l->w, l->h, l->c, l->n, l->out_w, l->out_h, l->out_c, l->batch, data_type);
+    // Convolution geometry: padding is scaled by the dilation; cross-correlation mode (no kernel flip).
+#if(CUDNN_MAJOR >= 6)
+    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));    // cudnn >= 6.0
+#else
+    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION));    // cudnn 5.1
+#endif
+
+
+#if CUDNN_MAJOR >= 8
+    // cuDNN >= 8: ask cuDNN for a list of candidate algorithms per pass and pick,
+    // among the usable ones, the candidate with the smallest reported time.
+
+    // "Smallest" preference is expressed as a zero-byte workspace budget.
+    if (cudnn_preference == cudnn_smallest)
+    {
+        workspace_size_specify = 0;
+    }
+
+    size_t free_memory, total_memory;
+    int requested_algo_count = 0, returned_algo_count = 0;
+    int found_conv_algorithm = 0;
+    float min_time = 1000000;   // 1000 sec
+
+    // FWD
+    // NOTE(review): the result arrays hold 100 entries while the request count
+    // comes from cuDNN's *MaxCount query - presumably always far below 100; confirm.
+    cudnnConvolutionFwdAlgoPerf_t conv_fwd_results[100];
+    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle(), &requested_algo_count));
+
+    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle(),
+        l->srcTensorDesc,
+        l->weightDesc,
+        l->convDesc,
+        l->dstTensorDesc,
+        requested_algo_count, // (cudnnConvolutionFwdPreference_t)forward_algo,
+        &returned_algo_count, // workspace_size_specify,
+        conv_fwd_results));
+
+    CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
+    found_conv_algorithm = 0;
+    min_time = 1000000;   // 1000 sec
+    // Accept a candidate only if it succeeded, is not WINOGRAD_NONFUSED, fits in
+    // free device memory, respects the workspace budget (unless the preference
+    // is cudnn_fastest) and is faster than the best one seen so far.
+    for (int i = 0; i < returned_algo_count; i++)
+    {
+        if (conv_fwd_results[i].status == CUDNN_STATUS_SUCCESS &&
+            conv_fwd_results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+            conv_fwd_results[i].memory < free_memory &&
+            (conv_fwd_results[i].memory <= workspace_size_specify || cudnn_preference == cudnn_fastest) &&
+            conv_fwd_results[i].time < min_time)
+        {
+            found_conv_algorithm = 1;
+            l->fw_algo = conv_fwd_results[i].algo;
+            min_time = conv_fwd_results[i].time;
+            //printf(" - cuDNN FWD algo: %d, time = %f ms \n", l->fw_algo, min_time);
+        }
+    }
+
+    if (!found_conv_algorithm) {
+        error("Error: cuDNN hasn't found FWD algo for convolution", DARKNET_LOC);
+    }
+    //printf(" cuDNN FWD algo: %d, time = %f ms \n", l->fw_algo, min_time);
+
+    // Bwd-Data
+    cudnnConvolutionBwdDataAlgoPerf_t conv_bwd_data_results[100];
+    CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle(), &requested_algo_count));
+
+    CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle(),
+        l->weightDesc,
+        l->ddstTensorDesc,
+        l->convDesc,
+        l->dsrcTensorDesc,
+        requested_algo_count, // (cudnnConvolutionFwdPreference_t)forward_algo,
+        &returned_algo_count, // workspace_size_specify,
+        &conv_bwd_data_results[0]));
+
+    // Re-read free memory before filtering the backward-data candidates.
+    CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
+    found_conv_algorithm = 0;
+    min_time = 1000000;   // 1000 sec
+    // Same filter as the forward pass, except no algorithm is excluded by name.
+    for (int i = 0; i < returned_algo_count; i++)
+    {
+        if (conv_bwd_data_results[i].status == CUDNN_STATUS_SUCCESS &&
+            conv_bwd_data_results[i].memory < free_memory &&
+            (conv_bwd_data_results[i].memory <= workspace_size_specify || cudnn_preference == cudnn_fastest) &&
+            conv_bwd_data_results[i].time < min_time)
+        {
+            found_conv_algorithm = 1;
+            l->bd_algo = conv_bwd_data_results[i].algo;
+            min_time = conv_bwd_data_results[i].time;
+        }
+    }
+
+    if (!found_conv_algorithm) {
+        error("Error: cuDNN hasn't found BWD-data algo for convolution", DARKNET_LOC);
+    }
+    //printf(" cuDNN BWD-data algo: %d \n", l->bd_algo);
+
+    // Bwd-Filters
+    cudnnConvolutionBwdFilterAlgoPerf_t conv_bwd_filter_results[100];
+    CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle(), &requested_algo_count));
+
+    CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn_handle(),
+        l->srcTensorDesc,
+        l->ddstTensorDesc,
+        l->convDesc,
+        l->dweightDesc,
+        requested_algo_count, // (cudnnConvolutionFwdPreference_t)forward_algo,
+        &returned_algo_count, // workspace_size_specify,
+        &conv_bwd_filter_results[0]));
+
+    // Re-read free memory before filtering the backward-filter candidates.
+    CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
+
+    found_conv_algorithm = 0;
+    min_time = 1000000;   // 1000 sec
+    // Same filter as the backward-data pass.
+    for (int i = 0; i < returned_algo_count; i++)
+    {
+        if (conv_bwd_filter_results[i].status == CUDNN_STATUS_SUCCESS &&
+            conv_bwd_filter_results[i].memory < free_memory &&
+            (conv_bwd_filter_results[i].memory <= workspace_size_specify || cudnn_preference == cudnn_fastest) &&
+            conv_bwd_filter_results[i].time < min_time)
+        {
+            found_conv_algorithm = 1;
+            l->bf_algo = conv_bwd_filter_results[i].algo;
+            min_time = conv_bwd_filter_results[i].time;
+        }
+    }
+
+    if (!found_conv_algorithm) {
+        error("Error: cuDNN hasn't found BWD-filter algo for convolution", DARKNET_LOC);
+    }
+    //printf(" cuDNN BWD-filter algo: %d \n", l->bf_algo);
+
+#else   // CUDNN_MAJOR >= 8
+    // cuDNN < 8: translate the preference into cuDNN's own preference enums
+    // and let cuDNN return a single algorithm per pass.
+
+    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+    if (cudnn_preference == cudnn_smallest)
+    {
+        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+        printf(" CUDNN-slow ");
+    }
+    if (cudnn_preference == cudnn_specify)
+    {
+        forward_algo = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+        //printf(" CUDNN-specified %zu ", workspace_size_specify);
+    }
+
+    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
+            l->srcTensorDesc,
+            l->weightDesc,
+            l->convDesc,
+            l->dstTensorDesc,
+            (cudnnConvolutionFwdPreference_t)forward_algo,
+            workspace_size_specify,
+            &l->fw_algo));
+
+    CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
+        l->weightDesc,
+        l->ddstTensorDesc,
+        l->convDesc,
+        l->dsrcTensorDesc,
+        (cudnnConvolutionBwdDataPreference_t)backward_algo,
+        workspace_size_specify,
+        &l->bd_algo));
+
+    CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
+        l->srcTensorDesc,
+        l->ddstTensorDesc,
+        l->convDesc,
+        l->dweightDesc,
+        (cudnnConvolutionBwdFilterPreference_t)backward_filter,
+        workspace_size_specify,
+        &l->bf_algo));
+#endif  // CUDNN_MAJOR >= 8
+
+
+    // The FP16 algorithms are not searched for; they are fixed here regardless
+    // of cudnn_preference.
+    //if (data_type == CUDNN_DATA_HALF)
+    {
+        // HALF-16 if(data_type == CUDNN_DATA_HALF)
+        l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+        l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+        l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+
+        // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
+        //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
+        //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+        //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+    }
+}
+#endif
+#endif
+
+
+// Releases the batch-normalization buffers of a convolutional layer and
+// resets each released pointer to NULL, so calling it again is harmless.
+// Nothing is released when the layer has a share_layer set.
+// Host buffers go through free(); with GPU builds the *_gpu buffers go
+// through cuda_free().
+void free_convolutional_batchnorm(convolutional_layer *l)
+{
+// Release a buffer if it is set, then clear the pointer.
+#define RELEASE_HOST_BUF(ptr)   do { if (ptr) { free(ptr); (ptr) = NULL; } } while (0)
+#ifdef GPU
+#define RELEASE_GPU_BUF(ptr)    do { if (ptr) { cuda_free(ptr); (ptr) = NULL; } } while (0)
+#endif
+
+    if (l->share_layer) return;
+
+    RELEASE_HOST_BUF(l->scales);
+    RELEASE_HOST_BUF(l->scale_updates);
+    RELEASE_HOST_BUF(l->mean);
+    RELEASE_HOST_BUF(l->variance);
+    RELEASE_HOST_BUF(l->mean_delta);
+    RELEASE_HOST_BUF(l->variance_delta);
+    RELEASE_HOST_BUF(l->rolling_mean);
+    RELEASE_HOST_BUF(l->rolling_variance);
+    RELEASE_HOST_BUF(l->x);
+    RELEASE_HOST_BUF(l->x_norm);
+
+#ifdef GPU
+    RELEASE_GPU_BUF(l->scales_gpu);
+    RELEASE_GPU_BUF(l->scale_updates_gpu);
+    RELEASE_GPU_BUF(l->mean_gpu);
+    RELEASE_GPU_BUF(l->variance_gpu);
+    RELEASE_GPU_BUF(l->mean_delta_gpu);
+    RELEASE_GPU_BUF(l->variance_delta_gpu);
+    RELEASE_GPU_BUF(l->rolling_mean_gpu);
+    RELEASE_GPU_BUF(l->rolling_variance_gpu);
+    RELEASE_GPU_BUF(l->x_gpu);
+    RELEASE_GPU_BUF(l->x_norm_gpu);
+#undef RELEASE_GPU_BUF
+#endif
+
+#undef RELEASE_HOST_BUF
+}
+
+convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)
+{
+    int total_batch = batch*steps;
+    int i;
+    convolutional_layer l = { (LAYER_TYPE)0 };
+    l.type = CONVOLUTIONAL;
+    l.train = train;
+
+    if (xnor) groups = 1;   // disable groups for XNOR-net
+    if (groups < 1) groups = 1;
+
+    const int blur_stride_x = stride_x;
+    const int blur_stride_y = stride_y;
+    l.antialiasing = antialiasing;
+    if (antialiasing) {
+        stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer
+    }
+
+    l.wait_stream_id = -1;
+    l.deform = deform;
+    l.assisted_excitation = assisted_excitation;
+    l.share_layer = share_layer;
+    l.index = index;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.groups = groups;
+    l.n = n;
+    l.binary = binary;
+    l.xnor = xnor;
+    l.use_bin_output = use_bin_output;
+    l.batch = batch;
+    l.steps = steps;
+    l.stride = stride_x;
+    l.stride_x = stride_x;
+    l.stride_y = stride_y;
+    l.dilation = dilation;
+    l.size = size;
+    l.pad = padding;
+    l.batch_normalize = batch_normalize;
+    l.learning_rate_scale = 1;
+    l.nweights = (c / groups) * n * size * size;
+
+    if (l.share_layer) {
+        if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) {
+            error("Layer size, nweights, channels or filters don't match for the share_layer", DARKNET_LOC);
+        }
+
+        l.weights = l.share_layer->weights;
+        l.weight_updates = l.share_layer->weight_updates;
+
+        l.biases = l.share_layer->biases;
+        l.bias_updates = l.share_layer->bias_updates;
+    }
+    else {
+        l.weights = (float*)xcalloc(l.nweights, sizeof(float));
+        l.biases = (float*)xcalloc(n, sizeof(float));
+
+        if (train) {
+            l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));
+            l.bias_updates = (float*)xcalloc(n, sizeof(float));
+
+            l.weights_ema = (float*)xcalloc(l.nweights, sizeof(float));
+            l.biases_ema = (float*)xcalloc(n, sizeof(float));
+        }
+    }
+
+    // float scale = 1./sqrt(size*size*c);
+    float scale = sqrt(2./(size*size*c/groups));
+    if (l.activation == NORM_CHAN || l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) {
+        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1;   // rand_normal();
+    }
+    else {
+        for (i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1);   // rand_normal();
+    }
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    l.out_h = out_h;
+    l.out_w = out_w;
+    l.out_c = n;
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = l.w * l.h * l.c;
+    l.activation = activation;
+
+    l.output = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
+#ifndef GPU
+    if (train) l.delta = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
+#endif  // not GPU
+
+    l.forward = forward_convolutional_layer;
+    l.backward = backward_convolutional_layer;
+    l.update = update_convolutional_layer;
+    if(binary){
+        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
+        l.cweights = (char*)xcalloc(l.nweights, sizeof(char));
+        l.scales = (float*)xcalloc(n, sizeof(float));
+    }
+    if(xnor){
+        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
+        l.binary_input = (float*)xcalloc(l.inputs * l.batch, sizeof(float));
+
+        int align = 32;// 8;
+        int src_align = l.out_h*l.out_w;
+        l.bit_align = src_align + (align - src_align % align);
+
+        l.mean_arr = (float*)xcalloc(l.n, sizeof(float));
+
+        const size_t new_c = l.c / 32;
+        size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
+        l.bin_re_packed_input = (uint32_t*)xcalloc(in_re_packed_input_size, sizeof(uint32_t));
+
+        l.lda_align = 256;  // AVX2
+        int k = l.size*l.size*l.c;
+        size_t k_aligned = k + (l.lda_align - k%l.lda_align);
+        size_t t_bit_input_size = k_aligned * l.bit_align / 8;
+        l.t_bit_input = (char*)xcalloc(t_bit_input_size, sizeof(char));
+    }
+
+    if(batch_normalize){
+        if (l.share_layer) {
+            l.scales = l.share_layer->scales;
+            l.scale_updates = l.share_layer->scale_updates;
+            l.mean = l.share_layer->mean;
+            l.variance = l.share_layer->variance;
+            l.mean_delta = l.share_layer->mean_delta;
+            l.variance_delta = l.share_layer->variance_delta;
+            l.rolling_mean = l.share_layer->rolling_mean;
+            l.rolling_variance = l.share_layer->rolling_variance;
+        }
+        else {
+            l.scales = (float*)xcalloc(n, sizeof(float));
+            for (i = 0; i < n; ++i) {
+                l.scales[i] = 1;
+            }
+            if (train) {
+                l.scales_ema = (float*)xcalloc(n, sizeof(float));
+                l.scale_updates = (float*)xcalloc(n, sizeof(float));
+
+                l.mean = (float*)xcalloc(n, sizeof(float));
+                l.variance = (float*)xcalloc(n, sizeof(float));
+
+                l.mean_delta = (float*)xcalloc(n, sizeof(float));
+                l.variance_delta = (float*)xcalloc(n, sizeof(float));
+            }
+            l.rolling_mean = (float*)xcalloc(n, sizeof(float));
+            l.rolling_variance = (float*)xcalloc(n, sizeof(float));
+        }
+
+#ifndef GPU
+        if (train) {
+            l.x = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
+            l.x_norm = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
+        }
+#endif  // not GPU
+    }
+
+#ifndef GPU
+    if (l.activation == SWISH || l.activation == MISH || l.activation == HARD_MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float));
+#endif  // not GPU
+
+    if(adam){
+        l.adam = 1;
+        l.m = (float*)xcalloc(l.nweights, sizeof(float));
+        l.v = (float*)xcalloc(l.nweights, sizeof(float));
+        l.bias_m = (float*)xcalloc(n, sizeof(float));
+        l.scale_m = (float*)xcalloc(n, sizeof(float));
+        l.bias_v = (float*)xcalloc(n, sizeof(float));
+        l.scale_v = (float*)xcalloc(n, sizeof(float));
+    }
+
+#ifdef GPU
+
+
+    l.forward_gpu = forward_convolutional_layer_gpu;
+    l.backward_gpu = backward_convolutional_layer_gpu;
+    l.update_gpu = update_convolutional_layer_gpu;
+
+    if(gpu_index >= 0){
+
+        if (train && (l.activation == SWISH || l.activation == MISH || l.activation == HARD_MISH)) {
+            l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*l.outputs);
+        }
+
+        if (l.deform) l.weight_deform_gpu = cuda_make_array(NULL, l.nweights);
+
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, l.nweights);
+            l.v_gpu = cuda_make_array(l.v, l.nweights);
+            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
+            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
+            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
+            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
+        }
+        if (l.share_layer) {
+            l.weights_gpu = l.share_layer->weights_gpu;
+            l.weight_updates_gpu = l.share_layer->weight_updates_gpu;
+            l.weights_gpu16 = l.share_layer->weights_gpu16;
+            l.weight_updates_gpu16 = l.share_layer->weight_updates_gpu16;
+            l.biases_gpu = l.share_layer->biases_gpu;
+            l.bias_updates_gpu = l.share_layer->bias_updates_gpu;
+        }
+        else {
+            l.weights_gpu = cuda_make_array(l.weights, l.nweights);
+            if (train) l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
+#ifdef CUDNN_HALF
+            l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
+            if (train) l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
+#endif  // CUDNN_HALF
+            l.biases_gpu = cuda_make_array(l.biases, n);
+            if (train) l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+        }
+
+        l.output_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
+        if (train) l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n);
+
+        if(binary){
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
+        }
+        if(xnor){
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
+            l.mean_arr_gpu = cuda_make_array(0, l.n);
+            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
+        }
+
+        if(batch_normalize){
+            if (l.share_layer) {
+                l.scales_gpu = l.share_layer->scales_gpu;
+                l.scale_updates_gpu = l.share_layer->scale_updates_gpu;
+                l.mean_gpu = l.share_layer->mean_gpu;
+                l.variance_gpu = l.share_layer->variance_gpu;
+                l.rolling_mean_gpu = l.share_layer->rolling_mean_gpu;
+                l.rolling_variance_gpu = l.share_layer->rolling_variance_gpu;
+                l.mean_delta_gpu = l.share_layer->mean_delta_gpu;
+                l.variance_delta_gpu = l.share_layer->variance_delta_gpu;
+            }
+            else {
+                l.scales_gpu = cuda_make_array(l.scales, n);
+
+                if (train) {
+                    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
+
+                    l.mean_gpu = cuda_make_array(l.mean, n);
+                    l.variance_gpu = cuda_make_array(l.variance, n);
+                    l.m_cbn_avg_gpu = cuda_make_array(l.mean, n);
+                    l.v_cbn_avg_gpu = cuda_make_array(l.variance, n);
+#ifndef CUDNN
+                    l.mean_delta_gpu = cuda_make_array(l.mean, n);
+                    l.variance_delta_gpu = cuda_make_array(l.variance, n);
+#endif  // CUDNN
+                }
+
+                l.rolling_mean_gpu = cuda_make_array(l.mean, n);
+                l.rolling_variance_gpu = cuda_make_array(l.variance, n);
+            }
+
+            if (train) {
+                l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
+#ifndef CUDNN
+                l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
+#endif  // CUDNN
+            }
+        }
+
+        if (l.assisted_excitation)
+        {
+            const int size = l.out_w * l.out_h * l.batch;
+            l.gt_gpu = cuda_make_array(NULL, size);
+            l.a_avg_gpu = cuda_make_array(NULL, size);
+        }
+#ifdef CUDNN
+        create_convolutional_cudnn_tensors(&l);
+        cudnn_convolutional_setup(&l, cudnn_fastest, 0);
+#endif  // CUDNN
+    }
+#endif  // GPU
+    l.workspace_size = get_convolutional_workspace_size(l);
+
+    //fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.;
+    if (l.xnor) l.bflops = l.bflops / 32;
+    if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
+    else if (l.xnor) fprintf(stderr, "convX ");
+    else if (l.share_layer) fprintf(stderr, "convS ");
+    else if (l.assisted_excitation) fprintf(stderr, "convAE");
+    else fprintf(stderr, "conv  ");
+
+    if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups);
+    else           fprintf(stderr, "%5d      ", n);
+
+    if (stride_x != stride_y) fprintf(stderr, "%2dx%2d/%2dx%2d ", size, size, stride_x, stride_y);
+    else {
+        if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride_x, dilation);
+        else             fprintf(stderr, "%2d x%2d/%2d   ", size, size, stride_x);
+    }
+
+    fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
+    //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d  -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
+    if (l.antialiasing) {
+        printf("AA:  ");
+        l.input_layer = (layer*)calloc(1, sizeof(layer));
+        int blur_size = 3;
+        int blur_pad = blur_size / 2;
+        if (l.antialiasing == 2) {
+            blur_size = 2;
+            blur_pad = 0;
+        }
+        *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0, 0, train);
+        const int blur_nweights = n * blur_size * blur_size;  // (n / n) * n * blur_size * blur_size;
+        int i;
+        if (blur_size == 2) {
+            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
+                l.input_layer->weights[i + 0] = 1 / 4.f;
+                l.input_layer->weights[i + 1] = 1 / 4.f;
+                l.input_layer->weights[i + 2] = 1 / 4.f;
+                l.input_layer->weights[i + 3] = 1 / 4.f;
+            }
+        }
+        else {
+            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
+                l.input_layer->weights[i + 0] = 1 / 16.f;
+                l.input_layer->weights[i + 1] = 2 / 16.f;
+                l.input_layer->weights[i + 2] = 1 / 16.f;
+
+                l.input_layer->weights[i + 3] = 2 / 16.f;
+                l.input_layer->weights[i + 4] = 4 / 16.f;
+                l.input_layer->weights[i + 5] = 2 / 16.f;
+
+                l.input_layer->weights[i + 6] = 1 / 16.f;
+                l.input_layer->weights[i + 7] = 2 / 16.f;
+                l.input_layer->weights[i + 8] = 1 / 16.f;
+            }
+        }
+        for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0;
+#ifdef GPU
+        if (gpu_index >= 0) {
+            l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs);
+            push_convolutional_layer(*(l.input_layer));
+        }
+#endif  // GPU
+    }
+
+    return l;
+}
+
+/*
+ * Fold each filter's batch-normalization statistics into its weights and
+ * bias, then reset scales / rolling_mean / rolling_variance to 1 / 0 / 1.
+ *
+ * l - layer passed by value; the arrays it points to (weights, biases,
+ *     scales, rolling_mean, rolling_variance) are modified in place.
+ */
+void denormalize_convolutional_layer(convolutional_layer l)
+{
+    // Number of weights owned by ONE filter (same extent that
+    // get_convolutional_weight() uses). l.nweights is the total for the whole
+    // layer; using it as the per-filter stride scaled every weight by filter
+    // 0's factor and then indexed past the end of l.weights.
+    const int filter_size = l.size * l.size * (l.c / l.groups);
+    int i, j;
+    for (i = 0; i < l.n; ++i) {
+        float scale = l.scales[i] / sqrt(l.rolling_variance[i] + .00001);
+        float *filter = l.weights + i * filter_size;
+        for (j = 0; j < filter_size; ++j) {
+            filter[j] *= scale;
+        }
+        l.biases[i] -= l.rolling_mean[i] * scale;
+        l.scales[i] = 1;
+        l.rolling_mean[i] = 0;
+        l.rolling_variance[i] = 1;
+    }
+}
+
+/*
+ * Smoke test: build a convolutional layer for a 5x5 input with 3 channels
+ * (2 filters, kernel 5, stride 2, padding 1, batch-norm on) and run a single
+ * CPU forward pass over an input whose channel k is filled with k + 1.
+ */
+void test_convolutional_layer()
+{
+    convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0, 0, 0);
+    l.batch_normalize = 1;
+    float data[] = {1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3};
+    network_state state = {0};
+    state.input = data;
+
+    // The forward pass uses state.workspace as its im2col scratch buffer
+    // whenever the kernel is not 1x1; it was left NULL before. The buffer must
+    // hold (size*size*c/groups) x (out_h*out_w) floats, the B matrix handed to
+    // gemm(). Take the larger of that and the layer's own workspace_size.
+    size_t workspace_bytes = (size_t)l.size * l.size * (l.c / l.groups) * l.out_h * l.out_w * sizeof(float);
+    if (l.workspace_size > workspace_bytes) workspace_bytes = l.workspace_size;
+    state.workspace = (float*)xcalloc(1, workspace_bytes);
+
+    forward_convolutional_layer(l, state);
+
+    free(state.workspace);
+}
+
+/*
+ * Resize the layer for a new input resolution: recompute the output geometry,
+ * reallocate the host buffers and, in GPU builds, re-create the device
+ * buffers, refresh the cuDNN setup and recompute the workspace size.
+ *
+ * l    - layer to resize in place
+ * w, h - new input width and height
+ */
+void resize_convolutional_layer(convolutional_layer *l, int w, int h)
+{
+    int total_batch = l->batch*l->steps;
+    int old_w = l->w;
+    int old_h = l->h;
+    l->w = w;
+    l->h = h;
+    int out_w = convolutional_out_width(*l);
+    int out_h = convolutional_out_height(*l);
+
+    l->out_w = out_w;
+    l->out_h = out_h;
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->w * l->h * l->c;
+
+
+    l->output = (float*)xrealloc(l->output, total_batch * l->outputs * sizeof(float));
+    if (l->train) {
+        l->delta = (float*)xrealloc(l->delta, total_batch * l->outputs * sizeof(float));
+
+        if (l->batch_normalize) {
+            l->x = (float*)xrealloc(l->x, total_batch * l->outputs * sizeof(float));
+            l->x_norm = (float*)xrealloc(l->x_norm, total_batch * l->outputs * sizeof(float));
+        }
+    }
+
+    if (l->xnor) {
+        //l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
+    }
+
+    // Use the checked xrealloc like every other buffer here: a plain realloc
+    // failure would silently replace the pointer with NULL.
+    if (l->activation == SWISH || l->activation == MISH || l->activation == HARD_MISH) {
+        l->activation_input = (float*)xrealloc(l->activation_input, total_batch*l->outputs * sizeof(float));
+    }
+#ifdef GPU
+    // Device buffers are only re-created when the input grew in either
+    // dimension or when dynamic_minibatch is set.
+    if (old_w < w || old_h < h || l->dynamic_minibatch) {
+        if (l->train) {
+            cuda_free(l->delta_gpu);
+            l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs);
+        }
+
+        cuda_free(l->output_gpu);
+        l->output_gpu = cuda_make_array(l->output, total_batch*l->outputs);
+
+        if (l->batch_normalize) {
+            cuda_free(l->x_gpu);
+            l->x_gpu = cuda_make_array(l->output, total_batch*l->outputs);
+
+#ifndef CUDNN
+            cuda_free(l->x_norm_gpu);
+            l->x_norm_gpu = cuda_make_array(l->output, total_batch*l->outputs);
+#endif  // CUDNN
+        }
+
+        if (l->xnor) {
+            cuda_free(l->binary_input_gpu);
+            l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
+        }
+
+        if (l->activation == SWISH || l->activation == MISH || l->activation == HARD_MISH) {
+            cuda_free(l->activation_input_gpu);
+            l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs);
+        }
+
+        if (l->assisted_excitation)
+        {
+            cuda_free(l->gt_gpu);
+            cuda_free(l->a_avg_gpu);
+
+            const int size = l->out_w * l->out_h * l->batch;
+            l->gt_gpu = cuda_make_array(NULL, size);
+            l->a_avg_gpu = cuda_make_array(NULL, size);
+        }
+    }
+#ifdef CUDNN
+    cudnn_convolutional_setup(l, cudnn_fastest, 0);
+#endif
+#endif
+    l->workspace_size = get_convolutional_workspace_size(*l);
+
+#ifdef CUDNN
+    // check for excessive memory consumption: if the workspace does not fit
+    // in free device memory, or needs at least half of total device memory,
+    // redo the cuDNN setup with the cudnn_smallest preference.
+    size_t free_byte;
+    size_t total_byte;
+    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
+    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+        printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
+        cudnn_convolutional_setup(l, cudnn_smallest, 0);
+        l->workspace_size = get_convolutional_workspace_size(*l);
+    }
+#endif
+}
+
+/*
+ * Re-run the cuDNN setup of a layer with an explicit workspace size limit and
+ * store the resulting workspace size. Does nothing when built without CUDNN.
+ *
+ * l                    - layer to reconfigure
+ * workspace_size_limit - limit forwarded to cudnn_convolutional_setup()
+ */
+void set_specified_workspace_limit(convolutional_layer *l, size_t workspace_size_limit)
+{
+#ifdef CUDNN
+    // The queried values are not used further; the call is kept for its
+    // CHECK_CUDA error check.
+    size_t free_byte = 0, total_byte = 0;
+    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
+
+    cudnn_convolutional_setup(l, cudnn_specify, workspace_size_limit);
+    l->workspace_size = get_convolutional_workspace_size(*l);
+#endif  // CUDNN
+}
+
+/*
+ * Add a per-channel bias to a batch of feature maps, in place.
+ *
+ * output - batch * n * size floats, laid out as [batch][channel][element]
+ * biases - n values, one per channel
+ * batch  - number of images
+ * n      - number of channels
+ * size   - elements per channel plane
+ */
+void add_bias(float *output, float *biases, int batch, int n, int size)
+{
+    int img, ch, px;
+    for (img = 0; img < batch; ++img) {
+        for (ch = 0; ch < n; ++ch) {
+            float *plane = output + (img*n + ch)*size;
+            for (px = 0; px < size; ++px) {
+                plane[px] += biases[ch];
+            }
+        }
+    }
+}
+
+/*
+ * Multiply every channel plane of a batch of feature maps by that channel's
+ * scale factor, in place.
+ *
+ * output - batch * n * size floats, laid out as [batch][channel][element]
+ * scales - n values, one per channel
+ * batch  - number of images
+ * n      - number of channels
+ * size   - elements per channel plane
+ */
+void scale_bias(float *output, float *scales, int batch, int n, int size)
+{
+    int img, ch, px;
+    for (img = 0; img < batch; ++img) {
+        for (ch = 0; ch < n; ++ch) {
+            float *plane = output + (img*n + ch)*size;
+            for (px = 0; px < size; ++px) {
+                plane[px] *= scales[ch];
+            }
+        }
+    }
+}
+
+/*
+ * Accumulate bias gradients: for every image and channel, add the sum of that
+ * channel's delta plane to bias_updates[channel].
+ *
+ * bias_updates - n accumulators, updated in place
+ * delta        - batch * n * size floats, laid out as [batch][channel][element]
+ * batch, n     - number of images and channels
+ * size         - elements per channel plane
+ */
+void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
+{
+    int img, ch;
+    for (img = 0; img < batch; ++img) {
+        for (ch = 0; ch < n; ++ch) {
+            float *plane = delta + size*(ch + img*n);
+            bias_updates[ch] += sum_array(plane, size);
+        }
+    }
+}
+
+/*
+ * Plain (non-transposed) matrix multiply-accumulate: C += ALPHA * A * B.
+ *
+ * M, N, K - A is M x K, B is K x N, C is M x N
+ * ALPHA   - scalar applied to every element of A
+ * A, lda  - left matrix and its row stride
+ * B, ldb  - right matrix and its row stride
+ * C, ldc  - accumulator matrix and its row stride
+ */
+void gemm_nn_custom(int M, int N, int K, float ALPHA,
+    float *A, int lda,
+    float *B, int ldb,
+    float *C, int ldc)
+{
+    int row, inner, col;
+    for (row = 0; row < M; ++row) {
+        for (inner = 0; inner < K; ++inner) {
+            // Scaled A element is reused for the whole row of B.
+            PUT_IN_REGISTER float a_scaled = ALPHA * A[row * lda + inner];
+            for (col = 0; col < N; ++col) {
+                C[row*ldc + col] += a_scaled*B[inner*ldb + col];
+            }
+        }
+    }
+}
+
+
+/*
+ * Store, for each filter, the absolute value of the first element of that
+ * filter's slice of src (slices are size / filters elements long).
+ *
+ * src      - size floats
+ * size     - total number of elements in src
+ * filters  - number of slices / entries written to mean_arr
+ * mean_arr - output, at least `filters` floats
+ *
+ * Does nothing when filters is 0 or larger than size: the slice length would
+ * be undefined or 0, which used to divide by zero or loop forever. Exactly
+ * `filters` entries are written even when size is not a multiple of filters
+ * (the old stepping loop could write one entry too many).
+ */
+void get_mean_array(float *src, size_t size, size_t filters, float *mean_arr) {
+    size_t f, step;
+    if (filters == 0 || size < filters) return;
+    step = size / filters;
+    for (f = 0; f < filters; ++f) {
+        mean_arr[f] = fabs(src[f * step]);
+    }
+}
+
+/*
+void float_to_bit(float *src, unsigned char *dst, size_t size) {
+
+    size_t dst_size = size / 8 + 1;
+    memset(dst, 0, dst_size);
+    size_t i, dst_i, dst_shift;
+    for (i = 0; i < size; ++i) {
+        if (src[i] > 0) set_bit(dst, i);
+    }
+}
+*/
+
+/*
+ * Expand a bit array into floats: element i becomes +v when bit i of src is
+ * set and -v otherwise, where v is 1 if mean_arr is NULL and otherwise
+ * |mean_arr[i / (size / filters)]|.
+ *
+ * src      - packed bits, read through get_bit()
+ * dst      - output, size floats
+ * size     - number of elements to produce
+ * filters  - number of slices dst is divided into for the mean_arr lookup
+ * mean_arr - optional per-slice magnitudes (may be NULL)
+ */
+void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, float *mean_arr) {
+    size_t idx;
+
+    memset(dst, 0, size *sizeof(float));
+
+    for (idx = 0; idx < size; ++idx) {
+        const float magnitude = (mean_arr != NULL) ? fabs(mean_arr[idx / (size / filters)]) : 1;
+        dst[idx] = get_bit(src, idx) ? magnitude : -magnitude;
+    }
+}
+
+/*
+ * Prepare the bit-packed weights used by the XNOR inference path.
+ *
+ * Binarizes l->weights into l->binary_weights, copies them into a temporary
+ * float matrix whose rows are padded to new_lda elements, converts that matrix
+ * to bits (l->align_bit_weights) and fills l->mean_arr from the binarized
+ * weights. In GPU builds the device-side scratch buffers and bit weights are
+ * allocated and uploaded as well.
+ *
+ * l - layer updated in place (new_lda, align_bit_weights*, mean_arr, and in
+ *     GPU builds the *_gpu buffers).
+ */
+void binary_align_weights(convolutional_layer *l)
+{
+    int m = l->n;   // (l->n / l->groups)
+    int k = l->size*l->size*l->c;   // ->size*l->size*(l->c / l->groups)
+    // Row length after padding; note that a k which is already a multiple of
+    // lda_align still gets one extra lda_align added.
+    size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8;
+    l->new_lda = new_lda;
+
+    binarize_weights(l->weights, m, k, l->binary_weights);
+
+    size_t align_weights_size = new_lda * m;
+    l->align_bit_weights_size = align_weights_size / 8 + 1;
+    float* align_weights = (float*)xcalloc(align_weights_size, sizeof(float));
+    l->align_bit_weights = (char*)xcalloc(l->align_bit_weights_size, sizeof(char));
+
+    size_t i, j;
+    // align A without transpose
+    for (i = 0; i < m; ++i) {
+        for (j = 0; j < k; ++j) {
+            align_weights[i*new_lda + j] = l->binary_weights[i*k + j];
+        }
+    }
+
+
+    if (l->c % 32 == 0)
+    {
+        // Re-pack so that, within each block of 32 channels, the 32 channel
+        // values of one kernel position are adjacent.
+        int fil, chan;
+        const int items_per_filter = l->c * l->size * l->size;
+        for (fil = 0; fil < l->n; ++fil)
+        {
+            for (chan = 0; chan < l->c; chan += 32)
+            {
+                const int items_per_channel = l->size*l->size;
+                for (i = 0; i < items_per_channel; ++i)
+                {
+                    int c_pack;
+                    for (c_pack = 0; c_pack < 32; ++c_pack) {
+                        float src = l->binary_weights[fil*items_per_filter + (chan + c_pack)*items_per_channel + i];
+
+                        align_weights[fil*new_lda + chan*items_per_channel + i*32 + c_pack] = src;
+                    }
+
+                }
+            }
+        }
+
+        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);
+
+        // With a GPU selected the packed bits are stored inverted.
+        if(gpu_index >= 0)
+        {
+            for (i = 0; i < align_weights_size / 8; ++i) l->align_bit_weights[i] = ~(l->align_bit_weights[i]);
+        }
+
+
+
+        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
+    }
+    else {
+        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);
+
+        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
+    }
+
+#ifdef GPU
+    cudaError_t status;
+    l->align_workspace_size = l->bit_align * l->size * l->size * l->c;
+    status = cudaMalloc((void **)&l->align_workspace_gpu, l->align_workspace_size * sizeof(float));
+    // Check every allocation on its own: previously this status was
+    // overwritten by the next cudaMalloc before it was ever inspected.
+    CHECK_CUDA(status);
+    status = cudaMalloc((void **)&l->transposed_align_workspace_gpu, l->align_workspace_size * sizeof(float));
+    CHECK_CUDA(status);
+
+    //l->align_bit_weights_gpu = cuda_make_array(l->align_bit_weights, l->align_bit_weights_size * sizeof(char)/sizeof(float));
+    status = cudaMalloc((void **)&l->align_bit_weights_gpu, l->align_bit_weights_size);
+    CHECK_CUDA(status);
+    status = cudaMemcpy(l->align_bit_weights_gpu, l->align_bit_weights, l->align_bit_weights_size, cudaMemcpyHostToDevice);
+    CHECK_CUDA(status);
+    status = cudaMemcpy(l->binary_weights_gpu, l->binary_weights, m*k * sizeof(float), cudaMemcpyHostToDevice);
+    CHECK_CUDA(status);
+
+    //l->mean_arr_gpu = cuda_make_array(l->mean_arr, l->n);
+    cuda_push_array(l->mean_arr_gpu, l->mean_arr, l->n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+#endif // GPU
+
+    free(align_weights);
+}
+
+/*
+ * Binary transpose of an im2col buffer into a zeroed, row-padded bit matrix.
+ *
+ * k, n        - dimensions forwarded to transpose_bin()
+ * b           - source buffer, reinterpreted as uint32_t words
+ * t_bit_input - address of the destination bit buffer (cleared first)
+ * ldb_align   - row alignment; the padded row length is
+ *               k + (ldb_align - k % ldb_align)
+ * bit_align   - number of padded rows
+ *
+ * Returns the padded element count (padded row length * bit_align).
+ */
+size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input, size_t ldb_align, int bit_align)
+{
+    const size_t padded_ldb = k + (ldb_align - k%ldb_align);
+    const size_t padded_bits = padded_ldb * bit_align;
+    const size_t padded_bytes = padded_bits / 8;
+
+    // Clear the destination before the transpose writes into it.
+    memset(*t_bit_input, 0, padded_bytes * sizeof(char));
+
+    transpose_bin((uint32_t*)b, (uint32_t*)*t_bit_input, k, n, bit_align, padded_ldb, 8);
+
+    return padded_bits;
+}
+
+
+/*
+ * CPU forward pass of a convolutional layer.
+ *
+ * Clears l.output, then for every image and group either
+ *   - runs the bit-packed XNOR path (xnor layer with prepared
+ *     align_bit_weights, inference only, equal x/y stride), or
+ *   - runs im2col + gemm in float32,
+ * followed by batch-norm or bias, the activation, optional assisted
+ * excitation (training only) and the optional antialiasing blur layer.
+ *
+ * l     - layer, passed by value; the buffers it points to are written
+ * state - network state: input, workspace (scratch), train flag, net
+ *
+ * NOTE: the XNOR path returns from inside the batch/group loops, so it runs
+ * once per call and skips batch-norm, assisted excitation and antialiasing.
+ */
+void forward_convolutional_layer(convolutional_layer l, network_state state)
+{
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    int i, j;
+
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+
+    if (l.xnor && (!l.align_bit_weights || state.train)) {
+        // binarize_weights() takes (weights, filter count, weights per filter,
+        // output), as at its call in binary_align_weights(). Passing the total
+        // l.nweights as the per-filter size made it walk l.n times past the
+        // l.nweights elements that l.weights holds.
+        binarize_weights(l.weights, l.n, l.nweights / l.n, l.binary_weights);
+        swap_binary(&l);
+        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
+        state.input = l.binary_input;
+    }
+
+    int m = l.n / l.groups;                 // filters per group
+    int k = l.size*l.size*l.c / l.groups;   // weights per filter
+    int n = out_h*out_w;                    // output positions per channel
+
+    for(i = 0; i < l.batch; ++i)
+    {
+        for (j = 0; j < l.groups; ++j)
+        {
+            float *a = l.weights +j*l.nweights / l.groups;
+            float *b = state.workspace;
+            float *c = l.output +(i*l.groups + j)*n*m;
+
+            if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
+            {
+                memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));
+
+                if (l.c % 32 == 0)
+                {
+                    int ldb_align = l.lda_align;
+                    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
+
+                    int re_packed_input_size = l.c * l.w * l.h;
+                    memset(state.workspace, 0, re_packed_input_size * sizeof(float));
+
+                    const size_t new_c = l.c / 32;
+                    size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
+                    memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));
+
+                    // float32x4 by channel (as in cuDNN)
+                    repack_input(state.input, state.workspace, l.w, l.h, l.c);
+
+                    // 32 x floats -> 1 x uint32_t
+                    float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);
+
+                    // im2col over the packed words: each uint32_t is handled
+                    // as one "channel" element, hence new_c channels.
+                    im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+
+                    int new_k = l.size*l.size*l.c / 32;
+
+                    transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb);
+
+                    // the main GEMM function
+                    gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
+                }
+                else
+                { // else (l.c % 32 != 0)
+                    im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
+
+                    // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
+                    {
+                        int ldb_align = l.lda_align;
+                        size_t new_ldb = k + (ldb_align - k%ldb_align);
+                        // Called for its effect on l.t_bit_input; the returned
+                        // size is not needed here.
+                        binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);
+
+                        // 5x times faster than gemm()-float32
+                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
+                    }
+
+                }
+
+                add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+
+                if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+                else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+                else if (l.activation == HARD_MISH) activate_array_hard_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+                else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
+                else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
+                else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
+                else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
+                return;
+
+            }
+            else {
+                // FP32 path: im2col (skipped for 1x1, stride 1, dilation 1,
+                // where the input itself is already the B matrix) + gemm.
+                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
+                if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
+                    b = im;
+                }
+                else {
+                    im2col_cpu_ext(im,   // input
+                        l.c / l.groups,     // input channels
+                        l.h, l.w,           // input size (h, w)
+                        l.size, l.size,     // kernel size (h, w)
+                        l.pad * l.dilation, l.pad * l.dilation,       // padding (h, w)
+                        l.stride_y, l.stride_x, // stride (h, w)
+                        l.dilation, l.dilation, // dilation (h, w)
+                        b);                 // output
+
+                }
+
+                gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
+            }
+        }
+    }
+
+    if(l.batch_normalize){
+        forward_batchnorm_layer(l, state);
+    }
+    else {
+        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+    }
+
+    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+    else if (l.activation == HARD_MISH) activate_array_hard_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
+    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
+    else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
+    else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);
+
+    if(l.binary || l.xnor) swap_binary(&l);
+
+    //visualize_convolutional_layer(l, "conv_visual", NULL);
+    //wait_until_press_key_cv();
+
+    if(l.assisted_excitation && state.train) assisted_excitation_forward(l, state);
+
+    if (l.antialiasing) {
+        // Run the blur layer on this layer's output and copy its result back
+        // into l.output.
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        s.input = l.output;
+        forward_convolutional_layer(*(l.input_layer), s);
+        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
+    }
+}
+
+/*
+ * Assisted excitation (CPU): boost the activations that fall inside
+ * ground-truth boxes.
+ *
+ * Builds a per-image mask g that is 1 inside every truth box, computes the
+ * channel-average a_avg of l.output per spatial position, and adds
+ * alpha * g * a_avg to every channel of l.output. alpha follows
+ * 1 + cos(pi * iteration / horizon), where horizon is net.max_batches, or
+ * l.assisted_excitation when that value is > 1 (alpha is 0 past it).
+ *
+ * l     - layer, passed by value; l.output is modified in place
+ * state - network state; reads state.net and state.truth
+ */
+void assisted_excitation_forward(convolutional_layer l, network_state state)
+{
+    const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+
+    // calculate alpha
+    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));
+
+    if (l.assisted_excitation > 1) {
+        if (iteration_num > l.assisted_excitation) alpha = 0;
+        else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation));
+    }
+
+    float *a_avg = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));
+    float *g = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));
+
+    int b;
+    int w, h, c;
+
+    l.max_boxes = state.net.num_boxes;
+    l.truths = l.max_boxes*(4 + 1);
+
+    for (b = 0; b < l.batch; ++b)
+    {
+        // calculate G
+        int t;
+        for (t = 0; t < state.net.num_boxes; ++t) {
+            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
+            if (!truth.x) break;  // a zero x ends this image's box list
+
+            int left = floor((truth.x - truth.w / 2) * l.out_w);
+            int right = ceil((truth.x + truth.w / 2) * l.out_w);
+            int top = floor((truth.y - truth.h / 2) * l.out_h);
+            int bottom = ceil((truth.y + truth.h / 2) * l.out_h);
+
+            // Clamp the rectangle to the feature map. A box touching or
+            // crossing a border used to produce w == l.out_w (the column loop
+            // is inclusive) or negative indices and wrote outside g.
+            if (left < 0) left = 0;
+            if (right > l.out_w - 1) right = l.out_w - 1;
+            if (top < 0) top = 0;
+            if (bottom > l.out_h) bottom = l.out_h;
+
+            // NOTE(review): columns are inclusive (<= right) while rows are
+            // exclusive (< bottom); kept as-is for in-bounds boxes.
+            for (w = left; w <= right; w++) {
+                for (h = top; h < bottom; h++) {
+                    g[w + l.out_w * h + l.out_w*l.out_h*b] = 1;
+                }
+            }
+        }
+    }
+
+    for (b = 0; b < l.batch; ++b)
+    {
+        // calculate average A
+        for (w = 0; w < l.out_w; w++) {
+            for (h = 0; h < l.out_h; h++) {
+                for (c = 0; c < l.out_c; c++) {
+                    a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))];
+                }
+                a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c;  // a_avg / d
+            }
+        }
+    }
+
+    // change activation
+    for (b = 0; b < l.batch; ++b)
+    {
+        for (w = 0; w < l.out_w; w++) {
+            for (h = 0; h < l.out_h; h++) {
+                for (c = 0; c < l.out_c; c++)
+                {
+                    // a = a + alpha(t) * g(i,j) * avg_a(i,j)
+                    l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] +=
+                        alpha *
+                        g[w + l.out_w*(h + l.out_h*b)] *
+                        a_avg[w + l.out_w*(h + l.out_h*b)];
+                }
+            }
+        }
+    }
+
+    if(0)   // visualize ground truth
+    {
+#ifdef OPENCV
+        for (b = 0; b < l.batch; ++b)
+        {
+            image img = float_to_image(l.out_w, l.out_h, 1, &g[l.out_w*l.out_h*b]);
+            char buff[100];
+            sprintf(buff, "a_excitation_%d", b);
+            show_image_cv(img, buff);
+
+            image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            char buff2[100];
+            sprintf(buff2, "a_excitation_act_%d", b);
+            show_image_cv(img2, buff2);
+            wait_key_cv(5);
+        }
+        wait_until_press_key_cv();
+#endif // OPENCV
+    }
+
+    free(g);
+    free(a_avg);
+}
+
+
+/*
+ * CPU backward pass of a convolutional layer.
+ *
+ * Applies the activation gradient to l.delta, accumulates the batch-norm or
+ * bias gradients, then for every image and group accumulates the weight
+ * gradients into l.weight_updates and, when state.delta is set, propagates
+ * the gradient to the layer input through col2im.
+ *
+ * l     - layer, passed by value; the buffers it points to are written
+ * state - network state: input, delta (may be NULL), workspace (scratch)
+ */
+void backward_convolutional_layer(convolutional_layer l, network_state state)
+{
+    const int m = l.n / l.groups;                   // filters per group
+    const int n = l.size*l.size*l.c / l.groups;     // weights per filter
+    const int k = l.out_w*l.out_h;                  // output positions per channel
+    int img, grp;
+
+    switch (l.activation) {
+    case SWISH:
+        gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
+        break;
+    case MISH:
+        gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
+        break;
+    case HARD_MISH:
+        gradient_array_hard_mish(l.outputs*l.batch, l.activation_input, l.delta);
+        break;
+    case NORM_CHAN_SOFTMAX:
+    case NORM_CHAN_SOFTMAX_MAXVAL:
+        gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
+        break;
+    case NORM_CHAN:
+        gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
+        break;
+    default:
+        gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+        break;
+    }
+
+    if (l.batch_normalize) {
+        backward_batchnorm_layer(l, state);
+    }
+    else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
+    }
+
+    for (img = 0; img < l.batch; ++img) {
+        for (grp = 0; grp < l.groups; ++grp) {
+            float *group_delta = l.delta + (img*l.groups + grp)*m*k;
+            float *group_weight_updates = l.weight_updates + grp*l.nweights / l.groups;
+            float *group_input = state.input + (img*l.groups + grp)* (l.c / l.groups)*l.h*l.w;
+
+            // Unfold the input into the workspace, then accumulate
+            // weight_updates += delta * im2col(input)^T.
+            im2col_cpu_ext(
+                group_input,        // input
+                l.c / l.groups,     // input channels
+                l.h, l.w,           // input size (h, w)
+                l.size, l.size,     // kernel size (h, w)
+                l.pad * l.dilation, l.pad * l.dilation,       // padding (h, w)
+                l.stride_y, l.stride_x, // stride (h, w)
+                l.dilation, l.dilation, // dilation (h, w)
+                state.workspace);   // output
+
+            gemm(0, 1, m, n, k, 1, group_delta, k, state.workspace, k, 1, group_weight_updates, n);
+
+            if (!state.delta) continue;
+
+            // workspace = weights^T * delta (overwritten, beta = 0), then fold
+            // it back into the input gradient.
+            float *group_weights = l.weights + grp*l.nweights / l.groups;
+            gemm(1, 0, n, k, m, 1, group_weights, n, group_delta, k, 0, state.workspace, k);
+
+            col2im_cpu_ext(
+                state.workspace,        // input
+                l.c / l.groups,         // input channels
+                l.h, l.w,               // input size (h, w)
+                l.size, l.size,         // kernel size (h, w)
+                l.pad * l.dilation, l.pad * l.dilation,           // padding (h, w)
+                l.stride_y, l.stride_x,     // stride (h, w)
+                l.dilation, l.dilation, // dilation (h, w)
+                state.delta + (img*l.groups + grp)* (l.c / l.groups)*l.h*l.w); // output (delta)
+        }
+    }
+}
+
+/*
+ * Apply one CPU update step with weight decay and momentum to the weights,
+ * biases and (when present) batch-norm scales of the layer.
+ *
+ * l                  - layer whose parameters and *_updates are modified
+ * batch              - batch size used to normalise the step and scale decay
+ * learning_rate_init - base learning rate, multiplied by l.learning_rate_scale
+ * momentum           - factor left on the update buffers after the step
+ * decay              - weight decay coefficient (weights only)
+ */
+void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)
+{
+    const float learning_rate = learning_rate_init*l.learning_rate_scale;
+
+    // Weights: fold decay into the gradient, step, then keep momentum.
+    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
+
+    // Biases: no decay.
+    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(l.n, momentum, l.bias_updates, 1);
+
+    // Scales exist only when the layer has them allocated.
+    if (!l.scales) return;
+    axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
+    scal_cpu(l.n, momentum, l.scale_updates, 1);
+}
+
+
+
+/*
+ * Return filter i of the layer as an image (size x size, c / groups
+ * channels) built by float_to_image() directly on top of l.weights.
+ */
+image get_convolutional_weight(convolutional_layer l, int i)
+{
+    const int side = l.size;
+    const int channels = l.c / l.groups;
+    float *filter = l.weights + i*side*side*channels;
+    return float_to_image(side, side, channels, filter);
+}
+
+/*
+ * Apply rgbgr_image() to every filter that has exactly 3 channels; filters
+ * with any other channel count are left untouched.
+ */
+void rgbgr_weights(convolutional_layer l)
+{
+    int f;
+    for (f = 0; f < l.n; ++f) {
+        image filter = get_convolutional_weight(l, f);
+        if (filter.c != 3) continue;
+        rgbgr_image(filter);
+    }
+}
+
+/*
+ * For every 3-channel filter: scale its weights by `scale`, then add
+ * trans * (sum of the scaled weights) to the filter's bias. Filters with any
+ * other channel count are left untouched.
+ */
+void rescale_weights(convolutional_layer l, float scale, float trans)
+{
+    int f;
+    for (f = 0; f < l.n; ++f) {
+        image filter = get_convolutional_weight(l, f);
+        if (filter.c != 3) continue;
+
+        scale_image(filter, scale);
+        float total = sum_array(filter.data, filter.w*filter.h*filter.c);
+        l.biases[f] += total*trans;
+    }
+}
+
+/*
+ * Return a newly allocated array of l.n images, each a normalized copy of one
+ * filter. The array is allocated with xcalloc and handed to the caller.
+ */
+image *get_weights(convolutional_layer l)
+{
+    image *filters = (image *)xcalloc(l.n, sizeof(image));
+    int f;
+    for (f = 0; f < l.n; ++f) {
+        image copy = copy_image(get_convolutional_weight(l, f));
+        filters[f] = copy;
+        normalize_image(filters[f]);
+    }
+    return filters;
+}
+
+/*
+ * Show the layer's filters and its collapsed output image.
+ *
+ * l            - layer to visualize
+ * window       - window name for the filter view; the output view is titled
+ *                "<window>: Output"
+ * prev_weights - not used by this function
+ *
+ * Returns the filter images produced by get_weights().
+ */
+image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
+{
+    image *single_weights = get_weights(l);
+    show_images(single_weights, l.n, window);
+
+    image delta = get_convolutional_image(l);
+    image dc = collapse_image_layers(delta, 1);
+    char buff[256];
+    // Bounded formatting: a long window name can no longer overflow buff; the
+    // title is truncated instead.
+    snprintf(buff, sizeof(buff), "%s: Output", window);
+    show_image(dc, buff);
+    //save_image(dc, buff);
+    free_image(dc);
+    return single_weights;
+}
diff --git a/darknet-master/src/convolutional_layer.h b/darknet-master/src/convolutional_layer.h
new file mode 100644
index 0000000..e83ca87
--- /dev/null
+++ b/darknet-master/src/convolutional_layer.h
@@ -0,0 +1,68 @@
+#ifndef CONVOLUTIONAL_LAYER_H
+#define CONVOLUTIONAL_LAYER_H
+
+#include "dark_cuda.h"
+#include "image.h"
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+typedef layer convolutional_layer;   // alias: a convolutional layer is stored in the generic `layer` struct
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef GPU   // declarations below are only visible in GPU builds
+void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
+void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
+void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+
+void push_convolutional_layer(convolutional_layer layer);
+void pull_convolutional_layer(convolutional_layer layer);
+
+void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
+#ifdef CUDNN   // additionally requires the CUDNN define
+void cudnn_convolutional_setup(layer *l, int cudnn_preference, size_t workspace_size_specify);
+void create_convolutional_cudnn_tensors(layer *l);
+void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16);
+#endif   // CUDNN
+#endif   // GPU
+void free_convolutional_batchnorm(convolutional_layer *l);
+
+size_t get_convolutional_workspace_size(layer l);
+convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train);
+void denormalize_convolutional_layer(convolutional_layer l);
+void set_specified_workspace_limit(convolutional_layer *l, size_t workspace_size_limit);
+void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
+void forward_convolutional_layer(const convolutional_layer layer, network_state state);
+void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
+void binarize_weights(float *weights, int n, int size, float *binary);
+void swap_binary(convolutional_layer *l);
+void binarize_weights2(float *weights, int n, int size, char *binary, float *scales);
+
+void binary_align_weights(convolutional_layer *l);
+
+void backward_convolutional_layer(convolutional_layer layer, network_state state);
+
+void add_bias(float *output, float *biases, int batch, int n, int size);
+void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
+
+image get_convolutional_image(convolutional_layer layer);
+image get_convolutional_delta(convolutional_layer layer);
+image get_convolutional_weight(convolutional_layer layer, int i);   // image over filter i of layer.weights
+
+
+int convolutional_out_height(convolutional_layer layer);
+int convolutional_out_width(convolutional_layer layer);
+void rescale_weights(convolutional_layer l, float scale, float trans);
+void rgbgr_weights(convolutional_layer l);
+void assisted_excitation_forward(convolutional_layer l, network_state state);
+void assisted_excitation_forward_gpu(convolutional_layer l, network_state state);   // NOTE(review): declared outside the GPU guard - confirm this is intended
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif   // CONVOLUTIONAL_LAYER_H
diff --git a/darknet-master/src/cost_layer.c b/darknet-master/src/cost_layer.c
new file mode 100644
index 0000000..60080b1
--- /dev/null
+++ b/darknet-master/src/cost_layer.c
@@ -0,0 +1,148 @@
+#include "cost_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+COST_TYPE get_cost_type(char *s)
+{   // Map a cost name ("sse", "masked", "smooth") to its COST_TYPE; any other name warns on stderr and yields SSE.
+    COST_TYPE parsed = SSE;
+    if (!strcmp(s, "masked")) parsed = MASKED;
+    else if (!strcmp(s, "smooth")) parsed = SMOOTH;
+    else if (strcmp(s, "sse") != 0) fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
+    return parsed;
+}
+
+char *get_cost_string(COST_TYPE a)
+{
+    // Name for a COST_TYPE: "masked" for MASKED, "smooth" for SMOOTH.
+    // Every other value (SSE included) maps to "sse".
+    char *name = "sse";
+
+    if (a == MASKED) {
+        name = "masked";
+    } else if (a == SMOOTH) {
+        name = "smooth";
+    }
+    return name;
+}
+
+cost_layer make_cost_layer(int batch, int inputs, COST_TYPE cost_type, float scale)
+{   // Build a COST layer over `inputs` values per batch item; outputs == inputs. Logs one line to stderr.
+    fprintf(stderr, "cost                                           %4d\n",  inputs);
+    cost_layer l = { (LAYER_TYPE)0 };
+    l.type = COST;
+
+    l.scale = scale;   // multiplier applied to delta in the backward pass
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.cost_type = cost_type;
+    l.delta = (float*)xcalloc(inputs * batch, sizeof(float));    // per-element gradient buffer
+    l.output = (float*)xcalloc(inputs * batch, sizeof(float));   // per-element cost buffer
+    l.cost = (float*)xcalloc(1, sizeof(float));                  // single float: total cost, written by the forward pass
+
+    l.forward = forward_cost_layer;
+    l.backward = backward_cost_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_cost_layer_gpu;
+    l.backward_gpu = backward_cost_layer_gpu;
+
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);     // GPU counterparts of the two host buffers
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    #endif
+    return l;
+}
+
+void resize_cost_layer(cost_layer *l, int inputs)
+{   // Change the per-item input count and resize delta/output (host, and GPU when built with GPU) to inputs * l->batch floats.
+    l->inputs = inputs;
+    l->outputs = inputs;
+    l->delta = (float*)xrealloc(l->delta, inputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, inputs * l->batch * sizeof(float));
+#ifdef GPU
+    cuda_free(l->delta_gpu);   // GPU buffers are released and re-created rather than resized
+    cuda_free(l->output_gpu);
+    l->delta_gpu = cuda_make_array(l->delta, inputs*l->batch);
+    l->output_gpu = cuda_make_array(l->output, inputs*l->batch);
+#endif
+}
+
+void forward_cost_layer(cost_layer l, network_state state)
+{   // CPU forward pass: fills l.delta and l.output from input vs. truth, then stores the summed output in l.cost[0].
+    if (!state.truth) return;   // no ground truth: nothing is computed
+    if(l.cost_type == MASKED){
+        int i;
+        for(i = 0; i < l.batch*l.inputs; ++i){
+            if(state.truth[i] == SECRET_NUM) state.input[i] = SECRET_NUM;   // masked positions: input is overwritten to equal truth
+        }
+    }
+    if(l.cost_type == SMOOTH){
+        smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+    } else {   // SSE and MASKED both use l2_cpu
+        l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+    }
+    l.cost[0] = sum_array(l.output, l.batch*l.inputs);
+}
+
+void backward_cost_layer(const cost_layer l, network_state state)
+{   // CPU backward pass: axpy_cpu with factor l.scale from l.delta into state.delta over batch*inputs elements.
+    axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, state.delta, 1);
+}
+
+#ifdef GPU
+
+void pull_cost_layer(cost_layer l)
+{   // cuda_pull_array on the delta buffers (GPU -> host, going by the helper's name); batch*inputs floats.
+    cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
+}
+
+void push_cost_layer(cost_layer l)
+{   // cuda_push_array on the delta buffers (host -> GPU, going by the helper's name); batch*inputs floats.
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs);
+}
+
+int float_abs_compare (const void * a, const void * b)
+{   // qsort-style comparator over floats: orders by ascending magnitude, returning -1, 0 or 1.
+    const float x = *(const float*) a;
+    const float y = *(const float*) b;
+    const float mag_x = (x < 0) ? -x : x;
+    const float mag_y = (y < 0) ? -y : y;
+    return (mag_x > mag_y) - (mag_x < mag_y);
+}
+
+void forward_cost_layer_gpu(cost_layer l, network_state state)
+{   // GPU forward pass: fills l.delta_gpu / l.output_gpu, then pulls the output and stores its sum in l.cost[0].
+    if (!state.truth) return;   // no ground truth: nothing is computed
+    if (l.cost_type == MASKED) {
+        mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
+    }
+
+    if(l.cost_type == SMOOTH){
+        smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+    } else {   // SSE and MASKED both use l2_gpu
+        l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+    }
+
+    if(l.ratio){
+        cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
+        qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);   // sorts the host copy of delta by magnitude
+        int n = (1-l.ratio) * l.batch*l.inputs;
+        float thresh = l.delta[n];
+        thresh = 0;   // NOTE(review): overwrites the value just read, so supp_ongpu always gets 0 - confirm this is intended
+        printf("%f\n", thresh);   // NOTE(review): prints on every forward pass when l.ratio is set - looks like debug output
+        supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
+    }
+
+    cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
+    l.cost[0] = sum_array(l.output, l.batch*l.inputs);   // summed on the host
+}
+
+void backward_cost_layer_gpu(const cost_layer l, network_state state)
+{   // GPU backward pass: axpy_ongpu with factor l.scale from l.delta_gpu into state.delta over batch*inputs elements.
+    axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/darknet-master/src/cost_layer.h b/darknet-master/src/cost_layer.h
new file mode 100644
index 0000000..b350003
--- /dev/null
+++ b/darknet-master/src/cost_layer.h
@@ -0,0 +1,26 @@
+#ifndef COST_LAYER_H
+#define COST_LAYER_H
+#include "layer.h"
+#include "network.h"
+
+typedef layer cost_layer;   // alias: a cost layer is stored in the generic `layer` struct
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+COST_TYPE get_cost_type(char *s);       // cost name -> COST_TYPE
+char *get_cost_string(COST_TYPE a);     // COST_TYPE -> cost name
+cost_layer make_cost_layer(int batch, int inputs, COST_TYPE cost_type, float scale);
+void forward_cost_layer(const cost_layer l, network_state state);
+void backward_cost_layer(const cost_layer l, network_state state);
+void resize_cost_layer(cost_layer *l, int inputs);
+
+#ifdef GPU   // GPU-build-only entry points
+void forward_cost_layer_gpu(cost_layer l, network_state state);
+void backward_cost_layer_gpu(const cost_layer l, network_state state);
+#endif   // GPU
+
+#ifdef __cplusplus
+}
+#endif
+#endif   // COST_LAYER_H
diff --git a/darknet-master/src/cpu_gemm.c b/darknet-master/src/cpu_gemm.c
new file mode 100644
index 0000000..ca1a8e4
--- /dev/null
+++ b/darknet-master/src/cpu_gemm.c
@@ -0,0 +1,96 @@
+//#include "mini_blas.h"
+#ifdef __cplusplus
+#define PUT_IN_REGISTER
+#else
+#define PUT_IN_REGISTER register
+#endif
+
+void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{   // C[i][j] += ALPHA * A[i][k] * B[k][j], neither operand transposed; TA, TB and BETA are not read here.
+    int row, inner, col;
+    for(row = 0; row < M; ++row){
+        float *c_row = C + row*ldc;
+        for(inner = 0; inner < K; ++inner){
+            PUT_IN_REGISTER float scaled_a = ALPHA * A[row * lda + inner];
+            float *b_row = B + inner*ldb;
+            for(col = 0; col < N; ++col) c_row[col] += scaled_a*b_row[col];
+        }
+    }
+}
+
+void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{   // C[i][j] += sum over k of ALPHA * A[i][k] * B[j][k] (B read transposed); TA, TB and BETA are not read here.
+    int row, col, inner;
+    for(row = 0; row < M; ++row){
+        float *a_row = A + row*lda;
+        for(col = 0; col < N; ++col){
+            float *b_row = B + col*ldb;
+            PUT_IN_REGISTER float dot = 0;
+            for(inner = 0; inner < K; ++inner) dot += ALPHA*a_row[inner]*b_row[inner];
+            C[row*ldc+col] += dot;
+        }
+    }
+}
+
+void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{   // C[i][j] += ALPHA * A[k][i] * B[k][j] (A read transposed); TA, TB and BETA are not read here.
+    int row, inner, col;
+    for(row = 0; row < M; ++row){
+        float *c_row = C + row*ldc;
+        for(inner = 0; inner < K; ++inner){
+            PUT_IN_REGISTER float scaled_a = ALPHA * A[inner * lda + row];
+            float *b_row = B + inner*ldb;
+            for(col = 0; col < N; ++col) c_row[col] += scaled_a*b_row[col];
+        }
+    }
+}
+void cpu_gemm_tt(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{   // C[i][j] += ALPHA * A[k][i] * B[j][k] (both read transposed), added to C one product at a time; TA, TB, BETA unused.
+    int row, col, inner;
+    for(row = 0; row < M; ++row){
+        for(col = 0; col < N; ++col){
+            float *out = &C[row*ldc+col];
+            float *b_row = B + col*ldb;
+            for(inner = 0; inner < K; ++inner) *out += ALPHA*A[row+inner*lda]*b_row[inner];
+        }
+    }
+}
+
+
+void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{
+    // C = BETA*C + ALPHA*op(A)*op(B): every element of the M x N block of C is
+    // multiplied by BETA first, then the kernel matching the TA/TB flags adds the product.
+    int row, col;
+    for(row = 0; row < M; ++row){
+        float *c_row = C + row*ldc;
+        for(col = 0; col < N; ++col) c_row[col] *= BETA;
+    }
+    if(TA){
+        if(TB) cpu_gemm_tt( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
+        else   cpu_gemm_tn( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
+    }else{
+        if(TB) cpu_gemm_nt( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
+        else   cpu_gemm_nn( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
+    }
+}
diff --git a/darknet-master/src/crnn_layer.c b/darknet-master/src/crnn_layer.c
new file mode 100644
index 0000000..84646b4
--- /dev/null
+++ b/darknet-master/src/crnn_layer.c
@@ -0,0 +1,383 @@
+#include "crnn_layer.h"
+#include "convolutional_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void increment_layer(layer *l, int steps)
+{   // Move the layer's output/delta/x/x_norm pointers by `steps` time steps; a negative value moves them back.
+    int num = l->outputs*l->batch*steps;   // floats per time step (outputs * batch) times the step count
+    l->output += num;
+    l->delta += num;
+    l->x += num;
+    l->x_norm += num;
+
+#ifdef GPU
+    l->output_gpu += num;   // the GPU pointers are advanced by the same element count
+    l->delta_gpu += num;
+    l->x_gpu += num;
+    l->x_norm_gpu += num;
+#endif
+}
+
+layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train)
+{   // Build a CRNN layer from three convolutional sub-layers (input, self, output) plus a hidden-state buffer.
+    fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters);
+    batch = batch / steps;   // batch per time step (integer division of the incoming batch by steps)
+    layer l = { (LAYER_TYPE)0 };
+    l.train = train;
+    l.batch = batch;
+    l.type = CRNN;
+    l.steps = steps;
+    l.size = size;
+    l.stride = stride;
+    l.dilation = dilation;
+    l.pad = pad;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.groups = groups;
+    l.out_c = output_filters;
+    l.inputs = h * w * c;
+    l.hidden = h * w * hidden_filters;   // hidden-state size per batch item
+    l.xnor = xnor;
+
+    l.state = (float*)xcalloc(l.hidden * l.batch * (l.steps + 1), sizeof(float));   // steps + 1 state slots
+
+    l.input_layer = (layer*)xcalloc(1, sizeof(layer));   // input_layer: c -> hidden_filters
+    *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
+    l.input_layer->batch = batch;
+    if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size;   // keep the largest sub-layer workspace
+
+    l.self_layer = (layer*)xcalloc(1, sizeof(layer));   // self_layer: hidden_filters -> hidden_filters
+    *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
+    l.self_layer->batch = batch;
+    if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size;
+
+    l.output_layer = (layer*)xcalloc(1, sizeof(layer));   // output_layer: hidden_filters -> output_filters
+    *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
+    l.output_layer->batch = batch;
+    if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size;
+
+    l.out_h = l.output_layer->out_h;   // output geometry is taken from output_layer
+    l.out_w = l.output_layer->out_w;
+    l.outputs = l.output_layer->outputs;
+
+    assert(l.input_layer->outputs == l.self_layer->outputs);   // both are summed into the hidden state
+    assert(l.input_layer->outputs == l.output_layer->inputs);
+
+    l.output = l.output_layer->output;   // shares output_layer's buffers; nothing separate is allocated
+    l.delta = l.output_layer->delta;
+
+    l.forward = forward_crnn_layer;
+    l.backward = backward_crnn_layer;
+    l.update = update_crnn_layer;
+
+#ifdef GPU
+    l.forward_gpu = forward_crnn_layer_gpu;
+    l.backward_gpu = backward_crnn_layer_gpu;
+    l.update_gpu = update_crnn_layer_gpu;
+    l.state_gpu = cuda_make_array(l.state, l.batch*l.hidden*(l.steps + 1));
+    l.output_gpu = l.output_layer->output_gpu;
+    l.delta_gpu = l.output_layer->delta_gpu;
+#endif
+
+    l.bflops = l.input_layer->bflops + l.self_layer->bflops + l.output_layer->bflops;   // sum over the three sub-layers
+
+    return l;
+}
+
+void resize_crnn_layer(layer *l, int w, int h)
+{   // Resize the three sub-layers to w x h, then refresh the derived sizes, shared pointers and state buffers.
+    resize_convolutional_layer(l->input_layer, w, h);
+    if (l->workspace_size < l->input_layer->workspace_size) l->workspace_size = l->input_layer->workspace_size;   // workspace only ever grows
+
+    resize_convolutional_layer(l->self_layer, w, h);
+    if (l->workspace_size < l->self_layer->workspace_size) l->workspace_size = l->self_layer->workspace_size;
+
+    resize_convolutional_layer(l->output_layer, w, h);
+    if (l->workspace_size < l->output_layer->workspace_size) l->workspace_size = l->output_layer->workspace_size;
+
+    l->output = l->output_layer->output;   // re-read the shared buffers after the sub-layer resize
+    l->delta = l->output_layer->delta;
+
+    int hidden_filters = l->self_layer->c;   // self_layer's input channel count
+    l->w = w;
+    l->h = h;
+    l->inputs = h * w * l->c;
+    l->hidden = h * w * hidden_filters;
+
+    l->out_h = l->output_layer->out_h;
+    l->out_w = l->output_layer->out_w;
+    l->outputs = l->output_layer->outputs;
+
+    assert(l->input_layer->inputs == l->inputs);
+    assert(l->self_layer->inputs == l->hidden);
+    assert(l->input_layer->outputs == l->self_layer->outputs);
+    assert(l->input_layer->outputs == l->output_layer->inputs);
+
+    l->state = (float*)xrealloc(l->state, l->batch*l->hidden*(l->steps + 1)*sizeof(float));   // steps + 1 state slots
+
+#ifdef GPU
+    if (l->state_gpu) cuda_free(l->state_gpu);   // project wrapper, as used by the other resize_* functions, instead of raw cudaFree
+    l->state_gpu = cuda_make_array(l->state, l->batch*l->hidden*(l->steps + 1));
+
+    l->output_gpu = l->output_layer->output_gpu;
+    l->delta_gpu = l->output_layer->delta_gpu;
+#endif
+}
+
+void free_state_crnn(layer l)
+{   // Despite the name, nothing is freed: self_layer's output is overwritten with rand_uniform(-1, 1) values.
+    int i;
+    for (i = 0; i < l.outputs * l.batch; ++i) l.self_layer->output[i] = rand_uniform(-1, 1);   // NOTE(review): count uses l.outputs, not self_layer's own output size - confirm they match
+
+#ifdef GPU
+    cuda_push_array(l.self_layer->output_gpu, l.self_layer->output, l.outputs * l.batch);   // pushes the same element count to the GPU buffer
+#endif  // GPU
+}
+
+void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{   // CPU update: forwards the same arguments to update_convolutional_layer() for each of the three sub-layers.
+    update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+}
+
+void forward_crnn_layer(layer l, network_state state)
+{   // CPU forward pass over l.steps time steps: state = [old state if l.shortcut] + input_layer(x) + self_layer(state); then output_layer(state).
+    network_state s = {0};   // private state handed to the sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    //s.index = state.index;
+    int i;
+    layer input_layer = *(l.input_layer);   // by-value copies: increment_layer() below moves only these local pointers
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+    if (state.train) {   // training: zero the deltas of all steps and the first state slot
+        fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
+        fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
+        fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
+        fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+    }
+
+    for (i = 0; i < l.steps; ++i) {
+        s.input = state.input;   // this time step's input
+        forward_convolutional_layer(input_layer, s);
+
+        s.input = l.state;   // hidden state before this step
+        forward_convolutional_layer(self_layer, s);
+
+        float *old_state = l.state;
+        if(state.train) l.state += l.hidden*l.batch;   // training writes each step to the next slot; otherwise the same slot is reused
+        if(l.shortcut){
+            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);   // start from the previous state
+        }else{
+            fill_cpu(l.hidden * l.batch, 0, l.state, 1);   // start from zero
+        }
+        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
+        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+
+        s.input = l.state;   // new hidden state feeds the output sub-layer
+        forward_convolutional_layer(output_layer, s);
+
+        state.input += l.inputs*l.batch;   // advance to the next time step's input
+        increment_layer(&input_layer, 1);
+        increment_layer(&self_layer, 1);
+        increment_layer(&output_layer, 1);
+    }
+}
+
+void backward_crnn_layer(layer l, network_state state)
+{   // CPU backward pass: walks the time steps from last to first, back-propagating through output, self and input sub-layers.
+    network_state s = {0};   // private state handed to the sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    //s.index = state.index;
+    int i;
+    layer input_layer = *(l.input_layer);   // by-value copies: only these local pointers are moved
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+    increment_layer(&input_layer, l.steps-1);   // start at the last time step
+    increment_layer(&self_layer, l.steps-1);
+    increment_layer(&output_layer, l.steps-1);
+
+    l.state += l.hidden*l.batch*l.steps;   // last state slot (slot index l.steps)
+    for (i = l.steps-1; i >= 0; --i) {
+        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);   // rebuild this step's state as input output + self output
+        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);   // NOTE(review): the l.shortcut term added in the forward pass is not added here - confirm
+
+        s.input = l.state;
+        s.delta = self_layer.delta;   // output_layer's input gradient goes into self_layer.delta
+        backward_convolutional_layer(output_layer, s);
+
+        l.state -= l.hidden*l.batch;   // state slot of the previous step
+        /*
+           if(i > 0){
+           copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           }else{
+           fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+           }
+         */
+
+        s.input = l.state;
+        s.delta = self_layer.delta - l.hidden*l.batch;   // previous step's self_layer delta
+        if (i == 0) s.delta = 0;   // first step: there is no earlier step to receive a gradient
+        backward_convolutional_layer(self_layer, s);
+
+        copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);   // input_layer gets the same delta as self_layer
+        if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
+        s.input = state.input + i*l.inputs*l.batch;   // this step's slice of the layer input
+        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        else s.delta = 0;
+        backward_convolutional_layer(input_layer, s);
+
+        increment_layer(&input_layer, -1);   // step back one time step
+        increment_layer(&self_layer, -1);
+        increment_layer(&output_layer, -1);
+    }
+}
+
+#ifdef GPU
+
+void pull_crnn_layer(layer l)
+{   // Calls pull_convolutional_layer() on each of the three sub-layers.
+    pull_convolutional_layer(*(l.input_layer));
+    pull_convolutional_layer(*(l.self_layer));
+    pull_convolutional_layer(*(l.output_layer));
+}
+
+void push_crnn_layer(layer l)
+{   // Calls push_convolutional_layer() on each of the three sub-layers.
+    push_convolutional_layer(*(l.input_layer));
+    push_convolutional_layer(*(l.self_layer));
+    push_convolutional_layer(*(l.output_layer));
+}
+
+void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
+{   // GPU update: forwards the same arguments to update_convolutional_layer_gpu() for each of the three sub-layers.
+    update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay, loss_scale);
+}
+
+void forward_crnn_layer_gpu(layer l, network_state state)
+{   // GPU forward pass; same per-step sequence as forward_crnn_layer, using the *_gpu buffers and *_ongpu helpers.
+    network_state s = {0};   // private state handed to the sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    if(!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+    int i;
+    layer input_layer = *(l.input_layer);   // by-value copies: increment_layer() below moves only these local pointers
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+/*
+#ifdef CUDNN_HALF   // slow and bad for training
+    if (!state.train && state.net.cudnn_half) {
+        s.index = state.index;
+        cuda_convert_f32_to_f16(input_layer.weights_gpu, input_layer.c*input_layer.n*input_layer.size*input_layer.size, input_layer.weights_gpu16);
+        cuda_convert_f32_to_f16(self_layer.weights_gpu, self_layer.c*self_layer.n*self_layer.size*self_layer.size, self_layer.weights_gpu16);
+        cuda_convert_f32_to_f16(output_layer.weights_gpu, output_layer.c*output_layer.n*output_layer.size*output_layer.size, output_layer.weights_gpu16);
+    }
+#endif  //CUDNN_HALF
+*/
+
+    if (state.train) {   // training: zero the deltas of all steps and the first state slot
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
+        fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+        fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+        fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+    }
+
+    for (i = 0; i < l.steps; ++i) {
+        s.input = state.input;   // this time step's input
+        forward_convolutional_layer_gpu(input_layer, s);
+
+        s.input = l.state_gpu;   // hidden state before this step
+        forward_convolutional_layer_gpu(self_layer, s);
+
+        float *old_state = l.state_gpu;
+        if(state.train) l.state_gpu += l.hidden*l.batch;   // training writes each step to the next slot; otherwise the same slot is reused
+        if(l.shortcut){
+            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);   // start from the previous state
+        }else{
+            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);   // start from zero
+        }
+        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+
+        s.input = l.state_gpu;   // new hidden state feeds the output sub-layer
+        forward_convolutional_layer_gpu(output_layer, s);
+
+        state.input += l.inputs*l.batch;   // advance to the next time step's input
+        increment_layer(&input_layer, 1);
+        increment_layer(&self_layer, 1);
+        increment_layer(&output_layer, 1);
+    }
+}
+
+void backward_crnn_layer_gpu(layer l, network_state state)
+{   // GPU backward pass: walks the time steps from last to first, then zeroes the first state slot.
+    network_state s = {0};   // private state handed to the sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    //s.index = state.index;
+    int i;
+    layer input_layer = *(l.input_layer);   // by-value copies: only these local pointers are moved
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+    increment_layer(&input_layer,  l.steps - 1);   // start at the last time step
+    increment_layer(&self_layer,   l.steps - 1);
+    increment_layer(&output_layer, l.steps - 1);
+    float *init_state_gpu = l.state_gpu;   // remembered so the first slot can be cleared at the end
+    l.state_gpu += l.hidden*l.batch*l.steps;   // last state slot (slot index l.steps)
+    for (i = l.steps-1; i >= 0; --i) {
+        //copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);   // commented in RNN
+        //axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); // commented in RNN
+
+        s.input = l.state_gpu;   // state stored by the forward pass is used as-is (the rebuild above is disabled)
+        s.delta = self_layer.delta_gpu;   // output_layer's input gradient goes into self_layer.delta_gpu
+        backward_convolutional_layer_gpu(output_layer, s);
+
+        l.state_gpu -= l.hidden*l.batch;   // state slot of the previous step
+
+        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);   // here the copy happens before self_layer's backward call
+
+        s.input = l.state_gpu;
+        s.delta = self_layer.delta_gpu - l.hidden*l.batch;   // previous step's self_layer delta
+        if (i == 0) s.delta = 0;   // first step: there is no earlier step to receive a gradient
+        backward_convolutional_layer_gpu(self_layer, s);
+
+        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
+        s.input = state.input + i*l.inputs*l.batch;   // this step's slice of the layer input
+        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        else s.delta = 0;
+        backward_convolutional_layer_gpu(input_layer, s);
+
+        if (state.net.try_fix_nan) {   // NOTE(review): lengths use .inputs although these are delta buffers - confirm the sizes
+            fix_nan_and_inf(output_layer.delta_gpu, output_layer.inputs * output_layer.batch);
+            fix_nan_and_inf(self_layer.delta_gpu, self_layer.inputs * self_layer.batch);
+            fix_nan_and_inf(input_layer.delta_gpu, input_layer.inputs * input_layer.batch);
+        }
+
+        increment_layer(&input_layer,  -1);   // step back one time step
+        increment_layer(&self_layer,   -1);
+        increment_layer(&output_layer, -1);
+    }
+    fill_ongpu(l.hidden * l.batch, 0, init_state_gpu, 1); //clean l.state_gpu
+}
+#endif
diff --git a/darknet-master/src/crnn_layer.h b/darknet-master/src/crnn_layer.h
new file mode 100644
index 0000000..b85df6f
--- /dev/null
+++ b/darknet-master/src/crnn_layer.h
@@ -0,0 +1,32 @@
+
+#ifndef CRNN_LAYER_H
+#define CRNN_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train);
+void resize_crnn_layer(layer *l, int w, int h);
+void free_state_crnn(layer l);   // despite the name, the implementation overwrites self_layer's output with random values
+
+void forward_crnn_layer(layer l, network_state state);
+void backward_crnn_layer(layer l, network_state state);
+void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+
+#ifdef GPU   // GPU-build-only entry points
+void forward_crnn_layer_gpu(layer l, network_state state);
+void backward_crnn_layer_gpu(layer l, network_state state);
+void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+void push_crnn_layer(layer l);
+void pull_crnn_layer(layer l);
+#endif   // GPU
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif   // CRNN_LAYER_H
diff --git a/darknet-master/src/crop_layer.c b/darknet-master/src/crop_layer.c
new file mode 100644
index 0000000..2d1fafc
--- /dev/null
+++ b/darknet-master/src/crop_layer.c
@@ -0,0 +1,103 @@
+#include "utils.h"
+#include "crop_layer.h"
+#include "dark_cuda.h"
+#include <stdio.h>
+
+image get_crop_image(crop_layer l)
+{   // Hand l.output to float_to_image() together with the layer's output width, height and channel count.
+    const int width = l.out_w;
+    const int height = l.out_h;
+    const int channels = l.out_c;
+    return float_to_image(width, height, channels, l.output);
+}
+
+void backward_crop_layer(const crop_layer l, network_state state){}   // intentionally empty: the crop layer's CPU backward pass does nothing
+void backward_crop_layer_gpu(const crop_layer l, network_state state){}   // intentionally empty; note it is defined outside any GPU guard
+
+crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure)
+{   // Build a CROP layer that cuts a crop_width x crop_height window (all c channels) out of each w x h input.
+    fprintf(stderr, "Crop Layer: %d x %d -> %d x %d x %d image\n", h,w,crop_height,crop_width,c);
+    crop_layer l = { (LAYER_TYPE)0 };
+    l.type = CROP;
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.scale = (float)crop_height / h;   // crop-to-input height ratio; resize_crop_layer uses it for both dimensions
+    l.flip = flip;
+    l.angle = angle;
+    l.saturation = saturation;
+    l.exposure = exposure;
+    l.out_w = crop_width;
+    l.out_h = crop_height;
+    l.out_c = c;   // channel count is unchanged by the crop
+    l.inputs = l.w * l.h * l.c;
+    l.outputs = l.out_w * l.out_h * l.out_c;
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.forward = forward_crop_layer;
+    l.backward = backward_crop_layer;
+
+    #ifdef GPU
+    l.forward_gpu = forward_crop_layer_gpu;
+    l.backward_gpu = backward_crop_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+    l.rand_gpu   = cuda_make_array(0, l.batch*8);   // 8 floats per batch item, created without a host source buffer
+    #endif
+    return l;
+}
+
+void resize_crop_layer(layer *l, int w, int h)
+{   // Adopt a new input size and recompute the crop size, element counts and output buffers.
+    l->w = w;
+    l->h = h;
+
+    l->out_w =  l->scale*w;   // float product stored into the layer's out_w / out_h fields
+    l->out_h =  l->scale*h;
+
+    l->inputs = l->w * l->h * l->c;
+    l->outputs = l->out_h * l->out_w * l->out_c;
+
+    l->output = (float*)xrealloc(l->output, l->batch * l->outputs * sizeof(float));
+    #ifdef GPU
+    cuda_free(l->output_gpu);   // GPU output is released and re-created; rand_gpu is left untouched
+    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
+    #endif
+}
+
+
+void forward_crop_layer(const crop_layer l, network_state state)
+{   // CPU forward pass: copy an out_w x out_h window of every channel into l.output, optionally mirrored, as input*scale + trans.
+    int i,j,c,b,row,col;
+    int index;
+    int count = 0;   // running write position in l.output
+    int flip = (l.flip && rand()%2);   // one flip decision and one offset pair are drawn per call and shared by the whole batch
+    int dh = rand()%(l.h - l.out_h + 1);   // NOTE(review): assumes out_h <= h (and out_w <= w below); otherwise the modulus is <= 0 - confirm
+    int dw = rand()%(l.w - l.out_w + 1);
+    float scale = 2;   // default mapping: output = 2*input - 1
+    float trans = -1;
+    if(l.noadjust){   // noadjust: values are copied unchanged
+        scale = 1;
+        trans = 0;
+    }
+    if(!state.train){   // not training: no flip, centered window (the rand() calls above have already been made)
+        flip = 0;
+        dh = (l.h - l.out_h)/2;
+        dw = (l.w - l.out_w)/2;
+    }
+    for(b = 0; b < l.batch; ++b){
+        for(c = 0; c < l.c; ++c){
+            for(i = 0; i < l.out_h; ++i){
+                for(j = 0; j < l.out_w; ++j){
+                    if(flip){
+                        col = l.w - dw - j - 1;   // mirrored column
+                    }else{
+                        col = j + dw;
+                    }
+                    row = i + dh;
+                    index = col+l.w*(row+l.h*(c + l.c*b));   // input is indexed as [batch][channel][row][col]
+                    l.output[count++] = state.input[index]*scale + trans;
+                }
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/crop_layer.h b/darknet-master/src/crop_layer.h
new file mode 100644
index 0000000..3195824
--- /dev/null
+++ b/darknet-master/src/crop_layer.h
@@ -0,0 +1,26 @@
+#ifndef CROP_LAYER_H
+#define CROP_LAYER_H
+
+#include "image.h"
+#include "layer.h"
+#include "network.h"
+
+typedef layer crop_layer;   // alias: a crop layer is stored in the generic `layer` struct
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+image get_crop_image(crop_layer l);
+crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
+void forward_crop_layer(const crop_layer l, network_state state);
+void resize_crop_layer(layer *l, int w, int h);
+
+#ifdef GPU   // GPU-build-only entry point
+void forward_crop_layer_gpu(crop_layer l, network_state state);
+#endif   // GPU
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif   // CROP_LAYER_H
diff --git a/darknet-master/src/crop_layer_kernels.cu b/darknet-master/src/crop_layer_kernels.cu
new file mode 100644
index 0000000..85783bc
--- /dev/null
+++ b/darknet-master/src/crop_layer_kernels.cu
@@ -0,0 +1,222 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "crop_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "image.h"
+
+// Reads pixel (x, y) of channel c from a planar w x h image.
+// Coordinates outside the image read as 0.
+__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)
+{
+    const bool inside = (x >= 0 && x < w && y >= 0 && y < h);
+    if(!inside) return 0;
+    return image[x + w*(y + c*h)];
+}
+
+// Converts an RGB triple to HSV.
+//
+// Returns (h, s, v) where v is the largest channel, s = (max - min) / max and
+// h is expressed in sextant units in [0, 6). When the hue is undefined
+// (black, or any grey pixel whose three channels are equal) h is -1 and s is 0.
+__device__ float3 rgb_to_hsv_kernel(float3 rgb)
+{
+    float r = rgb.x;
+    float g = rgb.y;
+    float b = rgb.z;
+
+    float h, s, v;
+    float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b);
+    float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b);
+    float delta = max - min;
+    v = max;
+    if(max == 0 || delta == 0){
+        // delta == 0 used to fall through to (g - b) / delta, i.e. 0/0 = NaN.
+        // The saturation is 0 either way, so only the hue value changes.
+        s = 0;
+        h = -1;
+    }else{
+        s = delta/max;
+        if(r == max){
+            h = (g - b) / delta;
+        } else if (g == max) {
+            h = 2 + (b - r) / delta;
+        } else {
+            h = 4 + (r - g) / delta;
+        }
+        // Wrap negative hues (red sextant, g < b) into [0, 6).
+        if (h < 0) h += 6;
+    }
+    return make_float3(h, s, v);
+}
+
+// Converts an HSV triple (hue in sextant units) back to RGB.
+//
+// A saturation of exactly 0 yields the grey (v, v, v) and ignores the hue.
+// Each resulting channel is clamped to [0, 1].
+__device__ float3 hsv_to_rgb_kernel(float3 hsv)
+{
+    const float h = hsv.x;
+    const float s = hsv.y;
+    const float v = hsv.z;
+
+    float r, g, b;
+
+    if (s == 0) {
+        r = g = b = v;
+    } else {
+        const int sector = (int) floorf(h);
+        const float f = h - sector;
+        const float p = v*(1-s);
+        const float q = v*(1-s*f);
+        const float t = v*(1-s*(1-f));
+        switch(sector){
+            case 0:  r = v; g = t; b = p; break;
+            case 1:  r = q; g = v; b = p; break;
+            case 2:  r = p; g = v; b = t; break;
+            case 3:  r = p; g = q; b = v; break;
+            case 4:  r = t; g = p; b = v; break;
+            default: r = v; g = p; b = q; break;   // sector 5 and anything out of range
+        }
+    }
+
+    if(r < 0) r = 0; else if(r > 1) r = 1;
+    if(g < 0) g = 0; else if(g > 1) g = 1;
+    if(b < 0) b = 0; else if(b > 1) b = 1;
+    return make_float3(r, g, b);
+}
+
+// Samples channel c of a planar w x h image at the fractional position (x, y)
+// by blending the four surrounding pixels. Neighbours outside the image
+// contribute 0 (see get_pixel_kernel).
+__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)
+{
+    const int x0 = (int) floorf(x);
+    const int y0 = (int) floorf(y);
+
+    const float fx = x - x0;
+    const float fy = y - y0;
+
+    const float top_left     = get_pixel_kernel(image, w, h, x0,   y0,   c);
+    const float bottom_left  = get_pixel_kernel(image, w, h, x0,   y0+1, c);
+    const float top_right    = get_pixel_kernel(image, w, h, x0+1, y0,   c);
+    const float bottom_right = get_pixel_kernel(image, w, h, x0+1, y0+1, c);
+
+    return (1-fy) * (1-fx) * top_left +
+        fy     * (1-fx) * bottom_left +
+        (1-fy) *   fx   * top_right +
+        fy     *   fx   * bottom_right;
+}
+
+// Per-pixel colour augmentation, applied in place: one thread per (image, y, x).
+//
+// image      planar data, indexed here as 3 channels of h x w per batch item
+// rand       uniform random values, 8 per batch item; slots 0..3 of each item
+//            drive saturation/exposure here, and rand[0..2] also serve as the
+//            per-channel shift shared by the whole batch
+// train      when 0, saturation/exposure are not applied and shift is forced to 0
+// translate, scale, shift
+//            each output channel is value*scale + translate + (rnd - .5)*shift
+__global__ void levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)
+{
+    int size = batch * w * h;
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= size) return;
+    int x = id % w;
+    id /= w;
+    int y = id % h;
+    id /= h;    // id is now the batch index
+    float rshift = rand[0];
+    float gshift = rand[1];
+    float bshift = rand[2];
+    float r0 = rand[8*id + 0];
+    float r1 = rand[8*id + 1];
+    float r2 = rand[8*id + 2];
+    float r3 = rand[8*id + 3];
+
+    // Pick a factor between 1 and the configured maximum, then invert it half of the time.
+    saturation = r0*(saturation - 1) + 1;
+    saturation = (r1 > .5) ? 1./saturation : saturation;
+    exposure = r2*(exposure - 1) + 1;
+    exposure = (r3 > .5) ? 1./exposure : exposure;
+
+    // Widen before multiplying: the product was previously evaluated in int
+    // and only then converted to size_t.
+    size_t offset = (size_t)id * h * w * 3;
+    image += offset;
+    float r = image[x + w*(y + h*0)];
+    float g = image[x + w*(y + h*1)];
+    float b = image[x + w*(y + h*2)];
+    float3 rgb = make_float3(r,g,b);
+    if(train){
+        float3 hsv = rgb_to_hsv_kernel(rgb);
+        hsv.y *= saturation;
+        hsv.z *= exposure;
+        rgb = hsv_to_rgb_kernel(hsv);
+    } else {
+        shift = 0;
+    }
+    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5)*shift;
+    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5)*shift;
+    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5)*shift;
+}
+
+// Crop / flip / rotate kernel: one thread per output element.
+//
+// The flat thread id is decoded as (batch, channel, row, column) of the
+// crop_height x crop_width output. Slots 4..7 of the batch item's 8 random
+// values choose the crop offset, the mirror decision and the rotation angle.
+// With train == 0 the crop is centred and neither flipped nor rotated.
+// The source position is rotated about the input centre and sampled bilinearly.
+__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= size) return;
+
+    const int out_index = id;
+
+    // Peel the coordinates off the flat index, fastest-varying first.
+    const int j = id % crop_width;
+    id /= crop_width;
+    const int i = id % crop_height;
+    id /= crop_height;
+    const int k = id % c;
+    id /= c;
+    const int b = id;
+
+    const float r4 = rand[8*b + 4];
+    const float r5 = rand[8*b + 5];
+    const float r6 = rand[8*b + 6];
+    const float r7 = rand[8*b + 7];
+
+    float dw;
+    float dh;
+    if(train){
+        dw = (w - crop_width)*r4;
+        dh = (h - crop_height)*r5;
+        flip = (flip && (r6 > .5));
+        angle = 2*angle*r7 - angle;     // uniform in [-angle, angle]
+    }else{
+        dw = (w - crop_width)/2.;
+        dh = (h - crop_height)/2.;
+        flip = 0;
+        angle = 0;
+    }
+
+    input += w*h*c*b;
+
+    float x = (flip) ? w - dw - j - 1 : j + dw;
+    float y = i + dh;
+
+    // Rotate (x, y) about the centre of the input image.
+    float cx = w/2.;
+    float cy = h/2.;
+    float rx = cos(angle)*(x-cx) - sin(angle)*(y-cy) + cx;
+    float ry = sin(angle)*(x-cx) + cos(angle)*(y-cy) + cy;
+
+    output[out_index] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
+}
+
+// GPU forward pass of the crop layer.
+//
+// Refreshes the layer's 8-per-image random buffer, runs the colour kernel in
+// place on state.input, then crops/flips/rotates into layer.output_gpu.
+// Both launches use the stream returned by get_cuda_stream().
+extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
+{
+    cuda_random(layer.rand_gpu, layer.batch*8);
+
+    const float radians = layer.angle*3.14159265/180.;
+
+    // Default maps values through v*2 - 1; noadjust leaves them untouched.
+    const float scale = layer.noadjust ? 1 : 2;
+    const float translate = layer.noadjust ? 0 : -1;
+
+    // One thread per input pixel position (the kernel handles the 3 channels itself).
+    const int pixel_threads = layer.batch * layer.w * layer.h;
+    levels_image_kernel<<<cuda_gridsize(pixel_threads), BLOCK, 0, get_cuda_stream() >>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
+    CHECK_CUDA(cudaPeekAtLastError());
+
+    // One thread per output element.
+    const int size = layer.batch*layer.c*layer.out_w*layer.out_h;
+    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+
+/*
+       Debug visualisation of the first three crops, kept for reference:
+
+       cuda_pull_array(layer.output_gpu, layer.output, size);
+       image im = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 0*(size/layer.batch));
+       image im2 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 1*(size/layer.batch));
+       image im3 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 2*(size/layer.batch));
+
+       translate_image(im, -translate);
+       scale_image(im, 1/scale);
+       translate_image(im2, -translate);
+       scale_image(im2, 1/scale);
+       translate_image(im3, -translate);
+       scale_image(im3, 1/scale);
+
+       show_image(im, "cropped");
+       show_image(im2, "cropped2");
+       show_image(im3, "cropped3");
+       cvWaitKey(0);
+       */
+}
diff --git a/darknet-master/src/csharp/CMakeLists.txt b/darknet-master/src/csharp/CMakeLists.txt
new file mode 100644
index 0000000..1b591a1
--- /dev/null
+++ b/darknet-master/src/csharp/CMakeLists.txt
@@ -0,0 +1,19 @@
+
+# C# wrapper around the darknet library, built as its own CSharp-language project.
+project(YoloCSharpWrapper LANGUAGES CSharp)
+include(CSharpUtilities)
+
+# The library consists of a single source file named after the project.
+add_library(${PROJECT_NAME}
+  ${PROJECT_NAME}.cs
+)
+
+# Link against the `dark` target defined elsewhere in the build.
+target_link_libraries(${PROJECT_NAME} PRIVATE dark)
+
+# .NET assemblies referenced by the wrapper.
+set_property(TARGET ${PROJECT_NAME} PROPERTY VS_DOTNET_REFERENCES
+  "System"
+  "System.Runtime.InteropServices"
+)
+
+# Install the runtime artefact into INSTALL_BIN_DIR as part of the `dev` component.
+install(TARGETS ${PROJECT_NAME}
+  RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
+  COMPONENT dev
+)
diff --git a/darknet-master/src/csharp/YoloCSharpWrapper.cs b/darknet-master/src/csharp/YoloCSharpWrapper.cs
new file mode 100644
index 0000000..8fc8567
--- /dev/null
+++ b/darknet-master/src/csharp/YoloCSharpWrapper.cs
@@ -0,0 +1,89 @@
+﻿using System;
+using System.Runtime.InteropServices;
+
+namespace Darknet
+{
+    /// <summary>
+    /// Managed wrapper around the native darknet library (P/Invoke).
+    /// The constructor initialises the native detector; <see cref="Dispose"/> releases it.
+    /// </summary>
+    public class YoloWrapper : IDisposable
+    {
+        private const string YoloLibraryName = "darknet.dll";
+        private const int MaxObjects = 1000;
+
+        // Guards against releasing the native detector more than once.
+        private bool _disposed;
+
+        [DllImport(YoloLibraryName, EntryPoint = "init")]
+        private static extern int InitializeYolo(string configurationFilename, string weightsFilename, int gpu, int batch_size);
+
+        [DllImport(YoloLibraryName, EntryPoint = "detect_image")]
+        private static extern int DetectImage(string filename, ref BboxContainer container);
+
+        [DllImport(YoloLibraryName, EntryPoint = "detect_mat")]
+        private static extern int DetectImage(IntPtr pArray, int nSize, ref BboxContainer container);
+
+        [DllImport(YoloLibraryName, EntryPoint = "dispose")]
+        private static extern int DisposeYolo();
+
+        /// <summary>One detection, laid out to match the native struct field for field.</summary>
+        [StructLayout(LayoutKind.Sequential)]
+        public struct bbox_t
+        {
+            public UInt32 x, y, w, h;    // (x,y) - top-left corner, (w, h) - width & height of bounded box
+            public float prob;           // confidence - probability that the object was found correctly
+            public UInt32 obj_id;        // class of object - from range [0, classes-1]
+            public UInt32 track_id;      // tracking id for video (0 - untracked, 1 - inf - tracked object)
+            public UInt32 frames_counter;
+            public float x_3d, y_3d, z_3d;  // 3-D coordinates, if there is used 3D-stereo camera
+        };
+
+        /// <summary>Fixed-size buffer of <c>MaxObjects</c> detections filled by the native side.</summary>
+        [StructLayout(LayoutKind.Sequential)]
+        public struct BboxContainer
+        {
+            [MarshalAs(UnmanagedType.ByValArray, SizeConst = MaxObjects)]
+            public bbox_t[] candidates;
+        }
+
+        /// <summary>Initialises the native detector from a configuration file and a weights file.</summary>
+        public YoloWrapper(string configurationFilename, string weightsFilename, int gpu, int batch_size = 1)
+        {
+            InitializeYolo(configurationFilename, weightsFilename, gpu, batch_size);
+        }
+
+        /// <summary>Releases the native detector. Safe to call more than once.</summary>
+        public void Dispose()
+        {
+            if (_disposed)
+            {
+                return;
+            }
+
+            _disposed = true;
+            DisposeYolo();
+        }
+
+        /// <summary>Runs detection on an image file and returns the candidate buffer.</summary>
+        public bbox_t[] Detect(string filename)
+        {
+            var container = new BboxContainer();
+            DetectImage(filename, ref container);
+
+            return container.candidates;
+        }
+
+        /// <summary>
+        /// Runs detection on an in-memory encoded image.
+        /// </summary>
+        /// <returns>
+        /// The candidate buffer, or <c>null</c> when detection fails; this includes the
+        /// native library returning -1, which the original code reported as
+        /// "has no OpenCV support".
+        /// </returns>
+        /// <exception cref="ArgumentNullException"><paramref name="imageData"/> is null.</exception>
+        /// <exception cref="ArgumentException"><paramref name="imageData"/> is empty.</exception>
+        public bbox_t[] Detect(byte[] imageData)
+        {
+            if (imageData == null)
+            {
+                throw new ArgumentNullException(nameof(imageData));
+            }
+            if (imageData.Length == 0)
+            {
+                throw new ArgumentException("Image data must not be empty.", nameof(imageData));
+            }
+
+            var container = new BboxContainer();
+
+            // A byte is one unit of unmanaged memory, so the length is the size.
+            var pnt = Marshal.AllocHGlobal(imageData.Length);
+
+            try
+            {
+                // Copy the array to unmanaged memory.
+                Marshal.Copy(imageData, 0, pnt, imageData.Length);
+                var count = DetectImage(pnt, imageData.Length, ref container);
+                if (count == -1)
+                {
+                    // Previously thrown as NotSupportedException and immediately
+                    // swallowed by the catch below; the caller always saw null.
+                    return null;
+                }
+            }
+            catch (Exception)
+            {
+                // Existing contract: failures are reported as null, not as exceptions.
+                return null;
+            }
+            finally
+            {
+                // Free the unmanaged memory.
+                Marshal.FreeHGlobal(pnt);
+            }
+
+            return container.candidates;
+        }
+    }
+}
diff --git a/darknet-master/src/dark_cuda.c b/darknet-master/src/dark_cuda.c
new file mode 100644
index 0000000..74f0677
--- /dev/null
+++ b/darknet-master/src/dark_cuda.c
@@ -0,0 +1,634 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+int cuda_debug_sync = 0;
+int gpu_index = 0;
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#ifdef GPU
+
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+#include "assert.h"
+#include <stdlib.h>
+#include <time.h>
+#include <cuda.h>
+#include <stdio.h>
+
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "cuda.lib")
+
+#ifdef CUDNN
+#pragma comment(lib, "cudnn.lib")
+#endif  // CUDNN
+#endif  // USE_CMAKE_LIBS
+
+#if defined(CUDNN_HALF) && !defined(CUDNN)
+#error "If you set CUDNN_HALF=1 then you must set CUDNN=1"
+#endif
+
+
+// Records n as the global gpu_index and makes it the current CUDA device.
+// A failure from cudaSetDevice is routed through CHECK_CUDA.
+void cuda_set_device(int n)
+{
+    gpu_index = n;
+    const cudaError_t rc = cudaSetDevice(n);
+    if(rc == cudaSuccess) return;
+    CHECK_CUDA(rc);
+}
+
+// Returns the index of the current CUDA device (errors go through CHECK_CUDA).
+int cuda_get_device()
+{
+    int device = 0;
+    CHECK_CUDA(cudaGetDevice(&device));
+    return device;
+}
+
+// Returns the current driver-API context as an opaque pointer.
+// On failure a message is written to stderr and NULL is returned
+// (previously the uninitialised local was returned).
+void *cuda_get_context()
+{
+    CUcontext pctx = NULL;
+    CUresult status = cuCtxGetCurrent(&pctx);
+    if(status != CUDA_SUCCESS) {
+        fprintf(stderr, " Error: cuCtxGetCurrent() is failed \n");
+        return NULL;
+    }
+    return (void *)pctx;
+}
+
+void check_error(cudaError_t status, const char * const filename, const char * const funcname, const int line)
+{
+    cudaError_t status2 = cudaGetLastError();
+    if (status != cudaSuccess)
+    {
+        const char *s = cudaGetErrorString(status);
+        char buffer[256];
+        printf("\n CUDA Error: %s\n", s);
+        snprintf(buffer, 256, "CUDA Error: %s", s);
+        error(buffer, filename, funcname, line);
+    }
+    if (status2 != cudaSuccess)
+    {
+        const char *s = cudaGetErrorString(status2);
+        char buffer[256];
+        printf("\n CUDA Error Prev: %s\n", s);
+        snprintf(buffer, 256, "CUDA Error Prev: %s", s);
+        error(buffer, filename, funcname, line);
+    }
+}
+
+// Target of the CHECK_CUDA macro.
+//
+// Prints the call site and reports when status is a failure. When
+// cuda_debug_sync is set (forced on in DEBUG / CUDA_DEBUG builds) it also
+// synchronises the device and checks the result of that synchronisation.
+void check_error_extended(cudaError_t status, const char * const filename, const char * const funcname, const int line)
+{
+    if (status != cudaSuccess) {
+        printf("CUDA status Error: file: %s: func: %s() line: %d\n", filename, funcname, line);
+        check_error(status, filename, funcname, line);
+    }
+#if defined(DEBUG) || defined(CUDA_DEBUG)
+    cuda_debug_sync = 1;
+#endif
+    if (!cuda_debug_sync) {
+        check_error(status, filename, funcname, line);
+        return;
+    }
+
+    // From here on `status` is the result of the synchronisation.
+    status = cudaDeviceSynchronize();
+    if (status != cudaSuccess)
+        printf("CUDA status = cudaDeviceSynchronize() Error: file: %s: func: %s() line: %d\n", filename, funcname, line);
+    check_error(status, filename, funcname, line);
+}
+
+// Computes a launch grid with at least n threads at BLOCK threads per block.
+//
+// Up to 65535 blocks a 1-D grid is used; beyond that the blocks are spread
+// over a roughly square x-by-y grid. Kernels launched with it must bounds-check
+// their flat id, because the grid may hold more than n threads.
+//
+// n == 0 yields a single block: n-1 would otherwise wrap around in size_t and
+// produce grid dimensions far too large to launch.
+dim3 cuda_gridsize(size_t n){
+    if(n == 0) n = 1;
+    size_t k = (n-1) / BLOCK + 1;
+    size_t x = k;
+    size_t y = 1;
+    if(x > 65535){
+        x = ceil(sqrt(k));
+        y = (n-1)/(x*BLOCK) + 1;
+    }
+    dim3 d;
+    d.x = (unsigned int)x;
+    d.y = (unsigned int)y;
+    d.z = 1;
+    return d;
+}
+
+static cudaStream_t streamsArray[16];    // cudaStreamSynchronize( get_cuda_stream() );
+static int streamInit[16] = { 0 };
+
+// Returns the stream associated with the current device, creating it on first use.
+//
+// With CUDNN defined the stream is created non-blocking, otherwise with
+// cudaStreamCreate. If creation fails, the error is printed and one more
+// non-blocking creation is attempted, whose result goes through CHECK_CUDA.
+cudaStream_t get_cuda_stream() {
+    const int dev = cuda_get_device();
+    if (streamInit[dev]) return streamsArray[dev];
+
+    printf("Create CUDA-stream - %d \n", dev);
+#ifdef CUDNN
+    cudaError_t rc = cudaStreamCreateWithFlags(&streamsArray[dev], cudaStreamNonBlocking);
+#else
+    cudaError_t rc = cudaStreamCreate(&streamsArray[dev]);
+#endif
+    if (rc != cudaSuccess) {
+        printf(" cudaStreamCreate error: %d \n", rc);
+        printf("CUDA Error: %s\n", cudaGetErrorString(rc));
+        rc = cudaStreamCreateWithFlags(&streamsArray[dev], cudaStreamNonBlocking);    // cudaStreamDefault
+        CHECK_CUDA(rc);
+    }
+    streamInit[dev] = 1;
+    return streamsArray[dev];
+}
+
+/*
+static cudaStream_t streamsArray2[16];    // cudaStreamSynchronize( get_cuda_memcpy_stream() );
+static int streamInit2[16] = { 0 };
+
+cudaStream_t get_cuda_memcpy_stream() {
+    int i = cuda_get_device();
+    if (!streamInit2[i]) {
+        printf(" Create COPY stream %d \n", i);
+        //cudaError_t status = cudaStreamCreate(&streamsArray2[i], cudaStreamNonBlocking);
+        cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
+        if (status != cudaSuccess) {
+            printf(" cudaStreamCreate-Memcpy error: %d \n", status);
+            const char *s = cudaGetErrorString(status);
+            printf("CUDA Error: %s\n", s);
+            status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
+            CHECK_CUDA(status);
+        }
+        streamInit2[i] = 1;
+    }
+    return streamsArray2[i];
+}
+*/
+
+#ifdef CUDNN
+static int cudnnInit[16] = { 0 };
+static cudnnHandle_t cudnnHandle[16];
+
+// Returns the cuDNN handle for the current device, creating it on first use
+// and binding it to the stream returned by get_cuda_stream().
+cudnnHandle_t cudnn_handle()
+{
+    int i = cuda_get_device();
+    if(!cudnnInit[i]) {
+        // NOTE(review): the result of cudnnCreate is not checked.
+        cudnnCreate(&cudnnHandle[i]);
+        // Mark the slot initialised BEFORE the CHECK_CUDNN below: when
+        // CUDNN_ERRQUERY_RAWCODE is defined, cudnn_check_error() calls
+        // cudnn_handle() itself, and would re-enter this branch otherwise.
+        cudnnInit[i] = 1;
+        cudnnStatus_t status = cudnnSetStream(cudnnHandle[i], get_cuda_stream());
+        CHECK_CUDNN(status);
+        printf(" Create cudnn-handle %d \n", i);
+    }
+    return cudnnHandle[i];
+}
+
+
+void cudnn_check_error(cudnnStatus_t status, const char * const filename, const char * const function, const int line)
+{
+#if defined(DEBUG) || defined(CUDA_DEBUG)
+    cudaDeviceSynchronize();
+#endif
+    if (cuda_debug_sync) {
+        cudaDeviceSynchronize();
+    }
+    cudnnStatus_t status2 = CUDNN_STATUS_SUCCESS;
+#ifdef CUDNN_ERRQUERY_RAWCODE
+    cudnnStatus_t status_tmp = cudnnQueryRuntimeError(cudnn_handle(), &status2, CUDNN_ERRQUERY_RAWCODE, NULL);
+#endif
+    if (status != CUDNN_STATUS_SUCCESS)
+    {
+        const char *s = cudnnGetErrorString(status);
+        char buffer[256];
+        printf("\n cuDNN Error: %s\n", s);
+        snprintf(buffer, 256, "cuDNN Error: %s", s);
+        error(buffer, filename, function, line);
+    }
+    if (status2 != CUDNN_STATUS_SUCCESS)
+    {
+        const char *s = cudnnGetErrorString(status2);
+        char buffer[256];
+        printf("\n cuDNN Error Prev: %s\n", s);
+        snprintf(buffer, 256, "cuDNN Error Prev: %s", s);
+        error(buffer, filename, function, line);
+    }
+}
+
+// Target of the CHECK_CUDNN macro.
+//
+// Prints the call site and reports when status is a failure. When
+// cuda_debug_sync is set (forced on in DEBUG / CUDA_DEBUG builds) the device
+// is synchronised and a failing synchronisation is printed. The cuDNN status
+// is always handed to cudnn_check_error() at the end.
+void cudnn_check_error_extended(cudnnStatus_t status, const char * const filename, const char * const function, const int line)
+{
+    if (status != CUDNN_STATUS_SUCCESS) {
+        printf("\n cuDNN status Error in: file: %s function: %s() line: %d\n", filename, function, line);
+        cudnn_check_error(status, filename, function, line);
+    }
+#if defined(DEBUG) || defined(CUDA_DEBUG)
+    cuda_debug_sync = 1;
+#endif
+    if (cuda_debug_sync) {
+        // Separate name and the matching enum: this used to shadow the cuDNN
+        // `status` parameter and compare a cudaError_t to CUDNN_STATUS_SUCCESS.
+        cudaError_t sync_status = cudaDeviceSynchronize();
+        if (sync_status != cudaSuccess)
+            printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s function: %s() line: %d\n", filename, function, line);
+    }
+    cudnn_check_error(status, filename, function, line);
+}
+
+static cudnnHandle_t switchCudnnHandle[16];
+static int switchCudnnInit[16];
+#endif
+
+
+void cublas_check_error(cublasStatus_t status)
+{
+#if defined(DEBUG) || defined(CUDA_DEBUG)
+    cudaDeviceSynchronize();
+#endif
+    if (cuda_debug_sync) {
+        cudaDeviceSynchronize();
+    }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        printf("cuBLAS Error\n");
+    }
+}
+
+// Target of the CHECK_CUBLAS macro.
+//
+// Prints the call site when status is a failure. When cuda_debug_sync is set
+// (forced on in DEBUG / CUDA_DEBUG builds) the device is synchronised and a
+// failing synchronisation is printed. The cuBLAS status is always handed to
+// cublas_check_error() at the end.
+void cublas_check_error_extended(cublasStatus_t status, const char * const filename, const char * const function, const int line)
+{
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        printf("\n cuBLAS status Error in: file: %s function: %s() line: %d\n", filename, function, line);
+    }
+#if defined(DEBUG) || defined(CUDA_DEBUG)
+    cuda_debug_sync = 1;
+#endif
+    if (cuda_debug_sync) {
+        // Separate name and the runtime-API enum: this used to shadow the
+        // cuBLAS `status` parameter and compare against the driver-API
+        // constant CUDA_SUCCESS.
+        cudaError_t sync_status = cudaDeviceSynchronize();
+        if (sync_status != cudaSuccess)
+            printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s function: %s() line: %d\n", filename, function, line);
+    }
+    cublas_check_error(status);
+}
+
+static int blasInit[16] = { 0 };
+static cublasHandle_t blasHandle[16];
+
+// Returns the cuBLAS handle for the current device, creating it on first use
+// and binding it to the stream returned by get_cuda_stream().
+cublasHandle_t blas_handle()
+{
+    const int dev = cuda_get_device();
+    if (blasInit[dev]) return blasHandle[dev];
+
+    CHECK_CUBLAS(cublasCreate(&blasHandle[dev]));
+    CHECK_CUBLAS(cublasSetStream(blasHandle[dev], get_cuda_stream()));
+    blasInit[dev] = 1;
+    return blasHandle[dev];
+}
+
+
+static int switchBlasInit[16] = { 0 };
+static cublasHandle_t switchBlasHandle[16];
+
+static cudaStream_t switchStreamsArray[16];
+static int switchStreamInit[16] = { 0 };
+
+// Makes auxiliary stream i the current stream of the active device.
+//
+// Stream i is created (non-blocking) on first use and then installed into
+// streamsArray/streamInit for the current device, so later get_cuda_stream()
+// calls return it. With CUDNN defined, a cuDNN handle bound to stream i is
+// created on first use and installed the same way.
+// NOTE(review): i indexes 16-element arrays and is not range-checked here.
+cudaStream_t switch_stream(int i) {
+    int dev_id = cuda_get_device();
+
+    //printf(" switch_stream = %d \n", i);
+    if (!switchStreamInit[i]) {
+        CHECK_CUDA(cudaStreamCreateWithFlags(&switchStreamsArray[i], cudaStreamNonBlocking));
+        switchStreamInit[i] = 1;
+        printf(" Create stream %d \n", i);
+    }
+
+    //cudaStreamQuery(streamsArray[0]);   // Flush previous stream queue
+    // Overwrites the device's stream slot; the previous stream is not destroyed here.
+    streamsArray[dev_id] = switchStreamsArray[i];
+    streamInit[dev_id] = switchStreamInit[i];
+
+    //printf("switch_stream %d - get_cuda_stream() = %d \n", i, get_cuda_stream());
+
+    /*
+    if (!switchBlasInit[i]) {
+        CHECK_CUDA( cublasCreate(&switchBlasHandle[i]) );
+        switchBlasInit[i] = 1;
+        CHECK_CUDA( cublasSetStream(switchBlasHandle[i], switchStreamsArray[i]) );
+        printf(" Create blas-handle %d \n", i);
+    }
+    blasHandle[dev_id] = switchBlasHandle[i];
+    blasInit[dev_id] = switchBlasInit[i];
+    */
+
+#ifdef CUDNN
+    if (!switchCudnnInit[i]) {
+        CHECK_CUDNN( cudnnCreate(&switchCudnnHandle[i]) );
+        switchCudnnInit[i] = 1;
+        CHECK_CUDNN(cudnnSetStream(switchCudnnHandle[i], switchStreamsArray[i]));
+        printf(" Create cudnn-handle %d \n", i);
+    }
+    cudnnHandle[dev_id] = switchCudnnHandle[i];
+    cudnnInit[dev_id] = switchCudnnInit[i];
+#endif
+
+    return switchStreamsArray[i];
+}
+
+#ifndef cudaEventWaitDefault
+#define cudaEventWaitDefault 0x00
+#endif // cudaEventWaitDefault
+
+static const int max_events = 1024;
+static cudaEvent_t switchEventsArray[1024];
+static volatile int event_counter = 0;
+
+// Makes the current device's stream wait for the work queued so far on
+// auxiliary stream i.
+//
+// A new timing-disabled event is created in the next free slot of
+// switchEventsArray, recorded on stream i, and waited on by
+// streamsArray[dev_id]. Events accumulate until reset_wait_stream_events().
+void wait_stream(int i) {
+    int dev_id = cuda_get_device();
+    // At most max_events events may be outstanding between resets.
+    if (event_counter >= max_events) error("CUDA max_events exceeded", DARKNET_LOC);
+
+    CHECK_CUDA( cudaEventCreateWithFlags(&switchEventsArray[event_counter], cudaEventDisableTiming) );
+    //printf(" create event = %d (wait for stream = %d) \n", event_counter, i);
+
+    //CHECK_CUDA(cudaEventRecordWithFlags(switchEventsArray[i], switchStreamsArray[i], cudaEventRecordExternal) );
+    CHECK_CUDA( cudaEventRecord(switchEventsArray[event_counter], switchStreamsArray[i]) );
+    CHECK_CUDA( cudaStreamWaitEvent(streamsArray[dev_id], switchEventsArray[event_counter], cudaEventWaitDefault) );
+    //cudaStreamWaitEvent(streamsArray[dev_id], switchEventsArray[i], cudaEventWaitExternal);
+    event_counter++;
+}
+
+// Destroys every event created by wait_stream() and resets the event counter.
+void reset_wait_stream_events() {
+    int slot = 0;
+    while (slot < event_counter) {
+        CHECK_CUDA(cudaEventDestroy(switchEventsArray[slot]));
+        ++slot;
+    }
+    event_counter = 0;
+}
+
+
+static float **pinned_ptr = NULL;
+static size_t pinned_num_of_blocks = 0;
+static size_t pinned_index = 0;
+static size_t pinned_block_id = 0;
+static const size_t pinned_block_size = (size_t)1024 * 1024 * 1024 * 1;   // 1 GB block size
+static pthread_mutex_t mutex_pinned = PTHREAD_MUTEX_INITIALIZER;
+
+// Frees every pre-allocated CPU-pinned block and the table that tracks them.
+//
+// The bookkeeping (block count, current block, write cursor) is reset as well.
+// Leaving it stale made the next on-demand allocation grow a fresh table to
+// old_count + 1 entries, with every earlier slot uninitialised.
+void free_pinned_memory()
+{
+    if (pinned_ptr) {
+        size_t k;
+        for (k = 0; k < pinned_num_of_blocks; ++k) {
+            cuda_free_host(pinned_ptr[k]);
+        }
+        free(pinned_ptr);
+        pinned_ptr = NULL;
+    }
+    pinned_num_of_blocks = 0;
+    pinned_block_id = 0;
+    pinned_index = 0;
+}
+
+// Pre-allocates enough pinned_block_size (1 GB) blocks of CPU-pinned memory to
+// cover `size` bytes, rounding up to whole blocks.
+//
+// Does nothing if a block table already exists. The allocation itself runs
+// under mutex_pinned; failures are reported through error() / CHECK_CUDA.
+void pre_allocate_pinned_memory(const size_t size)
+{
+    const size_t num_of_blocks = size / pinned_block_size + ((size % pinned_block_size) ? 1 : 0);
+    printf("pre_allocate... pinned_ptr = %p \n", (void *)pinned_ptr);
+
+    pthread_mutex_lock(&mutex_pinned);
+    if (!pinned_ptr) {
+        pinned_ptr = (float **)calloc(num_of_blocks, sizeof(float *));
+        if(!pinned_ptr) error("calloc failed in pre_allocate()", DARKNET_LOC);
+
+        printf("pre_allocate: size = %zu MB, num_of_blocks = %zu, block_size = %zu MB \n",
+            size / (1024*1024), num_of_blocks, pinned_block_size / (1024 * 1024));
+
+        size_t k;   // same type as num_of_blocks: no signed/unsigned comparison
+        for (k = 0; k < num_of_blocks; ++k) {
+            // cudaHostAllocMapped is the cudaHostAlloc flag; the code used the
+            // cudaHostRegister constant, which has the same numeric value.
+            cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[k], pinned_block_size, cudaHostAllocMapped);
+            if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
+            CHECK_CUDA(status);
+            if (!pinned_ptr[k]) error("cudaHostAlloc failed", DARKNET_LOC);
+            else {
+                printf(" Allocated %zu pinned block \n", pinned_block_size);
+            }
+        }
+        pinned_num_of_blocks = num_of_blocks;
+    }
+    pthread_mutex_unlock(&mutex_pinned);
+}
+
+// Hands out n floats of CPU-pinned memory, carved from the pre-allocated
+// blocks when possible, and optionally fills them with a copy of x.
+//
+// Requests are rounded up to a multiple of 512 bytes. Strategy:
+//   1. small request (< half a block) and room left: bump-allocate from the
+//      current block, moving on to the next block when it is full;
+//   2. otherwise, large request (> half a block): dedicated cudaHostAlloc;
+//   3. otherwise: append a fresh block to the table and allocate from it.
+// The whole operation runs under mutex_pinned. Memory returned here is not
+// freed individually.
+float *cuda_make_array_pinned_preallocated(float *x, size_t n)
+{
+    pthread_mutex_lock(&mutex_pinned);
+    float *x_cpu = NULL;
+    const size_t memory_step = 512;// 4096;
+    const size_t size = sizeof(float)*n;
+    const size_t allocation_size = ((size / memory_step) + 1) * memory_step;
+
+    if (pinned_ptr && pinned_block_id < pinned_num_of_blocks && (allocation_size < pinned_block_size/2))
+    {
+        if ((allocation_size + pinned_index) > pinned_block_size) {
+            const float filled = (float)100 * pinned_index / pinned_block_size;
+            printf("\n Pinned block_id = %zu, filled = %f %% \n", pinned_block_id, filled);
+            pinned_block_id++;
+            pinned_index = 0;
+        }
+        if ((allocation_size + pinned_index) < pinned_block_size && pinned_block_id < pinned_num_of_blocks) {
+            x_cpu = (float *)((char *)pinned_ptr[pinned_block_id] + pinned_index);
+            pinned_index += allocation_size;
+        }
+        else {
+            //printf("Pre-allocated pinned memory is over! \n");
+        }
+    }
+
+    if(!x_cpu) {
+        if (allocation_size > pinned_block_size / 2) {
+            printf("Try to allocate new pinned memory, size = %zu MB \n", size / (1024 * 1024));
+            cudaError_t status = cudaHostAlloc((void **)&x_cpu, size, cudaHostAllocMapped);
+            if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM (pre-allocated memory is over too) \n");
+            CHECK_CUDA(status);
+        }
+        else {
+            printf("Try to allocate new pinned BLOCK, size = %zu MB \n", size / (1024 * 1024));
+            // Grow through a temporary so a failed realloc neither loses the
+            // old table nor gets dereferenced.
+            float **grown = (float **)realloc(pinned_ptr, (pinned_num_of_blocks + 1) * sizeof(float *));
+            if (!grown) error("realloc failed in cuda_make_array_pinned_preallocated()", DARKNET_LOC);
+            pinned_ptr = grown;
+            pinned_num_of_blocks++;
+            pinned_block_id = pinned_num_of_blocks - 1;
+            cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[pinned_block_id], pinned_block_size, cudaHostAllocMapped);
+            if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
+            CHECK_CUDA(status);
+            x_cpu = pinned_ptr[pinned_block_id];
+            // Reserve the range just handed out. The cursor used to stay at 0,
+            // so the next small request received this very same address.
+            pinned_index = allocation_size;
+        }
+    }
+
+    if (x) {
+        cudaError_t status = cudaMemcpyAsync(x_cpu, x, size, cudaMemcpyDefault, get_cuda_stream());
+        CHECK_CUDA(status);
+    }
+
+    pthread_mutex_unlock(&mutex_pinned);
+    return x_cpu;
+}
+
+// Allocates n floats of CPU-pinned memory with cudaHostAlloc and, when x is
+// non-NULL, queues an asynchronous copy of x into it on the current stream.
+// Allocation failures are reported through CHECK_CUDA / error().
+float *cuda_make_array_pinned(float *x, size_t n)
+{
+    float *x_gpu = NULL;
+    size_t size = sizeof(float)*n;
+    // cudaHostAllocMapped is the cudaHostAlloc flag; the code used the
+    // cudaHostRegister constant, which has the same numeric value.
+    cudaError_t status = cudaHostAlloc((void **)&x_gpu, size, cudaHostAllocMapped);
+    if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM \n");
+    CHECK_CUDA(status);
+    // Validate the pointer before it is used as a copy destination.
+    if (!x_gpu) error("cudaHostAlloc failed", DARKNET_LOC);
+    if (x) {
+        status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
+        CHECK_CUDA(status);
+    }
+    return x_gpu;
+}
+
+// Allocates n floats of device memory and, when x is non-NULL, queues an
+// asynchronous copy of x into it on the current stream.
+// Allocation failures print a hint and are reported through CHECK_CUDA / error().
+float *cuda_make_array(float *x, size_t n)
+{
+    float *device_buf;
+    const size_t bytes = sizeof(float)*n;
+
+    cudaError_t rc = cudaMalloc((void **)&device_buf, bytes);
+    if (rc != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
+    CHECK_CUDA(rc);
+
+    if (x != NULL) {
+        rc = cudaMemcpyAsync(device_buf, x, bytes, cudaMemcpyDefault, get_cuda_stream());
+        CHECK_CUDA(rc);
+    }
+    if (device_buf == NULL) error("Cuda malloc failed", DARKNET_LOC);
+    return device_buf;
+}
+
+// Allocates a device array of n pointers and, when x is non-NULL, queues an
+// asynchronous copy of x into it on the current stream.
+// Allocation failures print a hint and are reported through CHECK_CUDA / error().
+void **cuda_make_array_pointers(void **x, size_t n)
+{
+    void **device_buf;
+    const size_t bytes = sizeof(void*) * n;
+
+    cudaError_t rc = cudaMalloc((void **)&device_buf, bytes);
+    if (rc != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
+    CHECK_CUDA(rc);
+
+    if (x != NULL) {
+        rc = cudaMemcpyAsync(device_buf, x, bytes, cudaMemcpyDefault, get_cuda_stream());
+        CHECK_CUDA(rc);
+    }
+    if (device_buf == NULL) error("Cuda malloc failed", DARKNET_LOC);
+    return device_buf;
+}
+
+// Fills x_gpu with n uniform random floats using cuRAND.
+// A generator per device is created on first use and seeded with time(0).
+void cuda_random(float *x_gpu, size_t n)
+{
+    static curandGenerator_t gen[16];
+    static int init[16] = {0};
+
+    const int dev = cuda_get_device();
+    if(init[dev] == 0){
+        curandCreateGenerator(&gen[dev], CURAND_RNG_PSEUDO_DEFAULT);
+        curandSetPseudoRandomGeneratorSeed(gen[dev], time(0));
+        init[dev] = 1;
+    }
+    curandGenerateUniform(gen[dev], x_gpu, n);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// Debug helper: pulls n floats from x_gpu, combines them with the host array x
+// via axpy_cpu(n, -1, x, ...) and returns dot_cpu of the result with itself.
+// Also prints sqrt(result / n) labelled with s.
+// NOTE(review): presumably the sum of squared differences and its RMS; this
+// depends on axpy_cpu / dot_cpu, which are defined elsewhere.
+float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
+{
+    float* host_copy = (float*)xcalloc(n, sizeof(float));
+    cuda_pull_array(x_gpu, host_copy, n);
+
+    axpy_cpu(n, -1, x, 1, host_copy, 1);
+    const float err = dot_cpu(n, host_copy, 1, host_copy, 1);
+
+    printf("Error %s: %f\n", s, sqrt(err/n));
+    free(host_copy);
+    return err;
+}
+
+// Allocates n ints of device memory; the contents are left unset by this function.
+// Allocation failures print a hint and are reported through CHECK_CUDA.
+int *cuda_make_int_array(size_t n)
+{
+    int *device_buf;
+    const size_t bytes = sizeof(int)*n;
+    const cudaError_t rc = cudaMalloc((void **)&device_buf, bytes);
+    if(rc != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
+    CHECK_CUDA(rc);
+    return device_buf;
+}
+
+// Allocates n ints of device memory and, when x is non-NULL, queues an
+// asynchronous host-to-device copy of x into it on the current stream.
+int *cuda_make_int_array_new_api(int *x, size_t n)
+{
+    int *device_buf;
+    const size_t bytes = sizeof(int)*n;
+
+    cudaError_t rc = cudaMalloc((void **)&device_buf, bytes);
+    CHECK_CUDA(rc);
+
+    if (x != NULL) {
+        rc = cudaMemcpyAsync(device_buf, x, bytes, cudaMemcpyHostToDevice, get_cuda_stream());
+        CHECK_CUDA(rc);
+    }
+    if (device_buf == NULL) error("Cuda malloc failed", DARKNET_LOC);
+    return device_buf;
+}
+
+// Releases device memory with cudaFree; errors go through CHECK_CUDA.
+void cuda_free(float *x_gpu)
+{
+    CHECK_CUDA(cudaFree(x_gpu));
+}
+
+// Releases CPU-pinned memory with cudaFreeHost; errors go through CHECK_CUDA.
+void cuda_free_host(float *x_cpu)
+{
+    CHECK_CUDA(cudaFreeHost(x_cpu));
+}
+
+// Queues an asynchronous host-to-device copy of n floats from x to x_gpu on
+// the current stream. Does not wait for the copy to finish.
+void cuda_push_array(float *x_gpu, float *x, size_t n)
+{
+    const size_t bytes = sizeof(float)*n;
+    CHECK_CUDA(cudaMemcpyAsync(x_gpu, x, bytes, cudaMemcpyHostToDevice, get_cuda_stream()));
+}
+
+// Copies n floats from device (x_gpu) to host (x) on the current stream and
+// waits for that stream to finish before returning.
+void cuda_pull_array(float *x_gpu, float *x, size_t n)
+{
+    const size_t bytes = sizeof(float)*n;
+    const cudaError_t rc = cudaMemcpyAsync(x, x_gpu, bytes, cudaMemcpyDeviceToHost, get_cuda_stream());
+    CHECK_CUDA(rc);
+    cudaStreamSynchronize(get_cuda_stream());
+}
+
+// Queues an asynchronous copy of n floats from x_gpu to x on the current
+// stream. Does not synchronise; the caller must do so before reading x.
+void cuda_pull_array_async(float *x_gpu, float *x, size_t n)
+{
+    const size_t bytes = sizeof(float)*n;
+    const cudaError_t rc = cudaMemcpyAsync(x, x_gpu, bytes, cudaMemcpyDefault, get_cuda_stream());
+    check_error(rc, DARKNET_LOC);
+}
+
+// Number of block_size-sized blocks needed to cover array_size elements
+// (integer division, rounded up when there is a positive remainder).
+int get_number_of_blocks(int array_size, int block_size)
+{
+    int blocks = array_size / block_size;
+    if (array_size % block_size > 0) {
+        blocks += 1;
+    }
+    return blocks;
+}
+
+// Returns the compute capability of device i as major*100 + minor*10
+// (the __CUDA_ARCH__ format). When device_name is non-NULL the device name is
+// copied into it.
+// NOTE(review): the copy is an unbounded strcpy; the caller's buffer must be
+// large enough for prop.name.
+int get_gpu_compute_capability(int i, char *device_name)
+{
+    struct cudaDeviceProp prop;
+    CHECK_CUDA(cudaGetDeviceProperties(&prop, i));
+    if (device_name != NULL) strcpy(device_name, prop.name);
+    return prop.major * 100 + prop.minor * 10;
+}
+
+// Prints a one-line summary to stderr: CUDA runtime and driver versions,
+// cuDNN version and CUDNN_HALF when compiled in, and the number of GPUs.
+// Warns when the runtime version is newer than the driver version.
+void show_cuda_cudnn_info()
+{
+    int runtime_version = 0;
+    int driver_version = 0;
+    int gpu_count = 0;
+
+    CHECK_CUDA(cudaRuntimeGetVersion(&runtime_version));
+    CHECK_CUDA(cudaDriverGetVersion(&driver_version));
+    fprintf(stderr, " CUDA-version: %d (%d)", runtime_version, driver_version);
+    if(runtime_version > driver_version) fprintf(stderr, "\n Warning: CUDA-version is higher than Driver-version! \n");
+#ifdef CUDNN
+    fprintf(stderr, ", cuDNN: %d.%d.%d", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
+#endif  // CUDNN
+#ifdef CUDNN_HALF
+    fprintf(stderr, ", CUDNN_HALF=1");
+#endif  // CUDNN_HALF
+    CHECK_CUDA(cudaGetDeviceCount(&gpu_count));
+    fprintf(stderr, ", GPU count: %d ", gpu_count);
+    fprintf(stderr, " \n");
+}
+
+#else // GPU
+#include "darknet.h"
+// Build without GPU support: selecting a device is a no-op.
+void cuda_set_device(int n) {}
+#endif // GPU
diff --git a/darknet-master/src/dark_cuda.h b/darknet-master/src/dark_cuda.h
new file mode 100644
index 0000000..ffe3836
--- /dev/null
+++ b/darknet-master/src/dark_cuda.h
@@ -0,0 +1,118 @@
+#ifndef DARKCUDA_H
+#define DARKCUDA_H
+#include "darknet.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+extern int cuda_debug_sync;
+extern int gpu_index;
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#ifdef GPU
+
+#define BLOCK 512
+#define FULL_MASK 0xffffffff
+#define WARP_SIZE 32
+#define BLOCK_TRANSPOSE32 256
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <cuda_runtime_api.h>
+
+#ifdef CUDA_OPENGL_INTEGRATION
+// On Windows, we need to include <windows.h> before
+// including OpenGL headers or else we will get various
+// compiler errors due to missing macros.
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif // _WIN32
+
+#include <cuda_gl_interop.h>
+#endif // CUDA_OPENGL_INTEGRATION
+//#include <driver_types.h>
+
+#ifdef CUDNN
+#include <cudnn.h>
+#endif // CUDNN
+
+#ifndef __DATE__
+#define __DATE__
+#endif
+
+#ifndef __TIME__
+#define __TIME__
+#endif
+
+#ifndef __FUNCTION__
+#define __FUNCTION__
+#endif
+
+#ifndef __LINE__
+#define __LINE__ 0
+#endif
+
+#ifndef __FILE__
+#define __FILE__
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+    // Error-reporting helpers: each receives a status code together with the
+    // file name, function name and line number of the call site.
+    void check_error(cudaError_t status, const char * const filename, const char * const funcname, const int line);
+    void check_error_extended(cudaError_t status, const char * const filename, const char * const funcname, const int line);
+    void cublas_check_error_extended(cublasStatus_t status, const char * const filename, const char * const funcname, const int line);
+// Wrappers that pass the call-site location automatically.
+// NOTE: both expansions already end with ';', so "CHECK_CUDA(x);" produces an
+// extra empty statement -- use braces around any if/else body that contains them.
+#define CHECK_CUDA(X) check_error_extended(X, __FILE__, __func__, __LINE__ );
+#define CHECK_CUBLAS(X) cublas_check_error_extended(X, __FILE__, __func__, __LINE__ );
+
+    // Handles, memory and array helpers.
+    cublasHandle_t blas_handle();
+    void free_pinned_memory();
+    void pre_allocate_pinned_memory(size_t size);
+    float *cuda_make_array_pinned_preallocated(float *x, size_t n);
+    float *cuda_make_array_pinned(float *x, size_t n);
+    float *cuda_make_array(float *x, size_t n);
+    void **cuda_make_array_pointers(void **x, size_t n);
+    int *cuda_make_int_array(size_t n);
+    int *cuda_make_int_array_new_api(int *x, size_t n);
+    void cuda_push_array(float *x_gpu, float *x, size_t n);
+    //LIB_API void cuda_pull_array(float *x_gpu, float *x, size_t n);
+    //LIB_API void cuda_set_device(int n);
+    int cuda_get_device();
+    void cuda_free_host(float *x_cpu);
+    void cuda_free(float *x_gpu);
+    void cuda_random(float *x_gpu, size_t n);
+    float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
+    dim3 cuda_gridsize(size_t n);
+    cudaStream_t get_cuda_stream();
+    //cudaStream_t get_cuda_memcpy_stream();
+    // ceil(array_size / block_size) for positive arguments.
+    int get_number_of_blocks(int array_size, int block_size);
+    // Returns major*100 + minor*10; optionally copies the device name out.
+    int get_gpu_compute_capability(int i, char *device_name);
+    // Prints CUDA / cuDNN version information and the GPU count to stderr.
+    void show_cuda_cudnn_info();
+
+    // Stream switching / waiting helpers.
+    cudaStream_t switch_stream(int i);
+    void wait_stream(int i);
+    void reset_wait_stream_events();
+
+#ifdef CUDNN
+cudnnHandle_t cudnn_handle();
+enum {cudnn_fastest, cudnn_smallest, cudnn_specify};
+
+void cudnn_check_error_extended(cudnnStatus_t status, const char * const filename, const char * const function, const int line);
+// Same trailing-';' caveat as CHECK_CUDA above.
+#define CHECK_CUDNN(X) cudnn_check_error_extended(X, __FILE__, __func__, __LINE__);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#else // GPU
+//LIB_API void cuda_set_device(int n);
+#endif // GPU
+#endif // DARKCUDA_H
diff --git a/darknet-master/src/darknet.c b/darknet-master/src/darknet.c
new file mode 100644
index 0000000..392f2e4
--- /dev/null
+++ b/darknet-master/src/darknet.c
@@ -0,0 +1,554 @@
+#include "darknet.h"
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#if defined(_MSC_VER) && defined(_DEBUG)
+#include <crtdbg.h>
+#endif
+
+#include "parser.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "connected_layer.h"
+
+
+extern void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top);
+extern void run_voxel(int argc, char **argv);
+extern void run_yolo(int argc, char **argv);
+extern void run_detector(int argc, char **argv);
+extern void run_coco(int argc, char **argv);
+extern void run_writing(int argc, char **argv);
+extern void run_captcha(int argc, char **argv);
+extern void run_nightmare(int argc, char **argv);
+extern void run_dice(int argc, char **argv);
+extern void run_compare(int argc, char **argv);
+extern void run_classifier(int argc, char **argv);
+extern void run_char_rnn(int argc, char **argv);
+extern void run_vid_rnn(int argc, char **argv);
+extern void run_tag(int argc, char **argv);
+extern void run_cifar(int argc, char **argv);
+extern void run_go(int argc, char **argv);
+extern void run_art(int argc, char **argv);
+extern void run_super(int argc, char **argv);
+
+// Average the weights of several weight files that share one network cfg
+// and save the result.
+//
+// Command line layout:
+//   argv[2]   network cfg file
+//   argv[3]   output weights file
+//   argv[4]   first weights file (loaded as the running sum)
+//   argv[5..] further weights files added to the sum
+//
+// Only CONVOLUTIONAL layers (biases, weights and, with batch_normalize,
+// scales / rolling_mean / rolling_variance) and CONNECTED layers (biases,
+// weights) are accumulated; every accumulated array is then divided by the
+// number of weight files. Forces CPU mode by setting gpu_index to -1.
+void average(int argc, char *argv[])
+{
+    // argv[2], argv[3] and argv[4] are read unconditionally below, so make
+    // sure they exist instead of dereferencing past the argument list.
+    if (argc < 5) {
+        fprintf(stderr, "usage: %s average <cfg> <outfile> <weights> [<weights> ...]\n", argv[0]);
+        return;
+    }
+
+    char *cfgfile = argv[2];
+    char *outfile = argv[3];
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    network sum = parse_network_cfg(cfgfile);
+
+    char *weightfile = argv[4];
+    load_weights(&sum, weightfile);
+
+    int i, j;
+    int n = argc - 5;   // number of additional weight files after argv[4]
+    for(i = 0; i < n; ++i){
+        weightfile = argv[i+5];
+        load_weights(&net, weightfile);
+        for(j = 0; j < net.n; ++j){
+            layer l = net.layers[j];
+            layer out = sum.layers[j];
+            if(l.type == CONVOLUTIONAL){
+                int num = l.n*l.c*l.size*l.size;
+                axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
+                axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
+                if(l.batch_normalize){
+                    axpy_cpu(l.n, 1, l.scales, 1, out.scales, 1);
+                    axpy_cpu(l.n, 1, l.rolling_mean, 1, out.rolling_mean, 1);
+                    axpy_cpu(l.n, 1, l.rolling_variance, 1, out.rolling_variance, 1);
+                }
+            }
+            if(l.type == CONNECTED){
+                axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
+                axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
+            }
+        }
+    }
+    n = n+1;            // total number of weight files, including argv[4]
+    for(j = 0; j < net.n; ++j){
+        layer l = sum.layers[j];
+        if(l.type == CONVOLUTIONAL){
+            int num = l.n*l.c*l.size*l.size;
+            scal_cpu(l.n, 1./n, l.biases, 1);
+            scal_cpu(num, 1./n, l.weights, 1);
+            if(l.batch_normalize){
+                scal_cpu(l.n, 1./n, l.scales, 1);
+                scal_cpu(l.n, 1./n, l.rolling_mean, 1);
+                scal_cpu(l.n, 1./n, l.rolling_variance, 1);
+            }
+        }
+        if(l.type == CONNECTED){
+            scal_cpu(l.outputs, 1./n, l.biases, 1);
+            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
+        }
+    }
+    save_weights(sum, outfile);
+}
+
+// Time repeated forward passes of the network described by cfgfile.
+// tics is the number of evaluations (0 selects 1000). The batch size is
+// set to 1 and the input is the image returned by make_image(). Timing
+// uses time(), so the resolution is one second.
+void speed(char *cfgfile, int tics)
+{
+    const int evals = (tics == 0) ? 1000 : tics;
+    network net = parse_network_cfg(cfgfile);
+    set_batch_network(&net, 1);
+
+    const time_t begin = time(0);
+    image input = make_image(net.w, net.h, net.c);
+    int done = 0;
+    while (done < evals) {
+        network_predict(net, input.data);
+        ++done;
+    }
+    const double elapsed = difftime(time(0), begin);
+
+    printf("\n%d evals, %f Seconds\n", evals, elapsed);
+    printf("Speed: %f sec/eval\n", elapsed/evals);
+    printf("Speed: %f Hz\n", evals/elapsed);
+}
+
+// Multiply-accumulate cost (2 * inputs * outputs) of one fully connected
+// sub-layer, computed in 64-bit arithmetic.
+static long long connected_ops(const layer *sub)
+{
+    return 2LL * sub->inputs * sub->outputs;
+}
+
+// Print an estimate of the floating point operations of one forward pass
+// of the network described by cfgfile.
+//
+// Counted layers: CONVOLUTIONAL, CONNECTED, and the connected sub-layers of
+// RNN, GRU and LSTM layers. Other layer types contribute nothing.
+// The total is accumulated in a long long: a plain long is only 32 bits
+// wide on LLP64 platforms (e.g. Windows) and overflows for ordinary
+// networks, which need several billion operations.
+// Forces CPU mode by setting gpu_index to -1.
+void operations(char *cfgfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    int i;
+    long long ops = 0;
+    for(i = 0; i < net.n; ++i){
+        layer l = net.layers[i];
+        if(l.type == CONVOLUTIONAL){
+            ops += 2LL * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
+        } else if(l.type == CONNECTED){
+            ops += 2LL * l.inputs * l.outputs;
+        } else if (l.type == RNN){
+            ops += connected_ops(l.input_layer);
+            ops += connected_ops(l.self_layer);
+            ops += connected_ops(l.output_layer);
+        } else if (l.type == GRU){
+            ops += connected_ops(l.uz);
+            ops += connected_ops(l.uh);
+            ops += connected_ops(l.ur);
+            ops += connected_ops(l.wz);
+            ops += connected_ops(l.wh);
+            ops += connected_ops(l.wr);
+        } else if (l.type == LSTM){
+            ops += connected_ops(l.uf);
+            ops += connected_ops(l.ui);
+            ops += connected_ops(l.ug);
+            ops += connected_ops(l.uo);
+            ops += connected_ops(l.wf);
+            ops += connected_ops(l.wi);
+            ops += connected_ops(l.wg);
+            ops += connected_ops(l.wo);
+        }
+    }
+    printf("Floating Point Operations: %lld\n", ops);
+    printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
+}
+
+// One-off weight conversion acting on the second-to-last layer of the
+// network (net.layers[net.n - 2]); the result is written to outfile.
+//
+// Steps, as performed below:
+//   1. Temporarily set that layer's filter count n to 9372 and advance its
+//      biases pointer by 5 and its weights pointer by 5*c, so load_weights()
+//      reads that layer with the altered count and shifted destinations.
+//   2. Restore the pointers and the original n.
+//   3. Copy the first third of the biases / weights over the second and the
+//      third third.
+//   4. Reset the seen and cur_iteration counters and save.
+//
+// NOTE(review): there is no check that net.n >= 2, nor that the layer's
+// buffers can hold what is loaded in step 1 -- presumably this was written
+// for one specific cfg/weights pair; confirm before reusing.
+// Forces CPU mode by setting gpu_index to -1.
+void oneoff(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    int oldn = net.layers[net.n - 2].n;
+    int c = net.layers[net.n - 2].c;
+    // Step 1: altered filter count and shifted destination pointers.
+    net.layers[net.n - 2].n = 9372;
+    net.layers[net.n - 2].biases += 5;
+    net.layers[net.n - 2].weights += 5*c;
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    // Step 2: undo the temporary changes.
+    net.layers[net.n - 2].biases -= 5;
+    net.layers[net.n - 2].weights -= 5*c;
+    net.layers[net.n - 2].n = oldn;
+    printf("%d\n", oldn);
+    layer l = net.layers[net.n - 2];
+    // Step 3: replicate the first third into the second and third thirds.
+    copy_cpu(l.n/3, l.biases, 1, l.biases +   l.n/3, 1);
+    copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
+    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights +   l.n/3*l.c, 1);
+    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
+    // Step 4: reset training counters before saving.
+    *net.seen = 0;
+    *net.cur_iteration = 0;
+    save_weights(net, outfile);
+}
+
+// Load weights up to the cutoff max (when weightfile is given), reset the
+// seen / cur_iteration counters, and save weights up to the same cutoff
+// into outfile. Forces CPU mode by setting gpu_index to -1.
+void partial(char *cfgfile, char *weightfile, char *outfile, int max)
+{
+    gpu_index = -1;
+
+    network net = parse_network_cfg_custom(cfgfile, 1, 1);
+    if (weightfile != 0) {
+        load_weights_upto(&net, weightfile, max);
+    }
+
+    // Start the saved file with fresh training counters.
+    *net.cur_iteration = 0;
+    *net.seen = 0;
+
+    save_weights_upto(net, outfile, max, 0);
+}
+
+#include "convolutional_layer.h"
+// Call rescale_weights(layer, 2, -.5) on the first CONVOLUTIONAL layer of
+// the network (if there is one) and save the weights to outfile.
+// Forces CPU mode by setting gpu_index to -1.
+void rescale_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile) load_weights(&net, weightfile);
+
+    // Locate the first convolutional layer.
+    int idx = 0;
+    while (idx < net.n && net.layers[idx].type != CONVOLUTIONAL) {
+        ++idx;
+    }
+    if (idx < net.n) {
+        rescale_weights(net.layers[idx], 2, -.5);
+    }
+
+    save_weights(net, outfile);
+}
+
+// Call rgbgr_weights() on the first CONVOLUTIONAL layer of the network (if
+// there is one) and save the weights to outfile.
+// Forces CPU mode by setting gpu_index to -1.
+void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile) load_weights(&net, weightfile);
+
+    // Locate the first convolutional layer.
+    int idx = 0;
+    while (idx < net.n && net.layers[idx].type != CONVOLUTIONAL) {
+        ++idx;
+    }
+    if (idx < net.n) {
+        rgbgr_weights(net.layers[idx]);
+    }
+
+    save_weights(net, outfile);
+}
+
+// Run the denormalize routine on every batch-normalized CONVOLUTIONAL,
+// CONNECTED, GRU and LSTM layer (for GRU/LSTM: on each of their connected
+// sub-layers) and save the weights to outfile. Unlike denormalize_net(),
+// the batch_normalize flags are left untouched.
+// Forces CPU mode by setting gpu_index to -1.
+void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile) load_weights(&net, weightfile);
+
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        layer cur = net.layers[idx];
+        if (!cur.batch_normalize) continue;
+
+        switch (cur.type) {
+        case CONVOLUTIONAL:
+            denormalize_convolutional_layer(cur);
+            break;
+        case CONNECTED:
+            denormalize_connected_layer(cur);
+            break;
+        case GRU:
+            denormalize_connected_layer(*cur.input_z_layer);
+            denormalize_connected_layer(*cur.input_r_layer);
+            denormalize_connected_layer(*cur.input_h_layer);
+            denormalize_connected_layer(*cur.state_z_layer);
+            denormalize_connected_layer(*cur.state_r_layer);
+            denormalize_connected_layer(*cur.state_h_layer);
+            break;
+        case LSTM:
+            denormalize_connected_layer(*cur.wf);
+            denormalize_connected_layer(*cur.wi);
+            denormalize_connected_layer(*cur.wg);
+            denormalize_connected_layer(*cur.wo);
+            denormalize_connected_layer(*cur.uf);
+            denormalize_connected_layer(*cur.ui);
+            denormalize_connected_layer(*cur.ug);
+            denormalize_connected_layer(*cur.uo);
+            break;
+        default:
+            break;
+        }
+    }
+    save_weights(net, outfile);
+}
+
+// Return a copy of l with batch normalization switched on and freshly
+// allocated per-output arrays of length n: scales filled with 1,
+// rolling_mean and rolling_variance zero-initialised by xcalloc.
+// Any arrays the incoming layer already pointed to are not freed here.
+layer normalize_layer(layer l, int n)
+{
+    float *unit_scales = (float*)xcalloc(n, sizeof(float));
+    int k = 0;
+    while (k < n) {
+        unit_scales[k] = 1;
+        ++k;
+    }
+
+    l.batch_normalize = 1;
+    l.scales = unit_scales;
+    l.rolling_mean = (float*)xcalloc(n, sizeof(float));
+    l.rolling_variance = (float*)xcalloc(n, sizeof(float));
+    return l;
+}
+
+// Add batch normalization to every layer of the network that does not have
+// it yet and save the weights to outfile.
+//
+// CONVOLUTIONAL and CONNECTED layers are replaced by normalize_layer();
+// for GRU and LSTM layers each connected sub-layer is replaced and the
+// parent's batch_normalize flag is set.
+//
+// The GRU/LSTM branches must test !l.batch_normalize like the other two:
+// the previous test (l.batch_normalize) skipped exactly the layers that
+// needed normalization and, for layers that already had it, overwrote the
+// loaded scales / rolling statistics with fresh ones (leaking the old
+// arrays) before redundantly setting a flag that was already 1.
+// Forces CPU mode by setting gpu_index to -1.
+void normalize_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int i;
+    for(i = 0; i < net.n; ++i){
+        layer l = net.layers[i];
+        if(l.type == CONVOLUTIONAL && !l.batch_normalize){
+            net.layers[i] = normalize_layer(l, l.n);
+        }
+        if (l.type == CONNECTED && !l.batch_normalize) {
+            net.layers[i] = normalize_layer(l, l.outputs);
+        }
+        if (l.type == GRU && !l.batch_normalize) {
+            *l.input_z_layer = normalize_layer(*l.input_z_layer, l.input_z_layer->outputs);
+            *l.input_r_layer = normalize_layer(*l.input_r_layer, l.input_r_layer->outputs);
+            *l.input_h_layer = normalize_layer(*l.input_h_layer, l.input_h_layer->outputs);
+            *l.state_z_layer = normalize_layer(*l.state_z_layer, l.state_z_layer->outputs);
+            *l.state_r_layer = normalize_layer(*l.state_r_layer, l.state_r_layer->outputs);
+            *l.state_h_layer = normalize_layer(*l.state_h_layer, l.state_h_layer->outputs);
+            net.layers[i].batch_normalize=1;
+        }
+        if (l.type == LSTM && !l.batch_normalize) {
+            *l.wf = normalize_layer(*l.wf, l.wf->outputs);
+            *l.wi = normalize_layer(*l.wi, l.wi->outputs);
+            *l.wg = normalize_layer(*l.wg, l.wg->outputs);
+            *l.wo = normalize_layer(*l.wo, l.wo->outputs);
+            *l.uf = normalize_layer(*l.uf, l.uf->outputs);
+            *l.ui = normalize_layer(*l.ui, l.ui->outputs);
+            *l.ug = normalize_layer(*l.ug, l.ug->outputs);
+            *l.uo = normalize_layer(*l.uo, l.uo->outputs);
+            net.layers[i].batch_normalize=1;
+        }
+    }
+    save_weights(net, outfile);
+}
+
+// Print statistics_connected_layer() output for every batch-normalized
+// CONNECTED layer and for each connected sub-layer of batch-normalized GRU
+// and LSTM layers. A blank line is printed after every layer of the
+// network, whether or not anything was reported for it.
+// Forces CPU mode by setting gpu_index to -1.
+void statistics_net(char *cfgfile, char *weightfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile) load_weights(&net, weightfile);
+
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        layer cur = net.layers[idx];
+        if (cur.batch_normalize) {
+            switch (cur.type) {
+            case CONNECTED:
+                printf("Connected Layer %d\n", idx);
+                statistics_connected_layer(cur);
+                break;
+            case GRU:
+                printf("GRU Layer %d\n", idx);
+                printf("Input Z\n");
+                statistics_connected_layer(*cur.input_z_layer);
+                printf("Input R\n");
+                statistics_connected_layer(*cur.input_r_layer);
+                printf("Input H\n");
+                statistics_connected_layer(*cur.input_h_layer);
+                printf("State Z\n");
+                statistics_connected_layer(*cur.state_z_layer);
+                printf("State R\n");
+                statistics_connected_layer(*cur.state_r_layer);
+                printf("State H\n");
+                statistics_connected_layer(*cur.state_h_layer);
+                break;
+            case LSTM:
+                printf("LSTM Layer %d\n", idx);
+                printf("wf\n");
+                statistics_connected_layer(*cur.wf);
+                printf("wi\n");
+                statistics_connected_layer(*cur.wi);
+                printf("wg\n");
+                statistics_connected_layer(*cur.wg);
+                printf("wo\n");
+                statistics_connected_layer(*cur.wo);
+                printf("uf\n");
+                statistics_connected_layer(*cur.uf);
+                printf("ui\n");
+                statistics_connected_layer(*cur.ui);
+                printf("ug\n");
+                statistics_connected_layer(*cur.ug);
+                printf("uo\n");
+                statistics_connected_layer(*cur.uo);
+                break;
+            default:
+                break;
+            }
+        }
+        // Separator line, emitted for every layer.
+        printf("\n");
+    }
+}
+
+// Remove batch normalization from a network and save the weights to outfile.
+//
+// For every batch-normalized CONVOLUTIONAL / CONNECTED layer the matching
+// denormalize routine is called and the layer's batch_normalize flag is
+// cleared. For GRU and LSTM layers each connected sub-layer is
+// denormalized and has its flag cleared, then the parent flag is cleared.
+//
+// The last branch handles LSTM layers. It previously tested for GRU a
+// second time: because `l` is a by-value copy whose batch_normalize is
+// still set, a GRU layer fell into it and dereferenced the LSTM gate
+// pointers (wf, wi, ...), while real LSTM layers were never denormalized.
+// Forces CPU mode by setting gpu_index to -1.
+void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        // Copy of the layer: flag changes must go through net.layers[i].
+        layer l = net.layers[i];
+        if (l.type == CONVOLUTIONAL && l.batch_normalize) {
+            denormalize_convolutional_layer(l);
+            net.layers[i].batch_normalize=0;
+        }
+        if (l.type == CONNECTED && l.batch_normalize) {
+            denormalize_connected_layer(l);
+            net.layers[i].batch_normalize=0;
+        }
+        if (l.type == GRU && l.batch_normalize) {
+            denormalize_connected_layer(*l.input_z_layer);
+            denormalize_connected_layer(*l.input_r_layer);
+            denormalize_connected_layer(*l.input_h_layer);
+            denormalize_connected_layer(*l.state_z_layer);
+            denormalize_connected_layer(*l.state_r_layer);
+            denormalize_connected_layer(*l.state_h_layer);
+            l.input_z_layer->batch_normalize = 0;
+            l.input_r_layer->batch_normalize = 0;
+            l.input_h_layer->batch_normalize = 0;
+            l.state_z_layer->batch_normalize = 0;
+            l.state_r_layer->batch_normalize = 0;
+            l.state_h_layer->batch_normalize = 0;
+            net.layers[i].batch_normalize=0;
+        }
+        if (l.type == LSTM && l.batch_normalize) {
+            denormalize_connected_layer(*l.wf);
+            denormalize_connected_layer(*l.wi);
+            denormalize_connected_layer(*l.wg);
+            denormalize_connected_layer(*l.wo);
+            denormalize_connected_layer(*l.uf);
+            denormalize_connected_layer(*l.ui);
+            denormalize_connected_layer(*l.ug);
+            denormalize_connected_layer(*l.uo);
+            l.wf->batch_normalize = 0;
+            l.wi->batch_normalize = 0;
+            l.wg->batch_normalize = 0;
+            l.wo->batch_normalize = 0;
+            l.uf->batch_normalize = 0;
+            l.ui->batch_normalize = 0;
+            l.ug->batch_normalize = 0;
+            l.uo->batch_normalize = 0;
+            net.layers[i].batch_normalize=0;
+        }
+    }
+    save_weights(net, outfile);
+}
+
+// Parse the network, optionally load weights, and hand it to
+// visualize_network(). With OPENCV defined, wait for a key press afterwards.
+void visualize(char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg(cfgfile);
+    if (weightfile != 0) {
+        load_weights(&net, weightfile);
+    }
+
+    visualize_network(net);
+
+#ifdef OPENCV
+    wait_until_press_key_cv();
+#endif
+}
+
+// Program entry point: dispatches on argv[1] to one of the sub-commands.
+//
+// Before dispatching it strips every argument with strip_args(), reads the
+// GPU index from "-i" (default 0), and either forces CPU mode (build without
+// GPU) or selects the CUDA device and prints CUDA/cuDNN information.
+// Returns 0 in all cases, including an unknown sub-command.
+//
+// NOTE(review): most sub-commands below still read argv[2..4] without an
+// argc check; only the cases that would otherwise crash inside this
+// function (atoi on a missing argument) are guarded here.
+int main(int argc, char **argv)
+{
+#ifdef _DEBUG
+    // <crtdbg.h> is only included for MSVC builds, so the CRT debug-heap call
+    // must be limited to MSVC as well; otherwise a non-MSVC build that
+    // defines _DEBUG fails to compile.
+#if defined(_MSC_VER)
+    _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
+#endif
+    printf(" _DEBUG is used \n");
+#endif
+
+#ifdef DEBUG
+    printf(" DEBUG=1 \n");
+#endif
+
+    int i;
+    for (i = 0; i < argc; ++i) {
+        if (!argv[i]) continue;
+        strip_args(argv[i]);
+    }
+
+    //test_resize("data/bad.jpg");
+    //test_box();
+    //test_convolutional_layer();
+    if(argc < 2){
+        fprintf(stderr, "usage: %s <function>\n", argv[0]);
+        return 0;
+    }
+    gpu_index = find_int_arg(argc, argv, "-i", 0);
+
+#ifndef GPU
+    gpu_index = -1;
+    printf(" GPU isn't used \n");
+    init_cpu();
+#else   // GPU
+    if(gpu_index >= 0){
+        cuda_set_device(gpu_index);
+        CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+    }
+
+    show_cuda_cudnn_info();
+    cuda_debug_sync = find_arg(argc, argv, "-cuda_debug_sync");
+
+#ifdef CUDNN_HALF
+    printf(" CUDNN_HALF=1 \n");
+#endif  // CUDNN_HALF
+
+#endif  // GPU
+
+    show_opencv_info();
+
+    if (0 == strcmp(argv[1], "average")){
+        average(argc, argv);
+    } else if (0 == strcmp(argv[1], "yolo")){
+        run_yolo(argc, argv);
+    } else if (0 == strcmp(argv[1], "voxel")){
+        run_voxel(argc, argv);
+    } else if (0 == strcmp(argv[1], "super")){
+        run_super(argc, argv);
+    } else if (0 == strcmp(argv[1], "detector")){
+        run_detector(argc, argv);
+    } else if (0 == strcmp(argv[1], "detect")){
+        float thresh = find_float_arg(argc, argv, "-thresh", .24);
+        int ext_output = find_arg(argc, argv, "-ext_output");
+        char *filename = (argc > 4) ? argv[4]: 0;
+        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, 0.5, 0, ext_output, 0, NULL, 0, 0);
+    } else if (0 == strcmp(argv[1], "cifar")){
+        run_cifar(argc, argv);
+    } else if (0 == strcmp(argv[1], "go")){
+        run_go(argc, argv);
+    } else if (0 == strcmp(argv[1], "rnn")){
+        run_char_rnn(argc, argv);
+    } else if (0 == strcmp(argv[1], "vid")){
+        run_vid_rnn(argc, argv);
+    } else if (0 == strcmp(argv[1], "coco")){
+        run_coco(argc, argv);
+    } else if (0 == strcmp(argv[1], "classify")){
+        predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5);
+    } else if (0 == strcmp(argv[1], "classifier")){
+        run_classifier(argc, argv);
+    } else if (0 == strcmp(argv[1], "art")){
+        run_art(argc, argv);
+    } else if (0 == strcmp(argv[1], "tag")){
+        run_tag(argc, argv);
+    } else if (0 == strcmp(argv[1], "compare")){
+        run_compare(argc, argv);
+    } else if (0 == strcmp(argv[1], "dice")){
+        run_dice(argc, argv);
+    } else if (0 == strcmp(argv[1], "writing")){
+        run_writing(argc, argv);
+    } else if (0 == strcmp(argv[1], "3d")){
+        composite_3d(argv[2], argv[3], argv[4], (argc > 5) ? atof(argv[5]) : 0);
+    } else if (0 == strcmp(argv[1], "test")){
+        test_resize(argv[2]);
+    } else if (0 == strcmp(argv[1], "captcha")){
+        run_captcha(argc, argv);
+    } else if (0 == strcmp(argv[1], "nightmare")){
+        run_nightmare(argc, argv);
+    } else if (0 == strcmp(argv[1], "rgbgr")){
+        rgbgr_net(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "reset")){
+        reset_normalize_net(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "denormalize")){
+        denormalize_net(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "statistics")){
+        statistics_net(argv[2], argv[3]);
+    } else if (0 == strcmp(argv[1], "normalize")){
+        normalize_net(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "rescale")){
+        rescale_net(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "ops")){
+        operations(argv[2]);
+    } else if (0 == strcmp(argv[1], "speed")){
+        speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0);
+    } else if (0 == strcmp(argv[1], "oneoff")){
+        oneoff(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "partial")){
+        // The layer cutoff is mandatory; atoi() on a missing argument
+        // (argv[5] == NULL when argc == 5) is undefined behaviour.
+        if (argc < 6) {
+            fprintf(stderr, "usage: %s partial <cfg> <weights> <outfile> <max_layer>\n", argv[0]);
+            return 0;
+        }
+        partial(argv[2], argv[3], argv[4], atoi(argv[5]));
+    } else if (0 == strcmp(argv[1], "visualize")){
+        visualize(argv[2], (argc > 3) ? argv[3] : 0);
+    } else if (0 == strcmp(argv[1], "imtest")){
+        test_resize(argv[2]);
+    } else {
+        fprintf(stderr, "Not an option: %s\n", argv[1]);
+    }
+    return 0;
+}
diff --git a/darknet-master/src/darknet.ico b/darknet-master/src/darknet.ico
new file mode 100644
index 0000000..358c53f
Binary files /dev/null and b/darknet-master/src/darknet.ico differ
diff --git a/darknet-master/src/darknet.rc b/darknet-master/src/darknet.rc
new file mode 100644
index 0000000..8c5a7d5
--- /dev/null
+++ b/darknet-master/src/darknet.rc
@@ -0,0 +1 @@
+1 ICON "darknet.ico"
diff --git a/darknet-master/src/darkunistd.h b/darknet-master/src/darkunistd.h
new file mode 100644
index 0000000..de3a3cb
--- /dev/null
+++ b/darknet-master/src/darkunistd.h
@@ -0,0 +1,56 @@
+#ifdef _WIN32
+#ifndef _UNISTD_H
+#define _UNISTD_H    1
+
+/* This file is intended to serve as a drop-in replacement for
+*  unistd.h on Windows.
+*  Please add functionality as needed.
+*/
+
+#include <winsock2.h>
+#include <direct.h> /* for _getcwd() and _chdir() */
+#include <getopt.h>
+#include <io.h>
+#include <process.h> /* for getpid() and the exec..() family */
+#include <stdlib.h>
+
+#define srandom srand
+#define random rand
+
+/* Values for the second argument to access.
+These may be OR'd together.  */
+#define R_OK    4       /* Test for read permission.  */
+#define W_OK    2       /* Test for write permission.  */
+#define X_OK R_OK       /* execute permission - unsupported in Windows, */
+#define F_OK    0       /* Test for existence.  */
+
+#define access _access
+#define dup2 _dup2
+#define execve _execve
+#define ftruncate _chsize
+#define unlink _unlink
+#define fileno _fileno
+#define getcwd _getcwd
+#define chdir _chdir
+#define isatty _isatty
+#define lseek _lseek
+/* read, write, and close are NOT being #defined here, because while there are file handle specific versions for Windows, they probably don't work for sockets. You need to look at your app and consider whether to call e.g. closesocket(). */
+
+#define ssize_t int
+
+#define STDIN_FILENO 0
+#define STDOUT_FILENO 1
+#define STDERR_FILENO 2
+/* should be in some equivalent to <sys/types.h> */
+//typedef __int8            int8_t;
+//typedef __int16           int16_t;
+//typedef __int32           int32_t;
+//typedef __int64           int64_t;
+//typedef unsigned __int8   uint8_t;
+//typedef unsigned __int16  uint16_t;
+//typedef unsigned __int32  uint32_t;
+//typedef unsigned __int64  uint64_t;
+#endif /* _UNISTD_H  */
+#else
+#include <unistd.h>
+#endif /* _WIN32  */
diff --git a/darknet-master/src/data.c b/darknet-master/src/data.c
new file mode 100644
index 0000000..70e1b09
--- /dev/null
+++ b/darknet-master/src/data.c
@@ -0,0 +1,2315 @@
+#include "data.h"
+#include "utils.h"
+#include "image.h"
+#include "dark_cuda.h"
+#include "box.h"
+#include "http_stream.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define NUMCHARS 37
+
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+// Read filename line by line with fgetl() and return the lines as a list,
+// in file order. file_error() is called when the file cannot be opened.
+// The list stores the strings returned by fgetl() without copying them.
+list *get_paths(char *filename)
+{
+    FILE *fp = fopen(filename, "r");
+    if (!fp) file_error(filename);
+
+    list *result = make_list();
+    char *line;
+    for (line = fgetl(fp); line; line = fgetl(fp)) {
+        list_insert(result, line);
+    }
+
+    fclose(fp);
+    return result;
+}
+
+/*
+char **get_random_paths_indexes(char **paths, int n, int m, int *indexes)
+{
+    char **random_paths = calloc(n, sizeof(char*));
+    int i;
+    pthread_mutex_lock(&mutex);
+    for(i = 0; i < n; ++i){
+        int index = random_gen()%m;
+        indexes[i] = index;
+        random_paths[i] = paths[index];
+        if(i == 0) printf("%s\n", paths[index]);
+    }
+    pthread_mutex_unlock(&mutex);
+    return random_paths;
+}
+*/
+
+// Pick n paths out of the m entries of paths by walking mini_batch
+// independent "time lines" through the array.
+//
+// Each time line starts at a random index (with contrastive set, every odd
+// time line starts where the preceding even one does) and advances by a
+// step drawn once per call from rand_int(1, augment_speed), clamped to at
+// least 1. Output slot i uses time line i % mini_batch. Empty paths are
+// skipped by advancing the same time line again; paths of length <= 4 are
+// reported on stdout. The selection runs under the file-level mutex.
+// Returns a newly allocated array of n pointers into paths (strings are
+// not copied).
+char **get_sequential_paths(char **paths, int n, int m, int mini_batch, int augment_speed, int contrastive)
+{
+    int step = rand_int(1, augment_speed);
+    if (step < 1) step = 1;
+    char** picked = (char**)xcalloc(n, sizeof(char*));
+    int k;
+    pthread_mutex_lock(&mutex);
+
+    // Current position of every time line.
+    unsigned int *cursor = (unsigned int *)xcalloc(mini_batch, sizeof(unsigned int));
+    for (k = 0; k < mini_batch; ++k) {
+        if (contrastive && (k % 2) == 1) {
+            cursor[k] = cursor[k - 1];
+        } else {
+            cursor[k] = random_gen() % m;
+        }
+    }
+
+    for (k = 0; k < n; ++k) {
+        const int line = k % mini_batch;
+        size_t len;
+        do {
+            const unsigned int at = cursor[line] % m;
+            cursor[line] += step;
+
+            picked[k] = paths[at];
+            len = strlen(picked[k]);
+            if (len <= 4) printf(" Very small path to the image: %s \n", picked[k]);
+        } while (len == 0);
+    }
+
+    free(cursor);
+    pthread_mutex_unlock(&mutex);
+    return picked;
+}
+
+// Pick n random entries out of the m entries of paths.
+//
+// A random index is drawn for every attempt; with contrastive set, odd
+// output slots discard the draw and reuse the index chosen for the
+// preceding slot. Empty paths cause a retry; paths of length <= 4 are
+// reported on stdout. The selection runs under the file-level mutex.
+// Returns a newly allocated array of n pointers into paths (strings are
+// not copied).
+char **get_random_paths_custom(char **paths, int n, int m, int contrastive)
+{
+    char** picked = (char**)xcalloc(n, sizeof(char*));
+    int k;
+    pthread_mutex_lock(&mutex);
+    int last = 0;
+    for (k = 0; k < n; ++k) {
+        const int reuse_previous = contrastive && (k % 2 == 1);
+        size_t len;
+        do {
+            int at = random_gen() % m;
+            if (reuse_previous) {
+                at = last;
+            } else {
+                last = at;
+            }
+            picked[k] = paths[at];
+            len = strlen(picked[k]);
+            if (len <= 4) printf(" Very small path to the image: %s \n", picked[k]);
+        } while (len == 0);
+    }
+    pthread_mutex_unlock(&mutex);
+    return picked;
+}
+
+// Pick n random entries out of the m entries of paths, without the
+// contrastive pairing of get_random_paths_custom().
+char **get_random_paths(char **paths, int n, int m)
+{
+    const int contrastive = 0;
+    return get_random_paths_custom(paths, n, m, contrastive);
+}
+
+// Return a newly allocated array of n strings, each produced by running
+// find_replace(paths[i], find, replace, ...) and duplicating the result
+// with copy_string(). The intermediate result lives in a 4096-byte stack
+// buffer.
+char **find_replace_paths(char **paths, int n, char *find, char *replace)
+{
+    char** result = (char**)xcalloc(n, sizeof(char*));
+    int k = 0;
+    while (k < n) {
+        char scratch[4096];
+        find_replace(paths[k], find, replace, scratch);
+        result[k] = copy_string(scratch);
+        ++k;
+    }
+    return result;
+}
+
+// Load n images (3 channels requested, size w x h), convert each to
+// grayscale, and return them as the rows of a matrix. Each row takes
+// ownership of the grayscale image's data buffer; cols is h*w*c of the
+// last image processed (0 when n is 0).
+matrix load_image_paths_gray(char **paths, int n, int w, int h)
+{
+    matrix X;
+    X.rows = n;
+    X.cols = 0;
+    X.vals = (float**)xcalloc(X.rows, sizeof(float*));
+
+    int k;
+    for (k = 0; k < n; ++k) {
+        image color = load_image(paths[k], w, h, 3);
+        image gray = grayscale_image(color);
+        free_image(color);
+
+        X.vals[k] = gray.data;
+        X.cols = gray.h*gray.w*gray.c;
+    }
+    return X;
+}
+
+// Load n colour images at size w x h and return them as the rows of a
+// matrix. Each row takes ownership of the image's data buffer; cols is
+// h*w*c of the last image loaded (0 when n is 0).
+matrix load_image_paths(char **paths, int n, int w, int h)
+{
+    matrix X;
+    X.rows = n;
+    X.cols = 0;
+    X.vals = (float**)xcalloc(X.rows, sizeof(float*));
+
+    int k;
+    for (k = 0; k < n; ++k) {
+        image loaded = load_image_color(paths[k], w, h);
+        X.vals[k] = loaded.data;
+        X.cols = loaded.h*loaded.w*loaded.c;
+    }
+    return X;
+}
+
+// Load n images with random augmentation and return them as matrix rows.
+//
+// For every row: load the source image at its native size (via
+// load_image_stb_resize when dontuse_opencv is set, load_image_color
+// otherwise), apply random_augment_image(), flip on a coin toss when
+// use_flip is set, apply random_distort_image(), and resize to w x h.
+// With contrastive set, row i uses paths[i / 2], so consecutive rows share
+// one source image. Each row takes ownership of the resized image's data;
+// cols is h*w*c of the last resized image (0 when n is 0).
+matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure, int dontuse_opencv, int contrastive)
+{
+    matrix X;
+    X.rows = n;
+    X.cols = 0;
+    X.vals = (float**)xcalloc(X.rows, sizeof(float*));
+
+    // Larger of the two target dimensions.
+    const int size = (w > h) ? w : h;
+
+    int k;
+    for (k = 0; k < n; ++k) {
+        const int src = contrastive ? (k / 2) : k;
+
+        image original;
+        if (dontuse_opencv) {
+            original = load_image_stb_resize(paths[src], 0, 0, 3);
+        } else {
+            original = load_image_color(paths[src], 0, 0);
+        }
+
+        image augmented = random_augment_image(original, angle, aspect, min, max, size);
+        if (use_flip && (random_gen() % 2)) {
+            flip_image(augmented);
+        }
+        random_distort_image(augmented, hue, saturation, exposure);
+
+        image resized = resize_image(augmented, w, h);
+
+        free_image(original);
+        free_image(augmented);
+
+        X.vals[k] = resized.data;
+        X.cols = resized.h*resized.w*resized.c;
+    }
+    return X;
+}
+
+
+// Read every "<id> <x> <y> <w> <h>" record from the label file `filename`.
+//
+// Returns a heap array of box_label (never NULL as long as xcalloc/xrealloc
+// do not return NULL) and stores the number of records in *n. The caller
+// releases the array with free().
+// For each record the left/right/top/bottom edges are derived from the
+// center/size values, and track_id is set to the record index plus an offset
+// derived from custom_hash(filename).
+//
+// When the label file cannot be opened a message is printed, the file name is
+// appended to "bad.list" (best effort) and a one-element array is returned
+// with *n == 0.
+box_label *read_boxes(char *filename, int *n)
+{
+    box_label* boxes = (box_label*)xcalloc(1, sizeof(box_label));
+    FILE *file = fopen(filename, "r");
+    if (!file) {
+        printf("Can't open label file. (This can be normal only if you use MSCOCO): %s \n", filename);
+        //file_error(filename);
+        FILE* fw = fopen("bad.list", "a");
+        // The log itself may be unwritable (read-only directory, no space...):
+        // skip logging instead of passing a NULL stream to fwrite/fclose.
+        if (fw) {
+            fwrite(filename, sizeof(char), strlen(filename), fw);
+            char *new_line = "\n";
+            fwrite(new_line, sizeof(char), strlen(new_line), fw);
+            fclose(fw);
+        }
+
+        *n = 0;
+        return boxes;
+    }
+    const int max_obj_img = 4000;// 30000;
+    // per-file base for track ids, so ids from different files rarely collide
+    const int img_hash = (custom_hash(filename) % max_obj_img)*max_obj_img;
+    //printf(" img_hash = %d, filename = %s; ", img_hash, filename);
+    float x, y, h, w;
+    int id;
+    int count = 0;
+    // parsing stops at the first record that does not yield all five fields
+    while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){
+        boxes = (box_label*)xrealloc(boxes, (count + 1) * sizeof(box_label));
+        boxes[count].track_id = count + img_hash;
+        //printf(" boxes[count].track_id = %d, count = %d \n", boxes[count].track_id, count);
+        boxes[count].id = id;
+        boxes[count].x = x;
+        boxes[count].y = y;
+        boxes[count].h = h;
+        boxes[count].w = w;
+        boxes[count].left   = x - w/2;
+        boxes[count].right  = x + w/2;
+        boxes[count].top    = y - h/2;
+        boxes[count].bottom = y + h/2;
+        ++count;
+    }
+    fclose(file);
+    *n = count;
+    return boxes;
+}
+
+// Shuffle the n labels in `b` in place: every position is swapped once with
+// a position drawn from random_gen() over the whole array.
+void randomize_boxes(box_label *b, int n)
+{
+    int pos = 0;
+    while (pos < n) {
+        const int other = random_gen()%n;
+        const box_label held = b[pos];
+        b[pos] = b[other];
+        b[other] = held;
+        ++pos;
+    }
+}
+
+// Map n label boxes into the coordinate frame of an augmented image.
+//
+// Each edge is scaled by (sx, sy) and shifted by (-dx, -dy), mirrored
+// horizontally when `flip` is set, then clamped to [0, 1]; center and size
+// are recomputed from the clamped edges.
+// A box whose center is exactly (0, 0), or which lies completely outside the
+// unit square before the transform, is not transformed: its x, y, w and h are
+// set to the sentinel 999999 instead.
+void correct_boxes(box_label *boxes, int n, float dx, float dy, float sx, float sy, int flip)
+{
+    int k;
+    for (k = 0; k < n; ++k) {
+        box_label *bx = &boxes[k];
+
+        const int zero_center = (bx->x == 0 && bx->y == 0);
+        if (zero_center ||
+            (bx->x + bx->w / 2) < 0 || (bx->y + bx->h / 2) < 0 ||
+            (bx->x - bx->w / 2) > 1 || (bx->y - bx->h / 2) > 1)
+        {
+            // unusable annotation: flag it for the callers
+            bx->x = bx->y = bx->w = bx->h = 999999;
+            continue;
+        }
+
+        // scale, then shift, every edge
+        bx->left   = bx->left  * sx - dx;
+        bx->right  = bx->right * sx - dx;
+        bx->top    = bx->top   * sy - dy;
+        bx->bottom = bx->bottom* sy - dy;
+
+        if (flip) {
+            // mirroring swaps the roles of the left and right edges
+            const float old_left = bx->left;
+            bx->left = 1. - bx->right;
+            bx->right = 1. - old_left;
+        }
+
+        bx->left   = constrain(0, 1, bx->left);
+        bx->right  = constrain(0, 1, bx->right);
+        bx->top    = constrain(0, 1, bx->top);
+        bx->bottom = constrain(0, 1, bx->bottom);
+
+        // rebuild center / size from the clamped edges
+        bx->x = (bx->left+bx->right)/2;
+        bx->y = (bx->top+bx->bottom)/2;
+        bx->w = constrain(0, 1, (bx->right - bx->left));
+        bx->h = constrain(0, 1, (bx->bottom - bx->top));
+    }
+}
+
+// Fill `truth` with up to 30 boxes read from the label file that belongs to
+// the image `path`.
+//
+// The labels are shuffled and mapped into the augmented frame
+// (dx, dy, sx, sy, flip - see correct_boxes()). Box i occupies the
+// (4 + classes) floats starting at truth[(4 + classes) * i]:
+// x, y, w, h followed by a one-hot class vector. `truth` must therefore hold
+// at least 30 * (4 + classes) floats and is expected to be zeroed by the caller.
+void fill_truth_swag(char *path, float *truth, int classes, int flip, float dx, float dy, float sx, float sy)
+{
+    char labelpath[4096];
+    replace_image_to_label(path, labelpath);
+
+    int count = 0;
+    box_label *boxes = read_boxes(labelpath, &count);
+    randomize_boxes(boxes, count);
+    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
+    float x,y,w,h;
+    int id;
+    int i;
+
+    for (i = 0; i < count && i < 30; ++i) {
+        x =  boxes[i].x;
+        y =  boxes[i].y;
+        w =  boxes[i].w;
+        h =  boxes[i].h;
+        id = boxes[i].id;
+
+        if (w < .0 || h < .0) continue;
+
+        int index = (4+classes) * i;
+
+        truth[index++] = x;
+        truth[index++] = y;
+        truth[index++] = w;
+        truth[index++] = h;
+
+        // The class id comes straight from the label file: a negative value
+        // would write in front of the class vector, so it is ignored just
+        // like an id that is too large.
+        if (id >= 0 && id < classes) truth[index+id] = 1;
+    }
+    free(boxes);
+}
+
+// Fill a num_boxes x num_boxes grid of truth cells from the label file that
+// belongs to the image `path`.
+//
+// The labels are shuffled and mapped into the augmented frame
+// (dx, dy, sx, sy, flip - see correct_boxes()). Each cell holds
+// (5 + classes) floats: an "object present" flag, a one-hot class vector,
+// then x, y (offsets inside the cell) and w, h. A box is assigned to the cell
+// that contains its center; only the first box landing in a cell is kept.
+// `truth` must hold num_boxes * num_boxes * (5 + classes) floats and is
+// expected to be zeroed by the caller.
+void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int flip, float dx, float dy, float sx, float sy)
+{
+    char labelpath[4096];
+    replace_image_to_label(path, labelpath);
+
+    int count = 0;
+    box_label *boxes = read_boxes(labelpath, &count);
+    randomize_boxes(boxes, count);
+    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
+    float x,y,w,h;
+    int id;
+    int i;
+
+    for (i = 0; i < count; ++i) {
+        x =  boxes[i].x;
+        y =  boxes[i].y;
+        w =  boxes[i].w;
+        h =  boxes[i].h;
+        id = boxes[i].id;
+
+        if (w < .001 || h < .001) continue;
+
+        // correct_boxes() flags unusable annotations by setting x/y/w/h to
+        // 999999; those pass the size filter above and would otherwise yield a
+        // cell index far outside `truth`. Only centers inside [0, 1) map to a
+        // valid cell.
+        if (x < 0 || x >= 1 || y < 0 || y >= 1) continue;
+
+        int col = (int)(x*num_boxes);
+        int row = (int)(y*num_boxes);
+
+        // keep only the position of the center inside its cell
+        x = x*num_boxes - col;
+        y = y*num_boxes - row;
+
+        int index = (col+row*num_boxes)*(5+classes);
+        if (truth[index]) continue;     // cell already owned by another box
+        truth[index++] = 1;
+
+        // ids come from the label file; ignore anything outside [0, classes)
+        if (id >= 0 && id < classes) truth[index+id] = 1;
+        index += classes;
+
+        truth[index++] = x;
+        truth[index++] = y;
+        truth[index++] = w;
+        truth[index++] = h;
+    }
+    free(boxes);
+}
+
+// Append one "<labelpath> <message>" line to bad_label.list in the working
+// directory. This is done with stdio instead of building an
+// `echo ... >> bad_label.list` command for system(): the label path comes
+// from the dataset, so it must not be interpreted by a shell and its length
+// must not be able to overflow a fixed-size command buffer.
+static void report_bad_label(const char *labelpath, const char *message)
+{
+    FILE *fw = fopen("bad_label.list", "a");
+    if (!fw) return;    // best effort: every caller already printed the problem
+    fprintf(fw, "%s %s\n", labelpath, message);
+    fclose(fw);
+}
+
+// Fill `truth` with the detection labels of the image `path`.
+//
+// The labels are read from the matching label file, shuffled, mapped into the
+// augmented frame (dx, dy, sx, sy, flip - see correct_boxes()) and limited to
+// the first num_boxes entries. Accepted boxes are written densely, one per
+// truth_size floats: x, y, w, h, class id, track id (so truth_size must be at
+// least 6).
+//
+// A box is dropped when its class id is outside [0, classes), when it is
+// narrower/lower than one network pixel (1/net_w, 1/net_h), when
+// correct_boxes() flagged it with the 999999 sentinel, or when its center is
+// outside (0, 1]. Class-id and coordinate problems are printed and logged to
+// bad_label.list; a width or height above 1 is logged and clamped to 1.
+//
+// Returns the smallest side, in network pixels and truncated to int, seen
+// among the accepted boxes; 0 when no box was accepted.
+int fill_truth_detection(const char *path, int num_boxes, int truth_size, float *truth, int classes, int flip, float dx, float dy, float sx, float sy,
+    int net_w, int net_h)
+{
+    char labelpath[4096];
+    replace_image_to_label(path, labelpath);
+
+    int count = 0;
+    int i;
+    box_label *boxes = read_boxes(labelpath, &count);
+    int min_w_h = 0;
+    float lowest_w = 1.F / net_w;
+    float lowest_h = 1.F / net_h;
+    randomize_boxes(boxes, count);
+    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
+    if (count > num_boxes) count = num_boxes;
+    float x, y, w, h;
+    int id;
+    int sub = 0;        // number of boxes skipped so far; keeps `truth` dense
+    char buff[256];     // holds the message only (never the path), always via snprintf
+
+    for (i = 0; i < count; ++i) {
+        x = boxes[i].x;
+        y = boxes[i].y;
+        w = boxes[i].w;
+        h = boxes[i].h;
+        id = boxes[i].id;
+        int track_id = boxes[i].track_id;
+
+        // not detect small objects
+        //if ((w < 0.001F || h < 0.001F)) continue;
+        // if truth (box for object) is smaller than 1x1 pix
+        if (id < 0 || id >= classes) {
+            printf("\n Wrong annotation: class_id = %d. But class_id should be [from 0 to %d], file: %s \n", id, (classes-1), labelpath);
+            snprintf(buff, sizeof(buff), "Wrong annotation: class_id = %d. But class_id should be [from 0 to %d]", id, (classes-1));
+            report_bad_label(labelpath, buff);
+            ++sub;
+            continue;
+        }
+        if ((w < lowest_w || h < lowest_h)) {
+            // very small object: dropped silently
+            ++sub;
+            continue;
+        }
+        if (x == 999999 || y == 999999) {
+            printf("\n Wrong annotation: x = 0, y = 0, < 0 or > 1, file: %s \n", labelpath);
+            report_bad_label(labelpath, "Wrong annotation: x = 0 or y = 0");
+            ++sub;
+            continue;
+        }
+        if (x <= 0 || x > 1 || y <= 0 || y > 1) {
+            printf("\n Wrong annotation: x = %f, y = %f, file: %s \n", x, y, labelpath);
+            snprintf(buff, sizeof(buff), "Wrong annotation: x = %f, y = %f", x, y);
+            report_bad_label(labelpath, buff);
+            ++sub;
+            continue;
+        }
+        if (w > 1) {
+            printf("\n Wrong annotation: w = %f, file: %s \n", w, labelpath);
+            snprintf(buff, sizeof(buff), "Wrong annotation: w = %f", w);
+            report_bad_label(labelpath, buff);
+            w = 1;
+        }
+        if (h > 1) {
+            printf("\n Wrong annotation: h = %f, file: %s \n", h, labelpath);
+            snprintf(buff, sizeof(buff), "Wrong annotation: h = %f", h);
+            report_bad_label(labelpath, buff);
+            h = 1;
+        }
+        // x and y are strictly positive here (x <= 0 / y <= 0 were rejected
+        // above), so no zero-coordinate adjustment is needed.
+
+        truth[(i-sub)*truth_size +0] = x;
+        truth[(i-sub)*truth_size +1] = y;
+        truth[(i-sub)*truth_size +2] = w;
+        truth[(i-sub)*truth_size +3] = h;
+        truth[(i-sub)*truth_size +4] = id;
+        truth[(i-sub)*truth_size +5] = track_id;
+
+        // track the smallest accepted side in network pixels
+        if (min_w_h == 0) min_w_h = w*net_w;
+        if (min_w_h > w*net_w) min_w_h = w*net_w;
+        if (min_w_h > h*net_h) min_w_h = h*net_h;
+    }
+    free(boxes);
+    return min_w_h;
+}
+
+
+// Print the decoded text of a prediction: `pred` holds n consecutive groups
+// of NUMCHARS scores, and for each group the character of the best-scoring
+// index is printed, followed by a newline at the end.
+void print_letters(float *pred, int n)
+{
+    float *group = pred;
+    int left;
+    for (left = n; left > 0; --left) {
+        const int best = max_index(group, NUMCHARS);
+        printf("%c", int_to_alphanum(best));
+        group += NUMCHARS;
+    }
+    printf("\n");
+}
+
+// Build the one-hot truth for a captcha image whose file name is the text.
+//
+// The characters of the file name (the part after the last '/', or the whole
+// path when it contains no '/') up to the first '.' are used, at most n of
+// them. Character i sets truth[i*NUMCHARS + class]; the remaining positions up
+// to n are marked with the last class (NUMCHARS - 1). `truth` must hold
+// n * NUMCHARS floats and is expected to be zeroed by the caller.
+void fill_truth_captcha(char *path, int n, float *truth)
+{
+    // strrchr() returns NULL for a path without any directory component;
+    // the file name then starts at the beginning of the path.
+    char *slash = strrchr(path, '/');
+    char *begin = slash ? slash + 1 : path;
+    int i;
+    // stop at the end of the string, at the extension, or after n characters
+    for(i = 0; i < n && begin[i] != '\0' && begin[i] != '.'; ++i){
+        int index = alphanum_to_int(begin[i]);
+        if(index > 35 || index < 0) printf("Bad %c\n", begin[i]);
+        // a class outside the row would corrupt neighbouring memory:
+        // leave this position unlabeled instead
+        if(index < 0 || index >= NUMCHARS) continue;
+        truth[i*NUMCHARS+index] = 1;
+    }
+    for(;i < n; ++i){
+        truth[i*NUMCHARS + NUMCHARS-1] = 1;
+    }
+}
+
+// Load n captcha images (w x h) together with their truth rows of
+// k * NUMCHARS floats, built from the file names by fill_truth_captcha().
+// When m is non-zero the n paths are first drawn with get_random_paths()
+// from the m entries of `paths`; otherwise the first n entries are used.
+data load_data_captcha(char **paths, int n, int m, int k, int w, int h)
+{
+    data d = {0};
+    char **chosen = paths;
+    int row;
+
+    if (m) chosen = get_random_paths(paths, n, m);
+
+    d.shallow = 0;
+    d.X = load_image_paths(chosen, n, w, h);
+    d.y = make_matrix(n, k*NUMCHARS);
+    for (row = 0; row < n; ++row) {
+        fill_truth_captcha(chosen[row], k, d.y.vals[row]);
+    }
+
+    // only the array returned by get_random_paths() belongs to us
+    if (m) free(chosen);
+    return d;
+}
+
+// Load n captcha images (w x h) for an encoder-style task: the target matrix
+// is the input matrix itself. When m is non-zero the n paths are first drawn
+// with get_random_paths() from the m entries of `paths`.
+// The column count is forced to 17100.
+// NOTE(review): d.y is a plain struct copy of d.X, so both share the same row
+// buffers while d.shallow is 0 - confirm how callers release this data.
+data load_data_captcha_encode(char **paths, int n, int m, int w, int h)
+{
+    char **chosen = paths;
+    data d = {0};
+
+    if (m) {
+        chosen = get_random_paths(paths, n, m);
+    }
+    d.shallow = 0;
+    d.X = load_image_paths(chosen, n, w, h);
+    d.X.cols = 17100;
+    d.y = d.X;
+    if (m) {
+        free(chosen);
+    }
+    return d;
+}
+
+// Build a k-element truth vector for `path`: truth[i] is 1 when labels[i]
+// occurs as a substring of the path and 0 otherwise.
+// Exactly one match is expected; for any other number of matches a diagnostic
+// listing every matching label is printed (the vector is left as computed).
+void fill_truth(char *path, char **labels, int k, float *truth)
+{
+    int matched = 0;
+    int idx;
+
+    memset(truth, 0, k*sizeof(float));
+    for (idx = 0; idx < k; ++idx) {
+        if (strstr(path, labels[idx]) == NULL) continue;
+        truth[idx] = 1;
+        ++matched;
+    }
+    if (matched == 1) return;
+
+    printf("Too many or too few labels: %d, %s\n", matched, path);
+    matched = 0;    // reused as the running number of the listed label
+    for (idx = 0; idx < k; ++idx) {
+        if (strstr(path, labels[idx]) != NULL) {
+            printf("\t label %d: %s  \n", matched, labels[idx]);
+            matched++;
+        }
+    }
+}
+
+// Build a label-smoothed k-element truth vector for `path`: every label that
+// occurs as a substring of the path gets (1 - label_smooth_eps), every other
+// entry gets label_smooth_eps / (k - 1).
+// Exactly one match is expected; for any other number of matches a diagnostic
+// listing every matching label is printed (the vector is left as computed).
+void fill_truth_smooth(char *path, char **labels, int k, float *truth, float label_smooth_eps)
+{
+    int matched = 0;
+    int idx;
+
+    memset(truth, 0, k * sizeof(float));
+    for (idx = 0; idx < k; ++idx) {
+        const int hit = (strstr(path, labels[idx]) != NULL);
+        if (hit) {
+            ++matched;
+            truth[idx] = (1 - label_smooth_eps);
+        } else {
+            truth[idx] = label_smooth_eps / (k - 1);
+        }
+    }
+    if (matched == 1) return;
+
+    printf("Too many or too few labels: %d, %s\n", matched, path);
+    matched = 0;    // reused as the running number of the listed label
+    for (idx = 0; idx < k; ++idx) {
+        if (strstr(path, labels[idx]) == NULL) continue;
+        printf("\t label %d: %s  \n", matched, labels[idx]);
+        matched++;
+    }
+}
+
+// Extend a k-element truth vector along a class hierarchy.
+// First, every ancestor (following hierarchy->parent until a negative index)
+// of an entry that is set is set to 1 as well. Then the vector is walked
+// group by group (hierarchy->groups groups of hierarchy->group_size[j]
+// consecutive entries): a group without any set entry is filled with
+// SECRET_NUM.
+void fill_hierarchy(float *truth, int k, tree *hierarchy)
+{
+    int node;
+    int g;
+    int member;
+    int offset = 0;     // index of the first entry of the current group
+
+    for (node = 0; node < k; ++node) {
+        int up;
+        if (!truth[node]) continue;
+        for (up = hierarchy->parent[node]; up >= 0; up = hierarchy->parent[up]) {
+            truth[up] = 1;
+        }
+    }
+
+    for (g = 0; g < hierarchy->groups; ++g) {
+        const int len = hierarchy->group_size[g];
+        int any_set = 0;
+        for (member = 0; member < len && !any_set; ++member) {
+            if (truth[offset + member]) any_set = 1;
+        }
+        if (!any_set) {
+            for (member = 0; member < len; ++member) {
+                truth[offset + member] = SECRET_NUM;
+            }
+        }
+        offset += len;
+    }
+}
+
+// Return the index of the first largest element of arr[0..size-1] that is
+// strictly greater than 0; returns 0 when no element is positive (including
+// size <= 0).
+int find_max(float *arr, int size) {
+    float best_val = 0;
+    int best = 0;
+    int k = 0;
+    while (k < size) {
+        if (arr[k] > best_val) {
+            best = k;
+            best_val = arr[k];
+        }
+        ++k;
+    }
+    return best;
+}
+
+// Build the n x k truth matrix for a batch of image paths.
+// With `contrastive` set, rows 2j and 2j+1 both use paths[j].
+//  - labels != NULL (supervised): each row is filled by fill_truth_smooth()
+//    and, when a hierarchy is given, extended by fill_hierarchy().
+//  - labels == NULL (unsupervised): each row is one-hot, the class being the
+//    numeric value of the path pointer modulo k.
+matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy, float label_smooth_eps, int contrastive)
+{
+    matrix y = make_matrix(n, k);
+    int i;
+
+    for (i = 0; i < n; ++i) {
+        const int img_index = contrastive ? (i / 2) : i;
+        float *row = y.vals[i];
+
+        if (labels) {
+            // supervised learning
+            fill_truth_smooth(paths[img_index], labels, k, row, label_smooth_eps);
+            if (hierarchy) fill_hierarchy(row, k, hierarchy);
+        } else {
+            // unsupervised learning: pseudo-class derived from the pointer value
+            const uintptr_t path_p = (uintptr_t)paths[img_index];
+            const int class_id = path_p % k;
+            int l = 0;
+            while (l < k) {
+                row[l] = 0;
+                ++l;
+            }
+            row[class_id] = 1;
+        }
+    }
+    return y;
+}
+
+// Build an n x k multi-label matrix from tag files.
+// The tag file of an image is found by replacing "imgs" with "labels" and
+// "_iconl.jpeg" with ".txt" in its path; if that file cannot be opened the
+// same name under "labels2" is tried, and if that fails too the row is left
+// untouched. Every integer tag in [0, k) read from the file sets the matching
+// column to 1. Prints "<files found>/<n>" when done.
+matrix load_tags_paths(char **paths, int n, int k)
+{
+    matrix y = make_matrix(n, k);
+    int i;
+    int count = 0;  // number of images for which a tag file was found
+    for(i = 0; i < n; ++i){
+        char label[4096];
+        find_replace(paths[i], "imgs", "labels", label);
+        find_replace(label, "_iconl.jpeg", ".txt", label);
+        FILE *file = fopen(label, "r");
+        if(!file){
+            find_replace(label, "labels", "labels2", label);
+            file = fopen(label, "r");
+            if(!file) continue;
+        }
+        ++count;
+        int tag;
+        while(fscanf(file, "%d", &tag) == 1){
+            // tags come from the file: a negative value would index in front
+            // of the row, so it is ignored like a tag that is too large
+            if(tag >= 0 && tag < k){
+                y.vals[i][tag] = 1;
+            }
+        }
+        fclose(file);
+    }
+    printf("%d/%d\n", count, n);
+    return y;
+}
+
+// Read the entries of `filename` with get_paths() and return them as an array
+// of strings. When `size` is not NULL it receives the number of entries.
+// The intermediate list is released; the returned array is not.
+char **get_labels_custom(char *filename, int *size)
+{
+    char **labels;
+    list *plist = get_paths(filename);
+
+    if (size != NULL) {
+        *size = plist->size;
+    }
+    labels = (char **)list_to_array(plist);
+    free_list(plist);
+    return labels;
+}
+
+// Convenience form of get_labels_custom() for callers that do not need the
+// number of labels.
+char **get_labels(char *filename)
+{
+    char **labels = get_labels_custom(filename, NULL);
+    return labels;
+}
+
+// Release a data batch. A shallow batch only owns its two row-pointer arrays;
+// otherwise both matrices are released through free_matrix().
+void free_data(data d)
+{
+    if (d.shallow) {
+        free(d.X.vals);
+        free(d.y.vals);
+        return;
+    }
+    free_matrix(d.X);
+    free_matrix(d.y);
+}
+
+// Load a batch of n randomly chosen images for region-style training.
+//
+// Each image is cropped with a random margin of up to `jitter` times its size
+// on every side (a negative margin extends the crop beyond the image),
+// resized to w x h, randomly flipped and color-distorted. The matching truth
+// row (size * size * (5 + classes) floats) is produced by fill_truth_region()
+// with the same crop and flip parameters.
+data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter, float hue, float saturation, float exposure)
+{
+    char **random_paths = get_random_paths(paths, n, m);
+    const int k = size*size*(5+classes);
+    data d = {0};
+    int i;
+
+    d.shallow = 0;
+    d.X.rows = n;
+    d.X.cols = h*w*3;
+    d.X.vals = (float**)xcalloc(d.X.rows, sizeof(float*));
+    d.y = make_matrix(n, k);
+
+    for (i = 0; i < n; ++i) {
+        char *img_path = random_paths[i];
+        image orig = load_image_color(img_path, 0, 0);
+        const int ow = orig.w;
+        const int oh = orig.h;
+
+        // largest margin, in pixels, that may be cut from / added to a side
+        const int dw = (ow*jitter);
+        const int dh = (oh*jitter);
+
+        const int pleft  = rand_uniform(-dw, dw);
+        const int pright = rand_uniform(-dw, dw);
+        const int ptop   = rand_uniform(-dh, dh);
+        const int pbot   = rand_uniform(-dh, dh);
+
+        const int swidth  = ow - pleft - pright;
+        const int sheight = oh - ptop - pbot;
+
+        // crop size relative to the original, and crop origin in crop units
+        const float sx = (float)swidth  / ow;
+        const float sy = (float)sheight / oh;
+        const int flip = random_gen()%2;
+        const float dx = ((float)pleft/ow)/sx;
+        const float dy = ((float)ptop /oh)/sy;
+
+        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
+        image sized = resize_image(cropped, w, h);
+        if (flip) {
+            flip_image(sized);
+        }
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data;   // the row takes over the resized buffer
+
+        fill_truth_region(img_path, d.y.vals[i], classes, size, flip, dx, dy, 1./sx, 1./sy);
+
+        free_image(cropped);
+        free_image(orig);
+    }
+    free(random_paths);
+    return d;
+}
+
+// Fold the "<class id> <iou>" records of the label file that belongs to
+// `image_path` ("imgs" -> "labels", "jpg" -> "txt") into `row`, keeping the
+// highest IoU per class at row[2*id + slot].
+// A label file that cannot be opened leaves `row` untouched (a message is
+// printed); records with a class id outside [0, classes) are ignored.
+static void load_compare_labels(char *image_path, float *row, int classes, int slot)
+{
+    char labelpath[4096];
+    int id;
+    float iou;
+    FILE *fp;
+
+    find_replace(image_path, "imgs", "labels", labelpath);
+    find_replace(labelpath, "jpg", "txt", labelpath);
+    fp = fopen(labelpath, "r");
+    if (!fp) {
+        printf("Can't open label file: %s \n", labelpath);
+        return;
+    }
+    while (fscanf(fp, "%d %f", &id, &iou) == 2) {
+        if (id < 0 || id >= classes) continue;  // would index outside the row
+        if (row[2*id + slot] < iou) row[2*id + slot] = iou;
+    }
+    fclose(fp);
+}
+
+// Load n image pairs for the comparison task.
+//
+// Row i of X is the concatenation of images paths[2i] and paths[2i+1], both
+// loaded at w x h with 3 channels. Row i of y holds two floats per class:
+// (1, 0) when only the first image has an IoU above .5 for that class,
+// (0, 1) when only the second one has, and (SECRET_NUM, SECRET_NUM) in every
+// other case. When m is non-zero the 2*n paths are first drawn with
+// get_random_paths() from the m entries of `paths`.
+data load_data_compare(int n, char **paths, int m, int classes, int w, int h)
+{
+    if(m) paths = get_random_paths(paths, 2*n, m);
+    int i,j;
+    data d = {0};
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = (float**)xcalloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*6;
+
+    int k = 2*(classes);
+    d.y = make_matrix(n, k);
+    for(i = 0; i < n; ++i){
+        image im1 = load_image_color(paths[i*2],   w, h);
+        image im2 = load_image_color(paths[i*2+1], w, h);
+
+        // both images side by side in one input row
+        d.X.vals[i] = (float*)xcalloc(d.X.cols, sizeof(float));
+        memcpy(d.X.vals[i],         im1.data, h*w*3*sizeof(float));
+        memcpy(d.X.vals[i] + h*w*3, im2.data, h*w*3*sizeof(float));
+
+        free_image(im1);
+        free_image(im2);
+
+        // best IoU per class: even columns for image 1, odd ones for image 2
+        load_compare_labels(paths[i*2],   d.y.vals[i], classes, 0);
+        load_compare_labels(paths[i*2+1], d.y.vals[i], classes, 1);
+
+        for (j = 0; j < classes; ++j){
+            if (d.y.vals[i][2*j] > .5 &&  d.y.vals[i][2*j+1] < .5){
+                d.y.vals[i][2*j] = 1;
+                d.y.vals[i][2*j+1] = 0;
+            } else if (d.y.vals[i][2*j] < .5 &&  d.y.vals[i][2*j+1] > .5){
+                d.y.vals[i][2*j] = 0;
+                d.y.vals[i][2*j+1] = 1;
+            } else {
+                // no clear winner for this class
+                d.y.vals[i][2*j]   = SECRET_NUM;
+                d.y.vals[i][2*j+1] = SECRET_NUM;
+            }
+        }
+    }
+    if(m) free(paths);
+    return d;
+}
+
+// Load a single randomly chosen image as a one-row batch.
+//
+// The image is cropped with a random margin of up to `jitter` times its size
+// on every side, resized back to its original dimensions and randomly
+// flipped. d.w / d.h carry those dimensions. The truth row holds
+// 30 * (4 + classes) floats and is produced by fill_truth_swag() with the
+// same crop and flip parameters.
+data load_data_swag(char **paths, int n, int classes, float jitter)
+{
+    const int pick = random_gen()%n;
+    char *random_path = paths[pick];
+    image orig = load_image_color(random_path, 0, 0);
+    const int w = orig.w;
+    const int h = orig.h;
+    const int k = (4+classes)*30;
+    data d = {0};
+
+    d.shallow = 0;
+    d.w = w;
+    d.h = h;
+    d.X.rows = 1;
+    d.X.cols = h*w*3;
+    d.X.vals = (float**)xcalloc(d.X.rows, sizeof(float*));
+    d.y = make_matrix(1, k);
+
+    // largest margin, in pixels, that may be cut from / added to a side
+    const int dw = w*jitter;
+    const int dh = h*jitter;
+
+    const int pleft  = rand_uniform(-dw, dw);
+    const int pright = rand_uniform(-dw, dw);
+    const int ptop   = rand_uniform(-dh, dh);
+    const int pbot   = rand_uniform(-dh, dh);
+
+    const int swidth  = w - pleft - pright;
+    const int sheight = h - ptop - pbot;
+
+    // crop size relative to the original, and crop origin in crop units
+    const float sx = (float)swidth  / w;
+    const float sy = (float)sheight / h;
+    const int flip = random_gen()%2;
+    const float dx = ((float)pleft/w)/sx;
+    const float dy = ((float)ptop /h)/sy;
+
+    image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
+    image sized = resize_image(cropped, w, h);
+    if (flip) {
+        flip_image(sized);
+    }
+    d.X.vals[0] = sized.data;   // the row takes over the resized buffer
+
+    fill_truth_swag(random_path, d.y.vals[0], classes, flip, dx, dy, 1./sx, 1./sy);
+
+    free_image(cropped);
+    free_image(orig);
+
+    return d;
+}
+
+// Append the boxes of `old_truth` behind the boxes already in `new_truth`.
+// Both arrays hold up to `boxes` entries of truth_size floats; an entry whose
+// first float is 0 ends the list. Only the first five floats of an entry are
+// copied, and copying stops when `new_truth` is full.
+void blend_truth(float *new_truth, int boxes, int truth_size, float *old_truth)
+{
+    int used = 0;   // entries already present in new_truth
+    int t;
+
+    while (used < boxes && new_truth[used*truth_size]) {
+        ++used;
+    }
+
+    for (t = used; t < boxes; ++t) {
+        float *src = old_truth + (t - used)*truth_size;
+        float *dst = new_truth + t*truth_size;
+        int f;
+
+        if (!src[0]) break;     // end of the old list
+        for (f = 0; f < 5; ++f) {
+            dst[f] = src[f];
+        }
+    }
+}
+
+
+// Append the boxes of one mosaic tile (`old_truth`) behind the boxes already
+// stored in `new_truth`, after moving them to the tile's position in the
+// combined image.
+//
+// Both arrays hold up to `boxes` entries of truth_size floats; an entry whose
+// first float is 0 ends the list. Box coordinates are normalized; w and h are
+// the pixel dimensions used to convert them to pixels and back.
+//   cut_x, cut_y  - split point of the mosaic, in pixels
+//   i_mixup       - tile index: 0 = left/top, 1 = right/top,
+//                   2 = left/bottom, 3 = right/bottom
+//   *_shift       - per-side pixel shifts that enter the tile offset
+//   net_w, net_h  - boxes not larger than 1/net_w x 1/net_h are dropped
+//   mosaic_bound  - non-zero: clip each box to its own tile;
+//                   zero: clip each box to the whole image only
+// Only the first five floats of an entry are written (x, y, w, h and the
+// fifth value copied from the old entry).
+void blend_truth_mosaic(float *new_truth, int boxes, int truth_size, float *old_truth, int w, int h, float cut_x, float cut_y, int i_mixup,
+    int left_shift, int right_shift, int top_shift, int bot_shift,
+    int net_w, int net_h, int mosaic_bound)
+{
+    const float lowest_w = 1.F / net_w;
+    const float lowest_h = 1.F / net_h;
+
+    // count the entries already present in new_truth
+    int count_new_truth = 0;
+    int t;
+    for (t = 0; t < boxes; ++t) {
+        float x = new_truth[t*truth_size];
+        if (!x) break;
+        count_new_truth++;
+
+    }
+    // new_t is the next free slot; it only advances when a box is accepted
+    int new_t = count_new_truth;
+    for (t = count_new_truth; t < boxes; ++t) {
+        float *new_truth_ptr = new_truth + new_t*truth_size;
+        // keep the list terminated in case this box is rejected below
+        new_truth_ptr[0] = 0;
+        float *old_truth_ptr = old_truth + (t - count_new_truth)*truth_size;
+        float x = old_truth_ptr[0];
+        if (!x) break;  // end of the old list
+
+        float xb = old_truth_ptr[0];
+        float yb = old_truth_ptr[1];
+        float wb = old_truth_ptr[2];
+        float hb = old_truth_ptr[3];
+
+
+
+        // shift 4 images
+        if (i_mixup == 0) {
+            xb = xb - (float)(w - cut_x - right_shift) / w;
+            yb = yb - (float)(h - cut_y - bot_shift) / h;
+        }
+        if (i_mixup == 1) {
+            xb = xb + (float)(cut_x - left_shift) / w;
+            yb = yb - (float)(h - cut_y - bot_shift) / h;
+        }
+        if (i_mixup == 2) {
+            xb = xb - (float)(w - cut_x - right_shift) / w;
+            yb = yb + (float)(cut_y - top_shift) / h;
+        }
+        if (i_mixup == 3) {
+            xb = xb + (float)(cut_x - left_shift) / w;
+            yb = yb + (float)(cut_y - top_shift) / h;
+        }
+
+        // box edges in pixels (truncated to int)
+        int left = (xb - wb / 2)*w;
+        int right = (xb + wb / 2)*w;
+        int top = (yb - hb / 2)*h;
+        int bot = (yb + hb / 2)*h;
+
+        if(mosaic_bound)
+        {
+            // fix out of Mosaic-bound
+            // each tile owns one rectangle delimited by the cut point
+            float left_bound = 0, right_bound = 0, top_bound = 0, bot_bound = 0;
+            if (i_mixup == 0) {
+                left_bound = 0;
+                right_bound = cut_x;
+                top_bound = 0;
+                bot_bound = cut_y;
+            }
+            if (i_mixup == 1) {
+                left_bound = cut_x;
+                right_bound = w;
+                top_bound = 0;
+                bot_bound = cut_y;
+            }
+            if (i_mixup == 2) {
+                left_bound = 0;
+                right_bound = cut_x;
+                top_bound = cut_y;
+                bot_bound = h;
+            }
+            if (i_mixup == 3) {
+                left_bound = cut_x;
+                right_bound = w;
+                top_bound = cut_y;
+                bot_bound = h;
+            }
+
+
+            // clip the pixel edges to the tile rectangle
+            if (left < left_bound) {
+                //printf(" i_mixup = %d, left = %d, left_bound = %f \n", i_mixup, left, left_bound);
+                left = left_bound;
+            }
+            if (right > right_bound) {
+                //printf(" i_mixup = %d, right = %d, right_bound = %f \n", i_mixup, right, right_bound);
+                right = right_bound;
+            }
+            if (top < top_bound) top = top_bound;
+            if (bot > bot_bound) bot = bot_bound;
+
+
+            // back to normalized center / size from the clipped edges
+            xb = ((float)(right + left) / 2) / w;
+            wb = ((float)(right - left)) / w;
+            yb = ((float)(bot + top) / 2) / h;
+            hb = ((float)(bot - top)) / h;
+        }
+        else
+        {
+            // fix out of bound
+            // the part that sticks out of the image is removed from the size,
+            // and the center moves by half of that amount
+            if (left < 0) {
+                float diff = (float)left / w;   // negative
+                xb = xb - diff / 2;
+                wb = wb + diff;
+            }
+
+            if (right > w) {
+                float diff = (float)(right - w) / w;
+                xb = xb - diff / 2;
+                wb = wb - diff;
+            }
+
+            if (top < 0) {
+                float diff = (float)top / h;    // negative
+                yb = yb - diff / 2;
+                hb = hb + diff;
+            }
+
+            if (bot > h) {
+                float diff = (float)(bot - h) / h;
+                yb = yb - diff / 2;
+                hb = hb - diff;
+            }
+
+            // pixel edges of the corrected box, checked below
+            left = (xb - wb / 2)*w;
+            right = (xb + wb / 2)*w;
+            top = (yb - hb / 2)*h;
+            bot = (yb + hb / 2)*h;
+        }
+
+
+        // leave only within the image
+        // (and only boxes larger than one network pixel)
+        if(left >= 0 && right <= w && top >= 0 && bot <= h &&
+            wb > 0 && wb < 1 && hb > 0 && hb < 1 &&
+            xb > 0 && xb < 1 && yb > 0 && yb < 1 &&
+            wb > lowest_w && hb > lowest_h)
+        {
+            new_truth_ptr[0] = xb;
+            new_truth_ptr[1] = yb;
+            new_truth_ptr[2] = wb;
+            new_truth_ptr[3] = hb;
+            new_truth_ptr[4] = old_truth_ptr[4];
+            new_t++;
+        }
+    }
+    //printf("\n was %d bboxes, now %d bboxes \n", count_new_truth, t);
+}
+
+#ifdef OPENCV
+
+#include "http_stream.h"
+
+data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int truth_size, int classes, int use_flip, int use_gaussian_noise, int use_blur, int use_mixup,
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int contrastive, int contrastive_jit_flip, int contrastive_color, int show_imgs)
+{
+    const int random_index = random_gen();
+    c = c ? c : 3;
+
+    if (use_mixup == 2 || use_mixup == 4) {
+        printf("\n cutmix=1 - isn't supported for Detector (use cutmix=1 only for Classifier) \n");
+        if(use_mixup == 2) use_mixup = 0;
+        else use_mixup = 3;
+    }
+    if (use_mixup == 3 && letter_box) {
+        error("Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters", DARKNET_LOC);
+    }
+    if (random_gen() % 2 == 0) use_mixup = 0;
+    int i;
+
+    int *cut_x = NULL, *cut_y = NULL;
+    if (use_mixup == 3) {
+        cut_x = (int*)calloc(n, sizeof(int));
+        cut_y = (int*)calloc(n, sizeof(int));
+        const float min_offset = 0.2; // 20%
+        for (i = 0; i < n; ++i) {
+            cut_x[i] = rand_int(w*min_offset, w*(1 - min_offset));
+            cut_y[i] = rand_int(h*min_offset, h*(1 - min_offset));
+        }
+    }
+
+    data d = {0};
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = (float**)xcalloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*c;
+
+    float r1 = 0, r2 = 0, r3 = 0, r4 = 0, r_scale = 0;
+    float resize_r1 = 0, resize_r2 = 0;
+    float dhue = 0, dsat = 0, dexp = 0, flip = 0, blur = 0;
+    int augmentation_calculated = 0, gaussian_noise = 0;
+
+    d.y = make_matrix(n, truth_size*boxes);
+    int i_mixup = 0;
+    for (i_mixup = 0; i_mixup <= use_mixup; i_mixup++) {
+        if (i_mixup) augmentation_calculated = 0;   // recalculate augmentation for the 2nd sequence if(track==1)
+
+        char **random_paths;
+        if (track) random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed, contrastive);
+        else random_paths = get_random_paths_custom(paths, n, m, contrastive);
+
+        for (i = 0; i < n; ++i) {
+            float *truth = (float*)xcalloc(truth_size * boxes, sizeof(float));
+            const char *filename = random_paths[i];
+
+            int flag = (c >= 3);
+            mat_cv *src;
+            src = load_image_mat_cv(filename, flag);
+            if (src == NULL) {
+                printf("\n Error in load_data_detection() - OpenCV \n");
+                fflush(stdout);
+                continue;
+            }
+
+            int oh = get_height_mat(src);
+            int ow = get_width_mat(src);
+
+            int dw = (ow*jitter);
+            int dh = (oh*jitter);
+
+            float resize_down = resize, resize_up = resize;
+            if (resize_down > 1.0) resize_down = 1 / resize_down;
+            int min_rdw = ow*(1 - (1 / resize_down)) / 2;   // < 0
+            int min_rdh = oh*(1 - (1 / resize_down)) / 2;   // < 0
+
+            if (resize_up < 1.0) resize_up = 1 / resize_up;
+            int max_rdw = ow*(1 - (1 / resize_up)) / 2;     // > 0
+            int max_rdh = oh*(1 - (1 / resize_up)) / 2;     // > 0
+            //printf(" down = %f, up = %f \n", (1 - (1 / resize_down)) / 2, (1 - (1 / resize_up)) / 2);
+
+            if (!augmentation_calculated || !track)
+            {
+                augmentation_calculated = 1;
+                resize_r1 = random_float();
+                resize_r2 = random_float();
+
+                if (!contrastive || contrastive_jit_flip || i % 2 == 0)
+                {
+                    r1 = random_float();
+                    r2 = random_float();
+                    r3 = random_float();
+                    r4 = random_float();
+
+                    flip = use_flip ? random_gen() % 2 : 0;
+                }
+
+                r_scale = random_float();
+
+                if (!contrastive || contrastive_color || i % 2 == 0)
+                {
+                    dhue = rand_uniform_strong(-hue, hue);
+                    dsat = rand_scale(saturation);
+                    dexp = rand_scale(exposure);
+                }
+
+                if (use_blur) {
+                    int tmp_blur = rand_int(0, 2);  // 0 - disable, 1 - blur background, 2 - blur the whole image
+                    if (tmp_blur == 0) blur = 0;
+                    else if (tmp_blur == 1) blur = 1;
+                    else blur = use_blur;
+                }
+
+                if (use_gaussian_noise && rand_int(0, 1) == 1) gaussian_noise = use_gaussian_noise;
+                else gaussian_noise = 0;
+            }
+
+            int pleft = rand_precalc_random(-dw, dw, r1);
+            int pright = rand_precalc_random(-dw, dw, r2);
+            int ptop = rand_precalc_random(-dh, dh, r3);
+            int pbot = rand_precalc_random(-dh, dh, r4);
+
+            if (resize < 1) {
+                // downsize only
+                pleft += rand_precalc_random(min_rdw, 0, resize_r1);
+                pright += rand_precalc_random(min_rdw, 0, resize_r2);
+                ptop += rand_precalc_random(min_rdh, 0, resize_r1);
+                pbot += rand_precalc_random(min_rdh, 0, resize_r2);
+            }
+            else {
+                pleft += rand_precalc_random(min_rdw, max_rdw, resize_r1);
+                pright += rand_precalc_random(min_rdw, max_rdw, resize_r2);
+                ptop += rand_precalc_random(min_rdh, max_rdh, resize_r1);
+                pbot += rand_precalc_random(min_rdh, max_rdh, resize_r2);
+            }
+
+            //printf("\n pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", pleft, pright, ptop, pbot, ow, oh);
+
+            //float scale = rand_precalc_random(.25, 2, r_scale); // unused currently
+            //printf(" letter_box = %d \n", letter_box);
+
+            if (letter_box)
+            {
+                float img_ar = (float)ow / (float)oh;
+                float net_ar = (float)w / (float)h;
+                float result_ar = img_ar / net_ar;
+                //printf(" ow = %d, oh = %d, w = %d, h = %d, img_ar = %f, net_ar = %f, result_ar = %f \n", ow, oh, w, h, img_ar, net_ar, result_ar);
+                if (result_ar > 1)  // sheight - should be increased
+                {
+                    float oh_tmp = ow / net_ar;
+                    float delta_h = (oh_tmp - oh)/2;
+                    ptop = ptop - delta_h;
+                    pbot = pbot - delta_h;
+                    //printf(" result_ar = %f, oh_tmp = %f, delta_h = %d, ptop = %f, pbot = %f \n", result_ar, oh_tmp, delta_h, ptop, pbot);
+                }
+                else  // swidth - should be increased
+                {
+                    float ow_tmp = oh * net_ar;
+                    float delta_w = (ow_tmp - ow)/2;
+                    pleft = pleft - delta_w;
+                    pright = pright - delta_w;
+                    //printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright);
+                }
+
+                //printf("\n pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", pleft, pright, ptop, pbot, ow, oh);
+            }
+
+            // move each 2nd image to the corner - so that most of it was visible
+            if (use_mixup == 3 && random_gen() % 2 == 0) {
+                if (flip) {
+                    if (i_mixup == 0) pleft += pright, pright = 0, pbot += ptop, ptop = 0;
+                    if (i_mixup == 1) pright += pleft, pleft = 0, pbot += ptop, ptop = 0;
+                    if (i_mixup == 2) pleft += pright, pright = 0, ptop += pbot, pbot = 0;
+                    if (i_mixup == 3) pright += pleft, pleft = 0, ptop += pbot, pbot = 0;
+                }
+                else {
+                    if (i_mixup == 0) pright += pleft, pleft = 0, pbot += ptop, ptop = 0;
+                    if (i_mixup == 1) pleft += pright, pright = 0, pbot += ptop, ptop = 0;
+                    if (i_mixup == 2) pright += pleft, pleft = 0, ptop += pbot, pbot = 0;
+                    if (i_mixup == 3) pleft += pright, pright = 0, ptop += pbot, pbot = 0;
+                }
+            }
+
+            int swidth = ow - pleft - pright;
+            int sheight = oh - ptop - pbot;
+
+            if (swidth <=  0 || sheight <= 0 || (ow - pleft) <= 0 || (oh - ptop) <= 0 ) {
+                printf("\n WARNING: invalid resize. Resetting swidth: %d , sheight:  %d, pleft: %d, ptop: %d \n", dw, dh, 0 ,0);
+                printf("\n Original values: \n swidth = %d, sheight = %d, pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", swidth, sheight, pleft, pright, ptop, pbot, ow, oh);
+                swidth = ow;
+                sheight = oh;
+                pleft = 0;
+                ptop = 0;
+            }
+
+            float sx = (float)swidth / ow;
+            float sy = (float)sheight / oh;
+
+            float dx = ((float)pleft / ow) / sx;
+            float dy = ((float)ptop / oh) / sy;
+
+
+            int min_w_h = fill_truth_detection(filename, boxes, truth_size, truth, classes, flip, dx, dy, 1. / sx, 1. / sy, w, h);
+            //for (int z = 0; z < boxes; ++z) if(truth[z*truth_size] > 0) printf(" track_id = %f \n", truth[z*truth_size + 5]);
+            //printf(" truth_size = %d \n", truth_size);
+
+            if ((min_w_h / 8) < blur && blur > 1) blur = min_w_h / 8;   // disable blur if one of the objects is too small
+
+            image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp,
+                gaussian_noise, blur, boxes, truth_size, truth);
+
+            if (use_mixup == 0) {
+                d.X.vals[i] = ai.data;
+                memcpy(d.y.vals[i], truth, truth_size * boxes * sizeof(float));
+            }
+            else if (use_mixup == 1) {
+                if (i_mixup == 0) {
+                    d.X.vals[i] = ai.data;
+                    memcpy(d.y.vals[i], truth, truth_size * boxes * sizeof(float));
+                }
+                else if (i_mixup == 1) {
+                    image old_img = make_empty_image(w, h, c);
+                    old_img.data = d.X.vals[i];
+                    //show_image(ai, "new");
+                    //show_image(old_img, "old");
+                    //wait_until_press_key_cv();
+                    blend_images_cv(ai, 0.5, old_img, 0.5);
+                    blend_truth(d.y.vals[i], boxes, truth_size, truth);
+                    free_image(old_img);
+                    d.X.vals[i] = ai.data;
+                }
+            }
+            else if (use_mixup == 3) {
+                if (i_mixup == 0) {
+                    image tmp_img = make_image(w, h, c);
+                    d.X.vals[i] = tmp_img.data;
+                }
+
+                if (flip) {
+                    int tmp = pleft;
+                    pleft = pright;
+                    pright = tmp;
+                }
+
+                const int left_shift = min_val_cmp(cut_x[i], max_val_cmp(0, (-pleft*w / ow)));
+                const int top_shift = min_val_cmp(cut_y[i], max_val_cmp(0, (-ptop*h / oh)));
+
+                const int right_shift = min_val_cmp((w - cut_x[i]), max_val_cmp(0, (-pright*w / ow)));
+                const int bot_shift = min_val_cmp(h - cut_y[i], max_val_cmp(0, (-pbot*h / oh)));
+
+
+                int k, x, y;
+                for (k = 0; k < c; ++k) {
+                    for (y = 0; y < h; ++y) {
+                        int j = y*w + k*w*h;
+                        if (i_mixup == 0 && y < cut_y[i]) {
+                            int j_src = (w - cut_x[i] - right_shift) + (y + h - cut_y[i] - bot_shift)*w + k*w*h;
+                            memcpy(&d.X.vals[i][j + 0], &ai.data[j_src], cut_x[i] * sizeof(float));
+                        }
+                        if (i_mixup == 1 && y < cut_y[i]) {
+                            int j_src = left_shift + (y + h - cut_y[i] - bot_shift)*w + k*w*h;
+                            memcpy(&d.X.vals[i][j + cut_x[i]], &ai.data[j_src], (w-cut_x[i]) * sizeof(float));
+                        }
+                        if (i_mixup == 2 && y >= cut_y[i]) {
+                            int j_src = (w - cut_x[i] - right_shift) + (top_shift + y - cut_y[i])*w + k*w*h;
+                            memcpy(&d.X.vals[i][j + 0], &ai.data[j_src], cut_x[i] * sizeof(float));
+                        }
+                        if (i_mixup == 3 && y >= cut_y[i]) {
+                            int j_src = left_shift + (top_shift + y - cut_y[i])*w + k*w*h;
+                            memcpy(&d.X.vals[i][j + cut_x[i]], &ai.data[j_src], (w - cut_x[i]) * sizeof(float));
+                        }
+                    }
+                }
+
+                blend_truth_mosaic(d.y.vals[i], boxes, truth_size, truth, w, h, cut_x[i], cut_y[i], i_mixup, left_shift, right_shift, top_shift, bot_shift, w, h, mosaic_bound);
+
+                free_image(ai);
+                ai.data = d.X.vals[i];
+            }
+
+
+            if (show_imgs && i_mixup == use_mixup)   // delete i_mixup
+            {
+                image tmp_ai = copy_image(ai);
+                char buff[1000];
+                //sprintf(buff, "aug_%d_%d_%s_%d", random_index, i, basecfg((char*)filename), random_gen());
+                sprintf(buff, "aug_%d_%d_%d", random_index, i, random_gen());
+                int t;
+                for (t = 0; t < boxes; ++t) {
+                    box b = float_to_box_stride(d.y.vals[i] + t*truth_size, 1);
+                    if (!b.x) break;
+                    int left = (b.x - b.w / 2.)*ai.w;
+                    int right = (b.x + b.w / 2.)*ai.w;
+                    int top = (b.y - b.h / 2.)*ai.h;
+                    int bot = (b.y + b.h / 2.)*ai.h;
+                    draw_box_width(tmp_ai, left, top, right, bot, 1, 150, 100, 50); // 3 channels RGB
+                }
+
+                save_image(tmp_ai, buff);
+                if (show_imgs == 1) {
+                    //char buff_src[1000];
+                    //sprintf(buff_src, "src_%d_%d_%s_%d", random_index, i, basecfg((char*)filename), random_gen());
+                    //show_image_mat(src, buff_src);
+                    show_image(tmp_ai, buff);
+                    wait_until_press_key_cv();
+                }
+                printf("\nYou use flag -show_imgs, so will be saved aug_...jpg images. Click on window and press ESC button \n");
+                free_image(tmp_ai);
+            }
+
+            release_mat(&src);
+            free(truth);
+        }
+        if (random_paths) free(random_paths);
+    }
+
+
+    return d;
+}
+#else    // OPENCV
+// Weighted in-place blend used by the non-OpenCV MixUp path:
+// every element of new_img.data becomes new * alpha + old * beta.
+// The element count is taken from new_img (w * h * c); old_img is only read.
+void blend_images(image new_img, float alpha, image old_img, float beta)
+{
+    const int total = new_img.w * new_img.h * new_img.c;
+    float *dst = new_img.data;
+    const float *src = old_img.data;
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < total; ++idx) {
+        dst[idx] = dst[idx] * alpha + src[idx] * beta;
+    }
+}
+
+// Build one detector-training batch without OpenCV.
+//
+// n images are chosen with get_sequential_paths() when `track` is set and with
+// get_random_paths_custom() otherwise. Each image is cropped with a
+// jitter/resize offset, optionally letterboxed, resized to w x h, optionally
+// flipped, colour-distorted, and its labels are written by
+// fill_truth_detection(). When use_mixup is non-zero a second pass is taken
+// with probability 1/2 and blended 50/50 into the first pass.
+//
+// gaussian_noise, use_blur and mosaic_bound are not used by this build of the
+// function. use_mixup values 2, 3 and 4 are reported through error().
+//
+// Returns a data struct with n rows in X (h*w*c floats each) and a y matrix
+// created by make_matrix(n, truth_size * boxes); d.shallow is 0.
+data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int truth_size, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup,
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int contrastive, int contrastive_jit_flip, int contrastive_color, int show_imgs)
+{
+    const int random_index = random_gen();
+    c = c ? c : 3;  // default to 3 channels when the caller passes 0
+    char **random_paths;
+    char **mixup_random_paths = NULL;
+    if(track) random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed, contrastive);
+    else random_paths = get_random_paths_custom(paths, n, m, contrastive);
+
+    //assert(use_mixup < 2);
+    if (use_mixup == 2) {
+        error("cutmix=1 - isn't supported for Detector", DARKNET_LOC);
+    }
+    if (use_mixup == 3 || use_mixup == 4) {
+        error("mosaic=1 - compile Darknet with OpenCV for using mosaic=1", DARKNET_LOC);
+    }
+    // MixUp is applied to roughly half of the batches when enabled.
+    int mixup = use_mixup ? random_gen() % 2 : 0;
+    //printf("\n mixup = %d \n", mixup);
+    if (mixup) {
+        if (track) mixup_random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed, contrastive);
+        else mixup_random_paths = get_random_paths(paths, n, m);
+    }
+
+    int i;
+    data d = { 0 };
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = (float**)xcalloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*c;
+
+    // Pre-drawn random values. In track mode they are drawn once per pass so
+    // every image of the pass gets the same augmentation.
+    // r_scale is drawn but not used; the draw is kept so the random sequence
+    // consumed by this function does not change.
+    float r1 = 0, r2 = 0, r3 = 0, r4 = 0, r_scale;
+    float resize_r1 = 0, resize_r2 = 0;
+    float dhue = 0, dsat = 0, dexp = 0, flip = 0;
+    int augmentation_calculated = 0;
+
+    d.y = make_matrix(n, truth_size * boxes);
+    int i_mixup = 0;
+    for (i_mixup = 0; i_mixup <= mixup; i_mixup++) {
+        if (i_mixup) augmentation_calculated = 0;  // new augmentation for the MixUp pass
+        for (i = 0; i < n; ++i) {
+            float *truth = (float*)xcalloc(truth_size * boxes, sizeof(float));
+            char *filename = (i_mixup) ? mixup_random_paths[i] : random_paths[i];
+
+            image orig = load_image(filename, 0, 0, c);
+
+            int oh = orig.h;
+            int ow = orig.w;
+
+            // Maximum jitter offset, in source pixels, per side.
+            int dw = (ow*jitter);
+            int dh = (oh*jitter);
+
+            // Per-side offset range derived from `resize`:
+            // min_* comes from the factor folded to <= 1, max_* from the factor folded to >= 1.
+            float resize_down = resize, resize_up = resize;
+            if (resize_down > 1.0) resize_down = 1 / resize_down;
+            int min_rdw = ow*(1 - (1 / resize_down)) / 2;
+            int min_rdh = oh*(1 - (1 / resize_down)) / 2;
+
+            if (resize_up < 1.0) resize_up = 1 / resize_up;
+            int max_rdw = ow*(1 - (1 / resize_up)) / 2;
+            int max_rdh = oh*(1 - (1 / resize_up)) / 2;
+
+            if (!augmentation_calculated || !track)
+            {
+                augmentation_calculated = 1;
+                resize_r1 = random_float();
+                resize_r2 = random_float();
+
+                // In contrastive mode odd-indexed images reuse the geometry of the
+                // preceding even-indexed image unless contrastive_jit_flip is set.
+                if (!contrastive || contrastive_jit_flip || i % 2 == 0)
+                {
+                    r1 = random_float();
+                    r2 = random_float();
+                    r3 = random_float();
+                    r4 = random_float();
+
+                    flip = use_flip ? random_gen() % 2 : 0;
+                }
+
+                r_scale = random_float();
+
+                // Same pairing rule for the colour distortion, controlled by contrastive_color.
+                if (!contrastive || contrastive_color || i % 2 == 0)
+                {
+                    dhue = rand_uniform_strong(-hue, hue);
+                    dsat = rand_scale(saturation);
+                    dexp = rand_scale(exposure);
+                }
+            }
+
+            // Crop offsets per side; negative values extend past the source image.
+            int pleft = rand_precalc_random(-dw, dw, r1);
+            int pright = rand_precalc_random(-dw, dw, r2);
+            int ptop = rand_precalc_random(-dh, dh, r3);
+            int pbot = rand_precalc_random(-dh, dh, r4);
+
+            if (resize < 1) {
+                // downsize only
+                pleft += rand_precalc_random(min_rdw, 0, resize_r1);
+                pright += rand_precalc_random(min_rdw, 0, resize_r2);
+                ptop += rand_precalc_random(min_rdh, 0, resize_r1);
+                pbot += rand_precalc_random(min_rdh, 0, resize_r2);
+            }
+            else {
+                pleft += rand_precalc_random(min_rdw, max_rdw, resize_r1);
+                pright += rand_precalc_random(min_rdw, max_rdw, resize_r2);
+                ptop += rand_precalc_random(min_rdh, max_rdh, resize_r1);
+                pbot += rand_precalc_random(min_rdh, max_rdh, resize_r2);
+            }
+
+            if (letter_box)
+            {
+                // Widen the crop symmetrically along the shorter axis so that the
+                // crop has the network's aspect ratio.
+                float img_ar = (float)ow / (float)oh;
+                float net_ar = (float)w / (float)h;
+                float result_ar = img_ar / net_ar;
+                //printf(" ow = %d, oh = %d, w = %d, h = %d, img_ar = %f, net_ar = %f, result_ar = %f \n", ow, oh, w, h, img_ar, net_ar, result_ar);
+                if (result_ar > 1)  // sheight - should be increased
+                {
+                    float oh_tmp = ow / net_ar;
+                    float delta_h = (oh_tmp - oh) / 2;
+                    ptop = ptop - delta_h;
+                    pbot = pbot - delta_h;
+                    //printf(" result_ar = %f, oh_tmp = %f, delta_h = %d, ptop = %f, pbot = %f \n", result_ar, oh_tmp, delta_h, ptop, pbot);
+                }
+                else  // swidth - should be increased
+                {
+                    float ow_tmp = oh * net_ar;
+                    float delta_w = (ow_tmp - ow) / 2;
+                    pleft = pleft - delta_w;
+                    pright = pright - delta_w;
+                    //printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright);
+                }
+            }
+
+            int swidth = ow - pleft - pright;
+            int sheight = oh - ptop - pbot;
+
+            // Guard against a degenerate crop (same check as the OpenCV build):
+            // a non-positive size would make sx/sy zero or negative, divide by zero
+            // below and hand crop_image() an invalid size. Fall back to the full image.
+            if (swidth <= 0 || sheight <= 0 || (ow - pleft) <= 0 || (oh - ptop) <= 0) {
+                printf("\n WARNING: invalid resize. Resetting swidth: %d , sheight:  %d, pleft: %d, ptop: %d \n", ow, oh, 0, 0);
+                printf("\n Original values: \n swidth = %d, sheight = %d, pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", swidth, sheight, pleft, pright, ptop, pbot, ow, oh);
+                swidth = ow;
+                sheight = oh;
+                pleft = 0;
+                ptop = 0;
+            }
+
+            // Crop size relative to the source image.
+            float sx = (float)swidth / ow;
+            float sy = (float)sheight / oh;
+
+            image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
+
+            // Crop origin expressed in units of the crop size.
+            float dx = ((float)pleft / ow) / sx;
+            float dy = ((float)ptop / oh) / sy;
+
+            image sized = resize_image(cropped, w, h);
+            if (flip) flip_image(sized);
+            distort_image(sized, dhue, dsat, dexp);
+            //random_distort_image(sized, hue, saturation, exposure);
+
+            fill_truth_detection(filename, boxes, truth_size, truth, classes, flip, dx, dy, 1. / sx, 1. / sy, w, h);
+
+            if (i_mixup) {
+                // Second pass: blend the first-pass pixels (d.X.vals[i]) into the new
+                // image, merge the labels, then release the first-pass buffer.
+                image old_img = sized;
+                old_img.data = d.X.vals[i];
+                //show_image(sized, "new");
+                //show_image(old_img, "old");
+                //wait_until_press_key_cv();
+                blend_images(sized, 0.5, old_img, 0.5);
+                blend_truth(truth, boxes, truth_size, d.y.vals[i]);
+                free_image(old_img);
+            }
+
+            // The batch row takes over sized.data; `sized` itself is not freed here.
+            d.X.vals[i] = sized.data;
+            memcpy(d.y.vals[i], truth, truth_size * boxes * sizeof(float));
+
+            if (show_imgs)// && i_mixup)
+            {
+                // NOTE(review): boxes are drawn directly into `sized`, whose buffer is
+                // the batch row stored above - confirm this is acceptable for a debug flag.
+                char buff[1000];
+                sprintf(buff, "aug_%d_%d_%s_%d", random_index, i, basecfg(filename), random_gen());
+
+                int t;
+                for (t = 0; t < boxes; ++t) {
+                    box b = float_to_box_stride(d.y.vals[i] + t*truth_size, 1);
+                    if (!b.x) break;  // first row with x == 0 ends the list
+                    int left = (b.x - b.w / 2.)*sized.w;
+                    int right = (b.x + b.w / 2.)*sized.w;
+                    int top = (b.y - b.h / 2.)*sized.h;
+                    int bot = (b.y + b.h / 2.)*sized.h;
+                    draw_box_width(sized, left, top, right, bot, 1, 150, 100, 50); // 3 channels RGB
+                }
+
+                save_image(sized, buff);
+                if (show_imgs == 1) {
+                    show_image(sized, buff);
+                    wait_until_press_key_cv();
+                }
+                printf("\nYou use flag -show_imgs, so will be saved aug_...jpg images\n");
+            }
+
+            free_image(orig);
+            free_image(cropped);
+            free(truth);
+        }
+    }
+    free(random_paths);
+    if (mixup_random_paths) free(mixup_random_paths);
+    return d;
+}
+#endif    // OPENCV
+
+// Loader entry point, usable as a pthread start routine or called directly.
+// `ptr` must point to a heap-allocated load_args; it is copied, a few zero
+// fields (exposure, saturation, aspect) are defaulted to 1, the loader that
+// matches args.type is run, and `ptr` is freed before returning 0.
+// An unrecognised type loads nothing but still frees `ptr`.
+void *load_thread(void *ptr)
+{
+    load_args a = *(struct load_args*)ptr;
+
+    // A value of 0 means "not configured": fall back to the neutral value 1.
+    if (a.exposure == 0) a.exposure = 1;
+    if (a.saturation == 0) a.saturation = 1;
+    if (a.aspect == 0) a.aspect = 1;
+
+    switch (a.type) {
+    case OLD_CLASSIFICATION_DATA:
+        *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
+        break;
+    case CLASSIFICATION_DATA:
+        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.flip, a.min, a.max, a.w, a.h, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.mixup, a.blur, a.show_imgs, a.label_smooth_eps, a.dontuse_opencv, a.contrastive);
+        break;
+    case SUPER_DATA:
+        *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
+        break;
+    case WRITING_DATA:
+        *a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
+        break;
+    case REGION_DATA:
+        *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
+        break;
+    case DETECTION_DATA:
+        *a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.truth_size, a.classes, a.flip, a.gaussian_noise, a.blur, a.mixup, a.jitter, a.resize,
+            a.hue, a.saturation, a.exposure, a.mini_batch, a.track, a.augment_speed, a.letter_box, a.mosaic_bound, a.contrastive, a.contrastive_jit_flip, a.contrastive_color, a.show_imgs);
+        break;
+    case SWAG_DATA:
+        *a.d = load_data_swag(a.paths, a.n, a.classes, a.jitter);
+        break;
+    case COMPARE_DATA:
+        *a.d = load_data_compare(a.n, a.paths, a.m, a.classes, a.w, a.h);
+        break;
+    case IMAGE_DATA:
+        // Single image: keep both the original and the resized copy.
+        *(a.im) = load_image(a.path, 0, 0, a.c);
+        *(a.resized) = resize_image(*(a.im), a.w, a.h);
+        break;
+    case LETTERBOX_DATA:
+        *(a.im) = load_image(a.path, 0, 0, a.c);
+        *(a.resized) = letterbox_image(*(a.im), a.w, a.h);
+        break;
+    case TAG_DATA:
+        *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.flip, a.min, a.max, a.w, a.h, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+        break;
+    default:
+        break;
+    }
+
+    free(ptr);
+    return 0;
+}
+
+// Start load_thread() on a new pthread with a private heap copy of `args`
+// (load_thread frees that copy). Returns the handle of the created thread;
+// a failed pthread_create is reported through error().
+pthread_t load_data_in_thread(load_args args)
+{
+    pthread_t tid;
+    load_args *heap_args = (load_args*)xcalloc(1, sizeof(load_args));
+    *heap_args = args;
+
+    const int rc = pthread_create(&tid, 0, load_thread, heap_args);
+    if (rc) error("Thread creation failed", DARKNET_LOC);
+
+    return tid;
+}
+
+// Shared state of the persistent loader threads created by load_threads().
+
+// Sleep interval, in milliseconds, used by every polling loop below.
+static const int thread_wait_ms = 5;
+// Set to 1 by free_load_threads() to make the workers leave run_thread_loop().
+static volatile int flag_exit;
+// One flag per worker: load_threads() stores 1 to start a job, the worker stores 0 when it is done.
+static volatile int * run_load_data = NULL;
+// One load_args slot per worker; written and read under mtx_load_data.
+static load_args * args_swap = NULL;
+// Worker thread handles; NULL means the workers have not been created yet.
+static pthread_t* threads = NULL;
+
+// Protects the args_swap slots.
+pthread_mutex_t mtx_load_data = PTHREAD_MUTEX_INITIALIZER;
+
+// Body of one persistent loader thread.
+// `ptr` points to a heap-allocated int holding this worker's index; it is
+// freed on every exit path. The worker polls run_load_data[i], runs one
+// load_thread() job per request and clears the flag when the job is finished.
+// Always returns 0.
+void *run_thread_loop(void *ptr)
+{
+    const int i = *(int *)ptr;
+
+    while (!custom_atomic_load_int(&flag_exit)) {
+        // Idle: wait for a job request, checking for shutdown on every poll.
+        while (!custom_atomic_load_int(&run_load_data[i])) {
+            if (custom_atomic_load_int(&flag_exit)) {
+                free(ptr);
+                return 0;
+            }
+            this_thread_sleep_for(thread_wait_ms);
+        }
+
+        // Take a private heap copy of this worker's arguments under the lock;
+        // load_thread() frees the copy.
+        pthread_mutex_lock(&mtx_load_data);
+        load_args *args_local = (load_args *)xcalloc(1, sizeof(load_args));
+        *args_local = args_swap[i];
+        pthread_mutex_unlock(&mtx_load_data);
+
+        load_thread(args_local);
+
+        // Signal completion to load_threads(), which polls this flag.
+        custom_atomic_store_int(&run_load_data[i], 0);
+    }
+    free(ptr);
+    return 0;
+}
+
+// Load args.n samples by splitting the work across args.threads persistent
+// worker threads and concatenating their results into *args.d.
+// `ptr` must point to a heap-allocated load_args; it is freed here.
+// The workers are created on the first call and reused afterwards.
+// Always returns 0.
+void *load_threads(void *ptr)
+{
+    //srand(time(0));
+    int i;
+    load_args args = *(load_args *)ptr;
+    if (args.threads == 0) args.threads = 1;
+    data *out = args.d;
+    int total = args.n;
+    free(ptr);
+    // One result buffer per worker for this call.
+    data* buffers = (data*)xcalloc(args.threads, sizeof(data));
+    if (!threads) {
+        // First call: allocate the shared per-worker arrays and start the workers.
+        // NOTE(review): the arrays are sized from this call's args.threads only;
+        // a later call with a larger value would index past them - confirm callers keep it constant.
+        threads = (pthread_t*)xcalloc(args.threads, sizeof(pthread_t));
+        run_load_data = (volatile int *)xcalloc(args.threads, sizeof(int));
+        args_swap = (load_args *)xcalloc(args.threads, sizeof(load_args));
+        fprintf(stderr, " Create %d permanent cpu-threads \n", args.threads);
+
+        for (i = 0; i < args.threads; ++i) {
+            // Heap-allocated worker index; freed by run_thread_loop(). Shadows the outer `ptr`.
+            int* ptr = (int*)xcalloc(1, sizeof(int));
+            *ptr = i;
+            if (pthread_create(&threads[i], 0, run_thread_loop, ptr)) error("Thread creation failed", DARKNET_LOC);
+        }
+    }
+
+    for (i = 0; i < args.threads; ++i) {
+        args.d = buffers + i;
+        // Split `total` so that the per-worker counts sum exactly to total.
+        args.n = (i + 1) * total / args.threads - i * total / args.threads;
+
+        pthread_mutex_lock(&mtx_load_data);
+        args_swap[i] = args;
+        pthread_mutex_unlock(&mtx_load_data);
+
+        custom_atomic_store_int(&run_load_data[i], 1);  // run thread
+    }
+    // Wait by polling until every worker has cleared its flag.
+    for (i = 0; i < args.threads; ++i) {
+        while (custom_atomic_load_int(&run_load_data[i])) this_thread_sleep_for(thread_wait_ms); //   join
+    }
+
+    /*
+    pthread_t* threads = (pthread_t*)xcalloc(args.threads, sizeof(pthread_t));
+    for(i = 0; i < args.threads; ++i){
+        args.d = buffers + i;
+        args.n = (i+1) * total/args.threads - i * total/args.threads;
+        threads[i] = load_data_in_thread(args);
+    }
+    for(i = 0; i < args.threads; ++i){
+        pthread_join(threads[i], 0);
+    }
+    */
+
+    *out = concat_datas(buffers, args.threads);
+    out->shallow = 0;
+    // Mark each per-worker buffer shallow before free_data();
+    // presumably this keeps the rows now referenced by *out alive - TODO confirm in free_data().
+    for(i = 0; i < args.threads; ++i){
+        buffers[i].shallow = 1;
+        free_data(buffers[i]);
+    }
+    free(buffers);
+    //free(threads);
+    return 0;
+}
+
+// Stop and release the persistent loader threads created by load_threads().
+// `ptr` points to a load_args whose `threads` field gives the number of
+// workers to join (0 is treated as 1); `ptr` itself is not freed.
+// Does nothing when no workers exist. On return all shared worker state is
+// reset, so a later load_threads() call recreates the workers.
+void free_load_threads(void *ptr)
+{
+    load_args args = *(load_args *)ptr;
+    if (args.threads == 0) args.threads = 1;
+    int i;
+    if (threads) {
+        // Ask every worker to leave its loop, then wait for all of them.
+        custom_atomic_store_int(&flag_exit, 1);
+        for (i = 0; i < args.threads; ++i) {
+            pthread_join(threads[i], 0);
+        }
+        // Free the shared arrays and clear the pointers so no stale address
+        // is left behind in file-scope state.
+        free((void*)run_load_data);
+        run_load_data = NULL;
+        free(args_swap);
+        args_swap = NULL;
+        free(threads);
+        threads = NULL;
+        // Re-arm the exit flag for the next generation of workers.
+        custom_atomic_store_int(&flag_exit, 0);
+    }
+}
+
+// Start load_threads() on a new pthread with a private heap copy of `args`
+// (load_threads frees that copy). Returns the handle of the created thread;
+// a failed pthread_create is reported through error().
+pthread_t load_data(load_args args)
+{
+    pthread_t tid;
+    load_args *heap_args = (load_args*)xcalloc(1, sizeof(load_args));
+    *heap_args = args;
+
+    const int rc = pthread_create(&tid, 0, load_threads, heap_args);
+    if (rc) error("Thread creation failed", DARKNET_LOC);
+
+    return tid;
+}
+
+// Load n input images (w x h) into X and their "-label.png" counterparts
+// (out_w x out_h, grayscale loader) into y.
+// When m is non-zero the n paths are first drawn with get_random_paths().
+data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int out_h)
+{
+    char **inputs = m ? get_random_paths(paths, n, m) : paths;
+    // Label path = input path with ".png" replaced by "-label.png".
+    char **label_paths = find_replace_paths(inputs, n, ".png", "-label.png");
+
+    data batch = {0};
+    batch.shallow = 0;
+    batch.X = load_image_paths(inputs, n, w, h);
+    batch.y = load_image_paths_gray(label_paths, n, out_w, out_h);
+
+    // The sampled path array is ours to free only when we created it.
+    if (m) free(inputs);
+
+    int idx = 0;
+    while (idx < n) {
+        free(label_paths[idx]);
+        ++idx;
+    }
+    free(label_paths);
+
+    return batch;
+}
+
+// Plain classification loader: n images resized by load_image_paths() to
+// w x h in X, and labels from load_labels_paths() over k classes in y.
+// When m is non-zero the n paths are first drawn with get_random_paths().
+data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h)
+{
+    char **picked = m ? get_random_paths(paths, n, m) : paths;
+
+    data batch = {0};
+    batch.shallow = 0;
+    batch.X = load_image_paths(picked, n, w, h);
+    batch.y = load_labels_paths(picked, n, labels, k, 0, 0, 0);
+
+    // Only the sampled array was allocated here.
+    if (m) free(picked);
+    return batch;
+}
+
+/*
+   data load_data_study(char **paths, int n, int m, char **labels, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+   {
+   data d = {0};
+   d.indexes = calloc(n, sizeof(int));
+   if(m) paths = get_random_paths_indexes(paths, n, m, d.indexes);
+   d.shallow = 0;
+   d.X = load_image_augment_paths(paths, n, flip, min, max, size, angle, aspect, hue, saturation, exposure);
+   d.y = load_labels_paths(paths, n, labels, k);
+   if(m) free(paths);
+   return d;
+   }
+ */
+
+// Loader for SUPER_DATA batches: for each of the n images a random crop of
+// (w*scale) x (h*scale) becomes the target row in y, and that crop resized
+// to w x h becomes the input row in X. Each crop is flipped with
+// probability 1/2 before resizing. Rows are sized for 3 channels.
+// When m is non-zero the n paths are first drawn with get_random_paths().
+data load_data_super(char **paths, int n, int m, int w, int h, int scale)
+{
+    if (m) paths = get_random_paths(paths, n, m);
+
+    data batch = {0};
+    batch.shallow = 0;
+
+    batch.X.rows = n;
+    batch.X.vals = (float**)xcalloc(n, sizeof(float*));
+    batch.X.cols = w*h*3;
+
+    batch.y.rows = n;
+    batch.y.vals = (float**)xcalloc(n, sizeof(float*));
+    batch.y.cols = w*scale * h*scale * 3;
+
+    int row;
+    for (row = 0; row < n; ++row) {
+        image full = load_image_color(paths[row], 0, 0);
+        image patch = random_crop_image(full, w*scale, h*scale);
+        if (random_gen() % 2) flip_image(patch);
+        image small = resize_image(patch, w, h);
+
+        // The batch takes over both pixel buffers; only the source image is released.
+        batch.X.vals[row] = small.data;
+        batch.y.vals[row] = patch.data;
+        free_image(full);
+    }
+
+    if (m) free(paths);
+    return batch;
+}
+
+// Classification loader with augmentation.
+//
+// Loads n augmented images (load_image_augment_paths) and their labels
+// (load_labels_paths). When use_mixup is non-zero, with probability 1/2 the
+// batch is additionally combined with one (or, for use_mixup >= 3, three)
+// further randomly drawn batches:
+//   use_mixup == 1  MixUp  - pixels and labels averaged 50/50
+//   use_mixup == 2  CutMix - a random rectangle is pasted from the 2nd batch,
+//                            labels weighted by the rectangle's area share
+//   use_mixup == 3  Mosaic - four quadrants around a random cut point,
+//                            labels weighted by quadrant area
+//   use_mixup == 4  per image, randomly either CutMix or Mosaic
+// In OpenCV builds, use_blur blurs about one image in four (kernel size 15
+// when use_blur == 1, otherwise use_blur itself). show_imgs saves every
+// image, prints its positive class scores and, when equal to 1, also
+// displays it.
+//
+// When m is non-zero the paths are drawn with get_random_paths().
+// Returns the batch with d.shallow == 0.
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int w, int h, float angle,
+    float aspect, float hue, float saturation, float exposure, int use_mixup, int use_blur, int show_imgs, float label_smooth_eps, int dontuse_opencv, int contrastive)
+{
+    char **paths_stored = paths;  // original list, used to draw the extra mix batches
+    if(m) paths = get_random_paths(paths, n, m);
+    data d = {0};
+    d.shallow = 0;
+    d.X = load_image_augment_paths(paths, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure, dontuse_opencv, contrastive);
+    d.y = load_labels_paths(paths, n, labels, k, hierarchy, label_smooth_eps, contrastive);
+
+    if (use_mixup && rand_int(0, 1)) {
+        char **paths_mix = get_random_paths(paths_stored, n, m);
+        data d2 = { 0 };
+        d2.shallow = 0;
+        d2.X = load_image_augment_paths(paths_mix, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure, dontuse_opencv, contrastive);
+        d2.y = load_labels_paths(paths_mix, n, labels, k, hierarchy, label_smooth_eps, contrastive);
+        free(paths_mix);
+
+        // Third and fourth batches are only needed for Mosaic.
+        data d3 = { 0 };
+        d3.shallow = 0;
+        data d4 = { 0 };
+        d4.shallow = 0;
+        if (use_mixup >= 3) {
+            char **paths_mix3 = get_random_paths(paths_stored, n, m);
+            d3.X = load_image_augment_paths(paths_mix3, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure, dontuse_opencv, contrastive);
+            d3.y = load_labels_paths(paths_mix3, n, labels, k, hierarchy, label_smooth_eps, contrastive);
+            free(paths_mix3);
+
+            char **paths_mix4 = get_random_paths(paths_stored, n, m);
+            d4.X = load_image_augment_paths(paths_mix4, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure, dontuse_opencv, contrastive);
+            d4.y = load_labels_paths(paths_mix4, n, labels, k, hierarchy, label_smooth_eps, contrastive);
+            free(paths_mix4);
+        }
+
+
+        // mix
+        int i, j;
+        for (i = 0; i < d2.X.rows; ++i) {
+
+            int mixup = use_mixup;
+            if (use_mixup == 4) mixup = rand_int(2, 3); // alternate CutMix and Mosaic
+
+            // MixUp -----------------------------------
+            if (mixup == 1) {
+                // mix images
+                for (j = 0; j < d2.X.cols; ++j) {
+                    d.X.vals[i][j] = (d.X.vals[i][j] + d2.X.vals[i][j]) / 2.0f;
+                }
+
+                // mix labels
+                for (j = 0; j < d2.y.cols; ++j) {
+                    d.y.vals[i][j] = (d.y.vals[i][j] + d2.y.vals[i][j]) / 2.0f;
+                }
+            }
+            // CutMix -----------------------------------
+            else if (mixup == 2) {
+                // Side-length bounds of the pasted rectangle, as a fraction of the
+                // image side (named so they do not shadow the min/max parameters).
+                const float cut_min = 0.3;  // 0.3*0.3 = 9%
+                const float cut_max = 0.8;  // 0.8*0.8 = 64%
+                const int cut_w = rand_int(w*cut_min, w*cut_max);
+                const int cut_h = rand_int(h*cut_min, h*cut_max);
+                const int cut_x = rand_int(0, w - cut_w - 1);
+                const int cut_y = rand_int(0, h - cut_h - 1);
+                const int left = cut_x;
+                const int right = cut_x + cut_w;
+                const int top = cut_y;
+                const int bot = cut_y + cut_h;
+
+                assert(cut_x >= 0 && cut_x <= w);
+                assert(cut_y >= 0 && cut_y <= h);
+                assert(cut_w >= 0 && cut_w <= w);
+                assert(cut_h >= 0 && cut_h <= h);
+
+                assert(right >= 0 && right <= w);
+                assert(bot >= 0 && bot <= h);
+
+                assert(top <= bot);
+                assert(left <= right);
+
+                // alpha = area share of the pasted rectangle, beta = the rest.
+                const float alpha = (float)(cut_w*cut_h) / (float)(w*h);
+                const float beta = 1 - alpha;
+
+                int c, x, y;
+                for (c = 0; c < 3; ++c) {
+                    for (y = top; y < bot; ++y) {
+                        for (x = left; x < right; ++x) {
+                            int j = x + y*w + c*w*h;
+                            d.X.vals[i][j] = d2.X.vals[i][j];
+                        }
+                    }
+                }
+
+                //printf("\n alpha = %f, beta = %f \n", alpha, beta);
+                // mix labels
+                for (j = 0; j < d.y.cols; ++j) {
+                    d.y.vals[i][j] = d.y.vals[i][j] * beta + d2.y.vals[i][j] * alpha;
+                }
+            }
+            // Mosaic -----------------------------------
+            else if (mixup == 3)
+            {
+                const float min_offset = 0.2; // 20%
+                const int cut_x = rand_int(w*min_offset, w*(1 - min_offset));
+                const int cut_y = rand_int(h*min_offset, h*(1 - min_offset));
+
+                // Area share of each quadrant: s1 top-left, s2 top-right,
+                // s3 bottom-left, s4 bottom-right.
+                float s1 = (float)(cut_x * cut_y) / (w*h);
+                float s2 = (float)((w - cut_x) * cut_y) / (w*h);
+                float s3 = (float)(cut_x * (h - cut_y)) / (w*h);
+                float s4 = (float)((w - cut_x) * (h - cut_y)) / (w*h);
+
+                int c, x, y;
+                for (c = 0; c < 3; ++c) {
+                    for (y = 0; y < h; ++y) {
+                        for (x = 0; x < w; ++x) {
+                            int j = x + y*w + c*w*h;
+                            // The top-left quadrant keeps the pixels already in d.
+                            if (x >= cut_x && y < cut_y) d.X.vals[i][j] = d2.X.vals[i][j];
+                            if (x < cut_x && y >= cut_y) d.X.vals[i][j] = d3.X.vals[i][j];
+                            if (x >= cut_x && y >= cut_y) d.X.vals[i][j] = d4.X.vals[i][j];
+                        }
+                    }
+                }
+
+                for (j = 0; j < d.y.cols; ++j) {
+                    const float max_s = 1;// max_val_cmp(s1, max_val_cmp(s2, max_val_cmp(s3, s4)));
+
+                    d.y.vals[i][j] = d.y.vals[i][j] * s1 / max_s + d2.y.vals[i][j] * s2 / max_s + d3.y.vals[i][j] * s3 / max_s + d4.y.vals[i][j] * s4 / max_s;
+                }
+            }
+        }
+
+        free_data(d2);
+
+        if (use_mixup >= 3) {
+            free_data(d3);
+            free_data(d4);
+        }
+    }
+
+#ifdef OPENCV
+    if (use_blur) {
+        int i;
+        for (i = 0; i < d.X.rows; ++i) {
+            if (random_gen() % 4 == 0) {
+                // Wrap the row in an image header, blur it, then free the old
+                // pixels through that header and keep the blurred buffer.
+                image im = make_empty_image(w, h, 3);
+                im.data = d.X.vals[i];
+                int ksize = use_blur;
+                if (use_blur == 1) ksize = 15;
+                image blurred = blur_image(im, ksize);
+                free_image(im);
+                d.X.vals[i] = blurred.data;
+                //if (i == 0) {
+                //    show_image(im, "Not blurred");
+                //    show_image(blurred, "blurred");
+                //    wait_until_press_key_cv();
+                //}
+            }
+        }
+    }
+#endif  // OPENCV
+
+    if (show_imgs) {
+        int i, j;
+        for (i = 0; i < d.X.rows; ++i) {
+            // Header only: im.data aliases the batch row and must not be freed here.
+            image im = make_empty_image(w, h, 3);
+            im.data = d.X.vals[i];
+            char buff[1000];
+            // NOTE(review): the basecfg() result is not released here - confirm whether it allocates.
+            sprintf(buff, "aug_%d_%s_%d", i, basecfg((char*)paths[i]), random_gen());
+            save_image(im, buff);
+
+            // Print each positive class directly. Accumulating the entries in a
+            // fixed-size stack buffer overflowed once enough classes were positive
+            // (e.g. many classes with label smoothing); the printed text is unchanged.
+            printf("\n Classes: ");
+            for (j = 0; j < d.y.cols; ++j) {
+                if (d.y.vals[i][j] > 0) {
+                    printf(" %d (%f), ", j, d.y.vals[i][j]);
+                }
+            }
+            printf(" \n");
+
+            if (show_imgs == 1) {
+                show_image(im, buff);
+                wait_until_press_key_cv();
+            }
+        }
+        printf("\nYou use flag -show_imgs, so will be saved aug_...jpg images. Click on window and press ESC button \n");
+    }
+
+    if (m) free(paths);
+
+    return d;
+}
+
+// Load n images and their tag labels into a non-shallow data set.
+//   paths, m : when m is non-zero the paths actually loaded are the array returned by
+//              get_random_paths(paths, n, m), which is freed before returning;
+//              when m is zero the caller's array is used directly and left untouched.
+//   k        : forwarded to load_tags_paths() together with the paths.
+//   others   : forwarded unchanged to load_image_augment_paths(); its last two
+//              arguments are fixed to 0 here.
+data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure)
+{
+    char **chosen = paths;
+    if (m) chosen = get_random_paths(paths, n, m);
+
+    data out = {0};
+    out.shallow = 0;
+    out.w = w;
+    out.h = h;
+    out.X = load_image_augment_paths(chosen, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure, 0, 0);
+    out.y = load_tags_paths(chosen, n, k);
+
+    // Only the array obtained from get_random_paths() belongs to this function.
+    if (m) free(chosen);
+    return out;
+}
+
+// Build a matrix whose row-pointer table lists the rows of m1 followed by the rows of m2.
+// Only the table is newly allocated: the row buffers themselves are shared with m1 and m2.
+// The column count is taken from m1.
+matrix concat_matrix(matrix m1, matrix m2)
+{
+    matrix joined;
+    joined.cols = m1.cols;
+    joined.rows = m1.rows+m2.rows;
+    joined.vals = (float**)xcalloc(m1.rows + m2.rows, sizeof(float*));
+
+    float **slot = joined.vals;   // next free entry of the new table
+    int r;
+    for (r = 0; r < m1.rows; ++r) *slot++ = m1.vals[r];
+    for (r = 0; r < m2.rows; ++r) *slot++ = m2.vals[r];
+    return joined;
+}
+
+// Concatenate two data sets row-wise (d1 first, then d2) for both X and y.
+// The result is marked shallow: its matrices come from concat_matrix(), which
+// reuses the row buffers of d1 and d2.
+data concat_data(data d1, data d2)
+{
+    data merged = {0};
+    merged.X = concat_matrix(d1.X, d2.X);
+    merged.y = concat_matrix(d1.y, d2.y);
+    merged.shallow = 1;
+    return merged;
+}
+
+// Fold n data sets into one via concat_data().
+// Each step places d[i] in front of what has been accumulated so far, so the
+// rows of the result appear in reverse input order (d[n-1] first, d[0] last).
+// The previous accumulator is handed to free_data() after every step.
+data concat_datas(data *d, int n)
+{
+    data acc = {0};
+    int idx = 0;
+    while (idx < n) {
+        data grown = concat_data(d[idx], acc);
+        free_data(acc);
+        acc = grown;
+        ++idx;
+    }
+    return acc;
+}
+
+// Load a CSV file as a classification data set.
+//   filename : CSV file handed to csv_to_matrix()
+//   target   : column removed from the feature matrix with pop_column() and used as the label
+//   k        : number of columns of the label matrix produced by one_hot_encode()
+// Returns a non-shallow data whose X is the remaining matrix and whose y has X.rows x k entries.
+data load_categorical_data_csv(char *filename, int target, int k)
+{
+    matrix features = csv_to_matrix(filename);
+    float *labels = pop_column(&features, target);
+
+    matrix encoded;
+    encoded.rows = features.rows;
+    encoded.cols = k;
+    encoded.vals = one_hot_encode(labels, features.rows, k);
+    free(labels);   // the raw label column is no longer needed once encoded
+
+    data d = {0};
+    d.shallow = 0;
+    d.X = features;
+    d.y = encoded;
+    return d;
+}
+
+// Load one CIFAR-10 binary batch file (10000 records) into a data set.
+// Each record is read as 3073 bytes: byte 0 is the class id, the following 3072
+// bytes are the pixel values. Pixels are stored as floats and scaled by 1/255;
+// the label matrix gets a 1 in the column of the class id.
+// Terminates the program with a message on stderr if the file is shorter than
+// expected or contains a class id outside the label matrix.
+data load_cifar10_data(char *filename)
+{
+    data d = {0};
+    d.shallow = 0;
+    long i,j;
+    matrix X = make_matrix(10000, 3072);
+    matrix y = make_matrix(10000, 10);
+    d.X = X;
+    d.y = y;
+
+    FILE *fp = fopen(filename, "rb");
+    if(!fp) file_error(filename);
+    for(i = 0; i < 10000; ++i){
+        unsigned char bytes[3073];
+        // A short read would otherwise leave stale stack bytes in the record.
+        if(fread(bytes, 1, sizeof(bytes), fp) != sizeof(bytes)){
+            fprintf(stderr, "CIFAR-10 file %s is truncated at record %ld\n", filename, i);
+            fclose(fp);
+            exit(EXIT_FAILURE);
+        }
+        int class_id = bytes[0];
+        // The class id indexes a row of y; reject anything that would write out of bounds.
+        if(class_id >= y.cols){
+            fprintf(stderr, "CIFAR-10 file %s: invalid class id %d in record %ld\n", filename, class_id, i);
+            fclose(fp);
+            exit(EXIT_FAILURE);
+        }
+        y.vals[i][class_id] = 1;
+        for(j = 0; j < X.cols; ++j){
+            X.vals[i][j] = (double)bytes[j+1];
+        }
+    }
+    //translate_data_rows(d, -128);
+    scale_data_rows(d, 1./255);
+    //normalize_data_rows(d);
+    fclose(fp);
+    return d;
+}
+
+// Fill X and y with n rows of d picked at random (with replacement).
+// Row j of the batch is written at X + j*d.X.cols and y + j*d.y.cols; the same
+// random row index is used for the input and its label.
+void get_random_batch(data d, int n, float *X, float *y)
+{
+    float *x_out = X;
+    float *y_out = y;
+    int remaining;
+    for (remaining = n; remaining > 0; --remaining) {
+        int row = random_gen()%d.X.rows;
+        memcpy(x_out, d.X.vals[row], d.X.cols*sizeof(float));
+        memcpy(y_out, d.y.vals[row], d.y.cols*sizeof(float));
+        x_out += d.X.cols;
+        y_out += d.y.cols;
+    }
+}
+
+// Copy n consecutive rows of d, starting at row `offset`, into the flat buffers X and y.
+// Row j of the batch is written at X + j*d.X.cols and y + j*d.y.cols.
+// No bounds check is made: the caller must ensure offset + n <= number of rows.
+void get_next_batch(data d, int n, int offset, float *X, float *y)
+{
+    float *x_out = X;
+    float *y_out = y;
+    int row;
+    for (row = offset; row < offset + n; ++row) {
+        memcpy(x_out, d.X.vals[row], d.X.cols*sizeof(float));
+        memcpy(y_out, d.y.vals[row], d.y.cols*sizeof(float));
+        x_out += d.X.cols;
+        y_out += d.y.cols;
+    }
+}
+
+// Label smoothing applied in place to every entry of d.y:
+//   y = eps/cols + (1 - eps) * y     with eps = 0.1
+void smooth_data(data d)
+{
+    const float eps = .1;
+    const float scale = 1. / d.y.cols;
+    int r, c;
+    for (r = 0; r < d.y.rows; ++r) {
+        float *labels = d.y.vals[r];
+        for (c = 0; c < d.y.cols; ++c) {
+            labels[c] = eps * scale + (1-eps) * labels[c];
+        }
+    }
+}
+
+// Load the five CIFAR-10 training batches
+// (data/cifar/cifar-10-batches-bin/data_batch_1.bin .. data_batch_5.bin)
+// into a single 50000-row data set.
+// Each record is read as 3073 bytes: byte 0 is the class id, the following 3072
+// bytes are the pixel values. Pixels are scaled by 1/255 and the labels are
+// passed through smooth_data().
+// Terminates the program with a message on stderr if a batch file is shorter
+// than expected or contains a class id outside the label matrix.
+data load_all_cifar10()
+{
+    data d = {0};
+    d.shallow = 0;
+    int i,j,b;
+    matrix X = make_matrix(50000, 3072);
+    matrix y = make_matrix(50000, 10);
+    d.X = X;
+    d.y = y;
+
+
+    for(b = 0; b < 5; ++b){
+        char buff[256];
+        sprintf(buff, "data/cifar/cifar-10-batches-bin/data_batch_%d.bin", b+1);
+        FILE *fp = fopen(buff, "rb");
+        if(!fp) file_error(buff);
+        for(i = 0; i < 10000; ++i){
+            unsigned char bytes[3073];
+            // A short read would otherwise leave stale stack bytes in the record.
+            if(fread(bytes, 1, sizeof(bytes), fp) != sizeof(bytes)){
+                fprintf(stderr, "CIFAR-10 file %s is truncated at record %d\n", buff, i);
+                fclose(fp);
+                exit(EXIT_FAILURE);
+            }
+            int class_id = bytes[0];
+            // The class id indexes a row of y; reject anything that would write out of bounds.
+            if(class_id >= y.cols){
+                fprintf(stderr, "CIFAR-10 file %s: invalid class id %d in record %d\n", buff, class_id, i);
+                fclose(fp);
+                exit(EXIT_FAILURE);
+            }
+            y.vals[i+b*10000][class_id] = 1;
+            for(j = 0; j < X.cols; ++j){
+                X.vals[i+b*10000][j] = (double)bytes[j+1];
+            }
+        }
+        fclose(fp);
+    }
+    //normalize_data_rows(d);
+    //translate_data_rows(d, -128);
+    scale_data_rows(d, 1./255);
+    smooth_data(d);
+    return d;
+}
+
+// Load a Go data set from a text file made of two-line records:
+//   line 1: "<row> <col>"  - the move, stored one-hot in y at index row*19 + col
+//   line 2: board string   - the first 361 characters are read; '1' -> 1, '2' -> -1, anything else -> 0
+// The matrices start with 3363059 rows, are doubled when full and trimmed to the
+// number of records actually stored before returning.
+// A record whose board line is missing ends the load; a record with an unparsable
+// or out-of-range move, or with a board line shorter than 361 characters, is
+// skipped with a warning instead of corrupting memory.
+data load_go(char *filename)
+{
+    FILE *fp = fopen(filename, "rb");
+    matrix X = make_matrix(3363059, 361);
+    matrix y = make_matrix(3363059, 361);
+    int row, col;
+
+    if(!fp) file_error(filename);
+    char *label;
+    int count = 0;
+    int record = 0;     // records seen so far, including skipped ones (for diagnostics)
+    while((label = fgetl(fp))){
+        int i;
+        char *board = fgetl(fp);
+        ++record;
+
+        // Label line without its board line: the file ends mid-record.
+        if(!board){
+            fprintf(stderr, "load_go: %s ends inside record %d\n", filename, record);
+            free(label);
+            break;
+        }
+
+        // Validate before anything is used as an index or read at a fixed length.
+        if(sscanf(label, "%d %d", &row, &col) != 2
+                || row < 0 || row >= 19 || col < 0 || col >= 19
+                || strlen(board) < 19*19){
+            fprintf(stderr, "load_go: skipping malformed record %d in %s\n", record, filename);
+            free(label);
+            free(board);
+            continue;
+        }
+
+        if(count == X.rows){
+            X = resize_matrix(X, count*2);
+            y = resize_matrix(y, count*2);
+        }
+
+        int index = row*19 + col;
+        y.vals[count][index] = 1;
+
+        for(i = 0; i < 19*19; ++i){
+            float val = 0;
+            if(board[i] == '1') val = 1;
+            else if(board[i] == '2') val = -1;
+            X.vals[count][i] = val;
+        }
+        ++count;
+        free(label);
+        free(board);
+    }
+    X = resize_matrix(X, count);
+    y = resize_matrix(y, count);
+
+    data d = {0};
+    d.shallow = 0;
+    d.X = X;
+    d.y = y;
+
+
+    fclose(fp);
+
+    return d;
+}
+
+
+// Shuffle the rows of d in place. X and y row pointers are swapped with the
+// same indices, so every input keeps its label. Only pointers move; row
+// contents are not copied.
+void randomize_data(data d)
+{
+    int i;
+    for(i = d.X.rows-1; i > 0; --i){
+        // Fisher-Yates: the partner index must be drawn from [0, i] inclusive.
+        // Drawing from [0, i-1] (the previous `% i`) can never leave a row in
+        // place and only produces cyclic permutations, not a uniform shuffle.
+        int index = random_gen()%(i+1);
+        float *swap = d.X.vals[index];
+        d.X.vals[index] = d.X.vals[i];
+        d.X.vals[i] = swap;
+
+        swap = d.y.vals[index];
+        d.y.vals[index] = d.y.vals[i];
+        d.y.vals[i] = swap;
+    }
+}
+
+// Apply scale_array(row, d.X.cols, s) to every row of d.X (labels are untouched).
+void scale_data_rows(data d, float s)
+{
+    int row = 0;
+    while (row < d.X.rows) {
+        scale_array(d.X.vals[row], d.X.cols, s);
+        ++row;
+    }
+}
+
+// Apply translate_array(row, d.X.cols, s) to every row of d.X (labels are untouched).
+void translate_data_rows(data d, float s)
+{
+    int row = 0;
+    while (row < d.X.rows) {
+        translate_array(d.X.vals[row], d.X.cols, s);
+        ++row;
+    }
+}
+
+// Apply normalize_array(row, d.X.cols) to every row of d.X (labels are untouched).
+void normalize_data_rows(data d)
+{
+    int row = 0;
+    while (row < d.X.rows) {
+        normalize_array(d.X.vals[row], d.X.cols);
+        ++row;
+    }
+}
+
+// Return the `part`-th of `total` contiguous slices of d as a shallow view.
+// Slice boundaries are rows*part/total and rows*(part+1)/total (integer division),
+// computed independently for X and y. Nothing is allocated: the view points
+// into d's own row-pointer tables.
+data get_data_part(data d, int part, int total)
+{
+    const int x_first = d.X.rows * part / total;
+    const int x_next  = d.X.rows * (part + 1) / total;
+    const int y_first = d.y.rows * part / total;
+    const int y_next  = d.y.rows * (part + 1) / total;
+
+    data slice = {0};
+    slice.shallow = 1;
+
+    slice.X.rows = x_next - x_first;
+    slice.X.cols = d.X.cols;
+    slice.X.vals = d.X.vals + x_first;
+
+    slice.y.rows = y_next - y_first;
+    slice.y.cols = d.y.cols;
+    slice.y.vals = d.y.vals + y_first;
+    return slice;
+}
+
+// Draw `num` rows of d at random (with replacement) into a shallow data set.
+// New row-pointer tables are allocated for X and y, but the rows they point to
+// are the ones owned by d.
+data get_random_data(data d, int num)
+{
+    data sample = {0};
+    sample.shallow = 1;
+
+    sample.X.rows = num;
+    sample.X.cols = d.X.cols;
+    sample.X.vals = (float**)xcalloc(num, sizeof(float*));
+
+    sample.y.rows = num;
+    sample.y.cols = d.y.cols;
+    sample.y.vals = (float**)xcalloc(num, sizeof(float*));
+
+    int slot;
+    for (slot = 0; slot < num; ++slot) {
+        // One draw per slot, shared by X and y so inputs stay paired with labels.
+        int pick = random_gen()%d.X.rows;
+        sample.X.vals[slot] = d.X.vals[pick];
+        sample.y.vals[slot] = d.y.vals[pick];
+    }
+    return sample;
+}
+
+// Split d into a train/test pair for fold `part` of `total`.
+// Rows [start, end) with start = part*rows/total and end = (part+1)*rows/total
+// form the test set; all other rows, in their original order, form the train set.
+// Returns a newly allocated array of two shallow data: [0] = train, [1] = test.
+// The row-pointer tables are new, the rows themselves still belong to d.
+data *split_data(data d, int part, int total)
+{
+    const int start = part*d.X.rows/total;
+    const int end = (part+1)*d.X.rows/total;
+    const int held_out = end - start;
+    int i;
+
+    data* split = (data*)xcalloc(2, sizeof(data));
+    data train = {0};
+    data test = {0};
+    train.shallow = 1;
+    test.shallow = 1;
+
+    test.X.rows = held_out;
+    test.y.rows = held_out;
+    train.X.rows = d.X.rows - held_out;
+    train.y.rows = d.X.rows - held_out;
+
+    test.X.cols = d.X.cols;
+    train.X.cols = d.X.cols;
+    test.y.cols = d.y.cols;
+    train.y.cols = d.y.cols;
+
+    train.X.vals = (float**)xcalloc(train.X.rows, sizeof(float*));
+    test.X.vals = (float**)xcalloc(test.X.rows, sizeof(float*));
+    train.y.vals = (float**)xcalloc(train.y.rows, sizeof(float*));
+    test.y.vals = (float**)xcalloc(test.y.rows, sizeof(float*));
+
+    // Rows before the held-out window keep their index in the train set.
+    for (i = 0; i < start; ++i) {
+        train.X.vals[i] = d.X.vals[i];
+        train.y.vals[i] = d.y.vals[i];
+    }
+    // The window itself becomes the test set.
+    for (i = start; i < end; ++i) {
+        test.X.vals[i-start] = d.X.vals[i];
+        test.y.vals[i-start] = d.y.vals[i];
+    }
+    // Rows after the window are shifted down by the window size.
+    for (i = end; i < d.X.rows; ++i) {
+        train.X.vals[i-held_out] = d.X.vals[i];
+        train.y.vals[i-held_out] = d.y.vals[i];
+    }
+
+    split[0] = train;
+    split[1] = test;
+    return split;
+}
diff --git a/darknet-master/src/data.h b/darknet-master/src/data.h
new file mode 100644
index 0000000..9f12343
--- /dev/null
+++ b/darknet-master/src/data.h
@@ -0,0 +1,125 @@
+#ifndef DATA_H
+#define DATA_H
+#include <pthread.h>
+
+#include "darknet.h"
+#include "darknet.h"
+#include "matrix.h"
+#include "list.h"
+#include "image.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "tree.h"
+
+// Map a coordinate x in a range of length `max` to a weight that grows with the
+// distance from the centre max/2:  2 * (max/2 + 1 - |max/2 - x|) / max,
+// clamped to at most 1. max/2 uses integer division.
+static inline float distance_from_edge(int x, int max)
+{
+    const int half = max/2;
+    int offset = half - x;
+    if (offset < 0) offset = -offset;
+
+    const int doubled = 2 * (half + 1 - offset);
+    const float dist = (float)doubled/max;
+    return (dist > 1) ? 1 : dist;
+}
+
+//typedef struct{
+//    int w, h;
+//    matrix X;
+//    matrix y;
+//    int shallow;
+//    int *num_boxes;
+//    box **boxes;
+//} data;
+
+//typedef enum {
+//    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, LETTERBOX_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA
+//} data_type;
+/*
+typedef struct load_args{
+    int threads;
+    char **paths;
+    char *path;
+    int n;
+    int m;
+    char **labels;
+    int h;
+    int w;
+    int c; // color depth
+    int out_w;
+    int out_h;
+    int nh;
+    int nw;
+    int num_boxes;
+    int min, max, size;
+    int classes;
+    int background;
+    int scale;
+    int small_object;
+    float jitter;
+    int flip;
+    float angle;
+    float aspect;
+    float saturation;
+    float exposure;
+    float hue;
+    data *d;
+    image *im;
+    image *resized;
+    data_type type;
+    tree *hierarchy;
+} load_args;
+
+typedef struct{
+    int id;
+    float x,y,w,h;
+    float left, right, top, bottom;
+} box_label;
+
+void free_data(data d);
+
+pthread_t load_data(load_args args);
+
+pthread_t load_data_in_thread(load_args args);
+*/
+// ---- Public interface of data.c ------------------------------------------
+void print_letters(float *pred, int n);
+
+// Image data-set loaders.
+data load_data_captcha(char **paths, int n, int m, int k, int w, int h);
+data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
+data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
+data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int truth_size, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup,
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int contrastive, int contrastive_jit_flip, int contrastive_color, int show_imgs);
+// Loads n images plus tag labels; when m is non-zero the paths come from get_random_paths().
+data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure);
+matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure, int dontuse_opencv, int contrastive);
+data load_data_super(char **paths, int n, int m, int w, int h, int scale);
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int w, int h, float angle,
+    float aspect, float hue, float saturation, float exposure, int use_mixup, int use_blur, int show_imgs, float label_smooth_eps, int dontuse_opencv, int contrastive);
+// Reads two-line Go records (move line, board line) into 361-column X and y matrices.
+data load_go(char *filename);
+
+box_label *read_boxes(char *filename, int *n);
+// CIFAR-10 binary loaders: one batch file (10000 rows) / the five training batches (50000 rows).
+data load_cifar10_data(char *filename);
+data load_all_cifar10();
+
+data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int out_h);
+
+list *get_paths(char *filename);
+char **get_labels(char *filename);
+char **get_labels_custom(char *filename, int *size);
+
+// Batch extraction: copy n rows of d into the flat buffers X and y
+// (random rows with replacement / consecutive rows starting at offset).
+void get_random_batch(data d, int n, float *X, float *y);
+// Shallow views and samples: the returned data shares its rows with d.
+data get_data_part(data d, int part, int total);
+data get_random_data(data d, int num);
+void get_next_batch(data d, int n, int offset, float *X, float *y);
+data load_categorical_data_csv(char *filename, int target, int k);
+
+// In-place transforms of the rows of d.X.
+void normalize_data_rows(data d);
+void scale_data_rows(data d, float s);
+void translate_data_rows(data d, float s);
+// Shuffles the X and y row pointers of d together.
+void randomize_data(data d);
+// Returns a newly allocated array of two shallow data: [0] = train, [1] = test.
+data *split_data(data d, int part, int total);
+// Row-wise concatenation; results are shallow (rows shared with the inputs).
+data concat_data(data d1, data d2);
+data concat_datas(data *d, int n);
+void fill_truth(char *path, char **labels, int k, float *truth);
+void fill_truth_smooth(char *path, char **labels, int k, float *truth, float label_smooth_eps);
+#ifdef __cplusplus
+}
+
+#endif
+#endif
diff --git a/darknet-master/src/deconvolutional_kernels.cu b/darknet-master/src/deconvolutional_kernels.cu
new file mode 100644
index 0000000..6af65eb
--- /dev/null
+++ b/darknet-master/src/deconvolutional_kernels.cu
@@ -0,0 +1,106 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "convolutional_layer.h"
+#include "deconvolutional_layer.h"
+#include "gemm.h"
+#include "blas.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "utils.h"
+#include "dark_cuda.h"
+
+// GPU forward pass of the deconvolutional layer.
+// For every batch item: multiply the weights with the input through gemm_ongpu()
+// into col_image_gpu, then scatter the columns into the output with
+// col2im_ongpu(). Biases are added and the activation applied once for the
+// whole batch afterwards.
+// state.input is handed to gemm_ongpu() and is therefore expected to be a device pointer.
+extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+{
+    int i;
+    int out_h = deconvolutional_out_height(layer);
+    int out_w = deconvolutional_out_width(layer);
+    int size = out_h*out_w;
+
+    int m = layer.size*layer.size*layer.n;
+    int n = layer.h*layer.w;
+    int k = layer.c;
+
+    // col2im_ongpu() is preceded by an explicit clear of the whole output buffer.
+    fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1);
+
+    for(i = 0; i < layer.batch; ++i){
+        float *a = layer.weights_gpu;
+        float *b = state.input + i*layer.c*layer.h*layer.w;
+        float *c = layer.col_image_gpu;
+
+        gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
+
+        col2im_ongpu(c, layer.n, out_h, out_w, layer.size, layer.stride, 0, layer.output_gpu+i*layer.n*size);
+    }
+    add_bias_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, size);
+    // output_gpu is a device buffer, so the device variant of activate_array is
+    // required; the host routine would dereference a device pointer on the CPU.
+    activate_array_ongpu(layer.output_gpu, layer.batch*layer.n*size, layer.activation);
+}
+
+// GPU backward pass of the deconvolutional layer.
+//  1. scale delta_gpu by the activation gradient and accumulate the bias updates;
+//  2. per batch item, im2col the output delta into col_image_gpu and accumulate
+//     the weight updates (scaled by 1/batch) with gemm_ongpu();
+//  3. if state.delta is set, clear it once and accumulate weights * columns
+//     into it for every batch item.
+// Every buffer touched here is handed to *_gpu / *_ongpu routines, so state.input
+// and state.delta are expected to be device pointers as well.
+extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+{
+    float alpha = 1./layer.batch;
+    int out_h = deconvolutional_out_height(layer);
+    int out_w = deconvolutional_out_width(layer);
+    int size = out_h*out_w;
+    int i;
+
+    // Device variants throughout: the host gradient_array/backward_bias would
+    // dereference device pointers, and the bias step must read delta_gpu, not
+    // the host-side layer.delta.
+    gradient_array_ongpu(layer.output_gpu, size*layer.n*layer.batch, layer.activation, layer.delta_gpu);
+    backward_bias_gpu(layer.bias_updates_gpu, layer.delta_gpu, layer.batch, layer.n, size);
+
+    // memset() cannot clear device memory; use the GPU fill already used by the forward pass.
+    if(state.delta) fill_ongpu(layer.batch*layer.h*layer.w*layer.c, 0, state.delta, 1);
+
+    for(i = 0; i < layer.batch; ++i){
+        int m = layer.c;
+        int n = layer.size*layer.size*layer.n;
+        int k = layer.h*layer.w;
+
+        // One input image holds c*h*w = m*k floats (the same stride the forward
+        // pass uses); the previous i*m*n offset was only right for batch item 0.
+        float *a = state.input + i*m*k;
+        float *b = layer.col_image_gpu;
+        float *c = layer.weight_updates_gpu;
+
+        im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w,
+                layer.size, layer.stride, 0, b);
+        gemm_ongpu(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+
+        if(state.delta){
+            int m = layer.c;
+            int n = layer.h*layer.w;
+            int k = layer.size*layer.size*layer.n;
+
+            float *a = layer.weights_gpu;
+            float *b = layer.col_image_gpu;
+            float *c = state.delta + i*n*m;
+
+            // All three operands live on the device, so the GPU gemm is required.
+            gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
+    }
+}
+
+// Transfer weights, biases and their update buffers between the *_gpu arrays
+// and their host counterparts with cuda_pull_array() (presumably device -> host).
+// Weight buffers hold c*n*size*size floats, bias buffers n floats.
+extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer)
+{
+    const int weight_count = layer.c*layer.n*layer.size*layer.size;
+    const int bias_count = layer.n;
+
+    cuda_pull_array(layer.weights_gpu, layer.weights, weight_count);
+    cuda_pull_array(layer.biases_gpu, layer.biases, bias_count);
+    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, weight_count);
+    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, bias_count);
+}
+
+// Transfer weights, biases and their update buffers between the host arrays
+// and their *_gpu counterparts with cuda_push_array() (presumably host -> device).
+// Weight buffers hold c*n*size*size floats, bias buffers n floats.
+extern "C" void push_deconvolutional_layer(deconvolutional_layer layer)
+{
+    const int weight_count = layer.c*layer.n*layer.size*layer.size;
+    const int bias_count = layer.n;
+
+    cuda_push_array(layer.weights_gpu, layer.weights, weight_count);
+    cuda_push_array(layer.biases_gpu, layer.biases, bias_count);
+    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, weight_count);
+    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, bias_count);
+}
+
+// Apply the accumulated updates to the GPU copies of the biases and weights.
+// Assuming the usual BLAS meaning of axpy (Y += a*X) and scal (X *= a):
+//   biases         += learning_rate * bias_updates;    bias_updates   *= momentum
+//   weight_updates += -decay * weights
+//   weights        += learning_rate * weight_updates;  weight_updates *= momentum
+// `skip` is accepted for interface compatibility and not used.
+extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, int skip, float learning_rate, float momentum, float decay)
+{
+    const int weight_count = layer.size*layer.size*layer.c*layer.n;
+    const int bias_count = layer.n;
+
+    // Biases.
+    axpy_ongpu(bias_count, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
+    scal_ongpu(bias_count, momentum, layer.bias_updates_gpu, 1);
+
+    // Weights: decay is folded into the update before it is applied.
+    axpy_ongpu(weight_count, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+    axpy_ongpu(weight_count, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
+    scal_ongpu(weight_count, momentum, layer.weight_updates_gpu, 1);
+}
diff --git a/darknet-master/src/deconvolutional_layer.c b/darknet-master/src/deconvolutional_layer.c
new file mode 100644
index 0000000..4f4e4cc
--- /dev/null
+++ b/darknet-master/src/deconvolutional_layer.c
@@ -0,0 +1,203 @@
+#include "deconvolutional_layer.h"
+#include "convolutional_layer.h"
+#include "utils.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "blas.h"
+#include "gemm.h"
+#include <stdio.h>
+#include <time.h>
+
+// Output height of the layer: stride * (h - 1) + size.
+int deconvolutional_out_height(deconvolutional_layer l)
+{
+    return l.stride*(l.h - 1) + l.size;
+}
+
+// Output width of the layer: stride * (w - 1) + size.
+int deconvolutional_out_width(deconvolutional_layer l)
+{
+    return l.stride*(l.w - 1) + l.size;
+}
+
+// Number of spatial positions in one output channel (out_height * out_width).
+int deconvolutional_out_size(deconvolutional_layer l)
+{
+    const int out_h = deconvolutional_out_height(l);
+    const int out_w = deconvolutional_out_width(l);
+    return out_h * out_w;
+}
+
+// Wrap the layer's output buffer as an image of out_width x out_height x n
+// via float_to_image().
+image get_deconvolutional_image(deconvolutional_layer l)
+{
+    const int out_h = deconvolutional_out_height(l);
+    const int out_w = deconvolutional_out_width(l);
+    return float_to_image(out_w, out_h, l.n, l.output);
+}
+
+// Wrap the layer's delta buffer as an image of out_width x out_height x n
+// via float_to_image().
+image get_deconvolutional_delta(deconvolutional_layer l)
+{
+    const int out_h = deconvolutional_out_height(l);
+    const int out_w = deconvolutional_out_width(l);
+    return float_to_image(out_w, out_h, l.n, l.delta);
+}
+
+// Create a deconvolutional layer.
+//   batch      : number of images per batch
+//   h, w, c    : input height, width and channel count
+//   n          : number of filters (= output channels)
+//   size       : filter side length
+//   stride     : stride used in the output-size formula stride*(dim-1)+size
+//   activation : activation stored in the layer
+// Weights are drawn as rand_normal() scaled by 1/sqrt(size*size*c); every bias
+// starts at that same scale value. Host buffers are allocated with xcalloc();
+// with GPU defined, matching device buffers are created with cuda_make_array().
+deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+{
+    int i;
+    deconvolutional_layer l = { (LAYER_TYPE)0 };
+    l.type = DECONVOLUTIONAL;
+
+    // Geometry.
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.n = n;
+    l.size = size;
+    l.stride = stride;
+
+    // Parameters and their update buffers.
+    const int weight_count = c * n * size * size;
+    l.weights = (float*)xcalloc(weight_count, sizeof(float));
+    l.weight_updates = (float*)xcalloc(weight_count, sizeof(float));
+    l.biases = (float*)xcalloc(n, sizeof(float));
+    l.bias_updates = (float*)xcalloc(n, sizeof(float));
+
+    float scale = 1./sqrt(size*size*c);
+    for (i = 0; i < weight_count; ++i) {
+        l.weights[i] = scale*rand_normal();
+    }
+    for (i = 0; i < n; ++i) {
+        l.biases[i] = scale;
+    }
+
+    // Output geometry follows from the fields set above.
+    int out_h = deconvolutional_out_height(l);
+    int out_w = deconvolutional_out_width(l);
+
+    l.out_h = out_h;
+    l.out_w = out_w;
+    l.out_c = n;
+    l.outputs = l.out_w * l.out_h * l.out_c;
+    l.inputs = l.w * l.h * l.c;
+
+    // Work and result buffers.
+    const int col_count = h * w * size * size * n;
+    const int out_count = l.batch * out_h * out_w * n;
+    l.col_image = (float*)xcalloc(col_count, sizeof(float));
+    l.output = (float*)xcalloc(out_count, sizeof(float));
+    l.delta = (float*)xcalloc(out_count, sizeof(float));
+
+    l.forward = forward_deconvolutional_layer;
+    l.backward = backward_deconvolutional_layer;
+    l.update = update_deconvolutional_layer;
+
+    #ifdef GPU
+    l.weights_gpu = cuda_make_array(l.weights, weight_count);
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, weight_count);
+
+    l.biases_gpu = cuda_make_array(l.biases, n);
+    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+
+    l.col_image_gpu = cuda_make_array(l.col_image, col_count);
+    l.delta_gpu = cuda_make_array(l.delta, out_count);
+    l.output_gpu = cuda_make_array(l.output, out_count);
+    #endif
+
+    l.activation = activation;
+
+    fprintf(stderr, "Deconvolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+
+    return l;
+}
+
+// Resize the layer for a new input height/width.
+// Updates the input and output geometry and re-allocates the buffers whose size
+// depends on it (col_image, output, delta and, with GPU defined, their device
+// copies). Weights and biases do not depend on h/w and are left untouched.
+void resize_deconvolutional_layer(deconvolutional_layer *l, int h, int w)
+{
+    l->h = h;
+    l->w = w;
+    int out_h = deconvolutional_out_height(*l);
+    int out_w = deconvolutional_out_width(*l);
+
+    // Keep the derived fields in sync: the forward pass clears
+    // l.outputs*l.batch floats, so a stale value would clear the wrong amount.
+    l->out_h = out_h;
+    l->out_w = out_w;
+    l->outputs = out_h * out_w * l->n;
+    l->inputs = l->h * l->w * l->c;
+
+    // col_image holds (size*size*n) x (h*w) floats: that is what the forward gemm
+    // and the backward im2col write, and how make_deconvolutional_layer() sizes it.
+    // The previous out_h*out_w*size*size*c could be too small when n exceeds c.
+    const int col_count = h*w*l->size*l->size*l->n;
+    const int out_count = l->batch*out_h*out_w*l->n;
+
+    l->col_image = (float*)xrealloc(l->col_image, (size_t)col_count*sizeof(float));
+    l->output = (float*)xrealloc(l->output, (size_t)out_count*sizeof(float));
+    l->delta = (float*)xrealloc(l->delta, (size_t)out_count*sizeof(float));
+    #ifdef GPU
+    cuda_free(l->col_image_gpu);
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+
+    l->col_image_gpu = cuda_make_array(l->col_image, col_count);
+    l->delta_gpu = cuda_make_array(l->delta, out_count);
+    l->output_gpu = cuda_make_array(l->output, out_count);
+    #endif
+}
+
+// CPU forward pass of the deconvolutional layer.
+// The output is cleared, then for every batch item the weights are multiplied
+// with the input through gemm() into col_image and the columns are scattered
+// into the output with col2im_cpu(). Biases and the activation are applied to
+// the whole batch at the end.
+void forward_deconvolutional_layer(const deconvolutional_layer l, network_state state)
+{
+    const int out_h = deconvolutional_out_height(l);
+    const int out_w = deconvolutional_out_width(l);
+    const int out_area = out_h*out_w;
+
+    // gemm dimensions: (size*size*n) x (h*w) result from a c-deep product.
+    const int m = l.size*l.size*l.n;
+    const int n = l.h*l.w;
+    const int k = l.c;
+
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+
+    int item;
+    for (item = 0; item < l.batch; ++item) {
+        float *input = state.input + item*l.c*l.h*l.w;
+        float *columns = l.col_image;
+
+        gemm(1,0,m,n,k,1,l.weights,m,input,n,0,columns,n);
+
+        col2im_cpu(columns, l.n, out_h, out_w, l.size, l.stride, 0, l.output+item*l.n*out_area);
+    }
+
+    add_bias(l.output, l.biases, l.batch, l.n, out_area);
+    activate_array(l.output, l.batch*l.n*out_area, l.activation);
+}
+
+// CPU backward pass of the deconvolutional layer.
+//  1. scale l.delta by the activation gradient and accumulate the bias updates;
+//  2. per batch item, im2col the output delta into col_image and accumulate the
+//     weight updates (scaled by 1/batch) with gemm();
+//  3. if state.delta is set, accumulate weights * columns into it.
+// state.delta is only added to here, never cleared.
+void backward_deconvolutional_layer(deconvolutional_layer l, network_state state)
+{
+    float alpha = 1./l.batch;
+    int out_h = deconvolutional_out_height(l);
+    int out_w = deconvolutional_out_width(l);
+    int size = out_h*out_w;
+    int i;
+
+    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
+    backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);
+
+    for(i = 0; i < l.batch; ++i){
+        int m = l.c;
+        int n = l.size*l.size*l.n;
+        int k = l.h*l.w;
+
+        // One input image holds c*h*w = m*k floats, the same stride the forward
+        // pass uses for state.input. The previous i*m*n offset was only correct
+        // for the first batch item and read the wrong memory for the others.
+        float *a = state.input + i*m*k;
+        float *b = l.col_image;
+        float *c = l.weight_updates;
+
+        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
+                l.size, l.stride, 0, b);
+        gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+
+        if(state.delta){
+            int m = l.c;
+            int n = l.h*l.w;
+            int k = l.size*l.size*l.n;
+
+            float *a = l.weights;
+            float *b = l.col_image;
+            float *c = state.delta + i*n*m;
+
+            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
+    }
+}
+
+// Apply the accumulated updates to the biases and weights on the CPU.
+// Assuming the usual BLAS meaning of axpy (Y += a*X) and scal (X *= a):
+//   biases         += learning_rate * bias_updates;    bias_updates   *= momentum
+//   weight_updates += -decay * weights
+//   weights        += learning_rate * weight_updates;  weight_updates *= momentum
+// `skip` is accepted for interface compatibility and not used.
+void update_deconvolutional_layer(deconvolutional_layer l, int skip, float learning_rate, float momentum, float decay)
+{
+    const int weight_count = l.size*l.size*l.c*l.n;
+    const int bias_count = l.n;
+
+    // Biases.
+    axpy_cpu(bias_count, learning_rate, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(bias_count, momentum, l.bias_updates, 1);
+
+    // Weights: decay is folded into the update before it is applied.
+    axpy_cpu(weight_count, -decay, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(weight_count, learning_rate, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(weight_count, momentum, l.weight_updates, 1);
+}
diff --git a/darknet-master/src/deconvolutional_layer.h b/darknet-master/src/deconvolutional_layer.h
new file mode 100644
index 0000000..bb15a42
--- /dev/null
+++ b/darknet-master/src/deconvolutional_layer.h
@@ -0,0 +1,40 @@
+#ifndef DECONVOLUTIONAL_LAYER_H
+#define DECONVOLUTIONAL_LAYER_H
+
+#include "dark_cuda.h"
+#include "image.h"
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+// A deconvolutional layer is a plain `layer`; the alias only documents intent.
+typedef layer deconvolutional_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef GPU
+// GPU entry points, implemented in deconvolutional_kernels.cu.
+void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
+void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
+void update_deconvolutional_layer_gpu(deconvolutional_layer layer, int skip, float learning_rate, float momentum, float decay);
+// Transfer weights, biases and their update buffers between host and device.
+void push_deconvolutional_layer(deconvolutional_layer layer);
+void pull_deconvolutional_layer(deconvolutional_layer layer);
+#endif
+
+// CPU implementation (deconvolutional_layer.c).
+deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation);
+void resize_deconvolutional_layer(deconvolutional_layer *layer, int h, int w);
+void forward_deconvolutional_layer(const deconvolutional_layer layer, network_state state);
+void update_deconvolutional_layer(deconvolutional_layer layer, int skip, float learning_rate, float momentum, float decay);
+void backward_deconvolutional_layer(deconvolutional_layer layer, network_state state);
+
+// Views of the layer's output / delta buffers as images.
+image get_deconvolutional_image(deconvolutional_layer layer);
+image get_deconvolutional_delta(deconvolutional_layer layer);
+// NOTE(review): no definition of get_deconvolutional_filter appears in
+// deconvolutional_layer.c; calling it will fail at link time unless it is
+// defined elsewhere.
+image get_deconvolutional_filter(deconvolutional_layer layer, int i);
+
+// Output size: stride * (dim - 1) + size.
+// NOTE(review): deconvolutional_out_size() is defined in deconvolutional_layer.c
+// but has no declaration here.
+int deconvolutional_out_height(deconvolutional_layer layer);
+int deconvolutional_out_width(deconvolutional_layer layer);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/demo.c b/darknet-master/src/demo.c
new file mode 100644
index 0000000..5a01faf
--- /dev/null
+++ b/darknet-master/src/demo.c
@@ -0,0 +1,449 @@
+#include "network.h"
+#include "detection_layer.h"
+#include "region_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+#include "image.h"
+#include "demo.h"
+#include "darknet.h"
+#ifdef WIN32
+#include <time.h>
+#include "gettimeofday.h"
+#else
+#include <sys/time.h>
+#endif
+
+#ifdef OPENCV
+
+#include "http_stream.h"
+
+// Copies of the demo() arguments `names`, `alphabet` (from load_alphabet) and `classes`.
+static char **demo_names;
+static image **demo_alphabet;
+static int demo_classes;
+
+// Latest detections: written by detect_in_thread(), consumed and freed by demo().
+static int nboxes = 0;
+static detection *dets = NULL;
+
+// Network parsed in demo(); in_s is the image produced by the fetch thread,
+// det_s is the image handed to network_predict() by the detect thread.
+static network net;
+static image in_s ;
+static image det_s;
+
+// Capture handle opened in demo() (video stream or webcam).
+static cap_cv *cap;
+// Smoothed frames-per-second value updated in the demo() main loop.
+static float fps = 0;
+static float demo_thresh = 0;
+static int demo_ext_output = 0;
+// Incremented once per main-loop iteration in demo().
+static long long int frame_id = 0;
+static int demo_json_port = -1;
+// When true, the idle fetch thread calls consume_frame(cap) while it waits.
+static bool demo_skip_frame = false;
+
+
+// cv_images is a ring of avg_frames frame handles indexed by demo_index.
+static int avg_frames;
+static int demo_index = 0;
+static mat_cv** cv_images;
+
+// Frame handles for the three pipeline stages: just fetched, being detected, being displayed.
+mat_cv* in_img;
+mat_cv* det_img;
+mat_cv* show_img;
+
+// Set to 1 to make both worker threads return.
+static volatile int flag_exit;
+// Non-zero selects get_image_from_stream_letterbox() instead of the plain resize variant.
+static int letter_box = 0;
+
+// Sleep interval used while polling the request flags below.
+static const int thread_wait_ms = 1;
+// Request flags: set to 1 to ask the worker for one unit of work; the worker clears it when done.
+static volatile int run_fetch_in_thread = 0;
+static volatile int run_detect_in_thread = 0;
+
+
+// Worker loop of the frame-fetch thread.
+// Waits until run_fetch_in_thread is raised, reads one frame from `cap` into in_s / in_img
+// (letterboxed or resized to the network input size), then clears the flag.
+// Returns 0 once flag_exit is set, or after setting flag_exit itself when the stream yields no data.
+// `ptr` is unused.
+void *fetch_in_thread(void *ptr)
+{
+    while (!custom_atomic_load_int(&flag_exit)) {
+        // Idle: spin (yielding) until a fetch is requested or shutdown is signalled.
+        while (!custom_atomic_load_int(&run_fetch_in_thread)) {
+            if (custom_atomic_load_int(&flag_exit)) return 0;
+            // demo_skip_frame is set for webcams / live streams in demo();
+            // presumably consume_frame() drops a pending frame so the next fetch is recent - TODO confirm.
+            if (demo_skip_frame)
+                consume_frame(cap);
+            this_thread_yield();
+        }
+        int dont_close_stream = 0;    // set 1 if your IP-camera periodically turns off and turns on video-stream
+        if (letter_box)
+            in_s = get_image_from_stream_letterbox(cap, net.w, net.h, net.c, &in_img, dont_close_stream);
+        else
+            in_s = get_image_from_stream_resize(cap, net.w, net.h, net.c, &in_img, dont_close_stream);
+        if (!in_s.data) {
+            // No pixel data: treat as end of stream, request global shutdown and release any waiter.
+            printf("Stream closed.\n");
+            custom_atomic_store_int(&flag_exit, 1);
+            custom_atomic_store_int(&run_fetch_in_thread, 0);
+            return 0;
+        }
+        //in_s = resize_image(in, net.w, net.h);
+
+        // Signal completion to whoever raised the request flag.
+        custom_atomic_store_int(&run_fetch_in_thread, 0);
+    }
+    return 0;
+}
+
+// Ask the fetch worker for one frame and block until it reports completion.
+// `ptr` is unused; always returns 0.
+void *fetch_in_thread_sync(void *ptr)
+{
+    // Raise the request flag; the worker clears it when the frame is ready (or the stream ended).
+    custom_atomic_store_int(&run_fetch_in_thread, 1);
+
+    for (;;) {
+        if (!custom_atomic_load_int(&run_fetch_in_thread)) break;
+        this_thread_sleep_for(thread_wait_ms);
+    }
+    return 0;
+}
+
+// Worker loop of the detection thread.
+// Waits until run_detect_in_thread is raised, runs the network on det_s, rotates the
+// cv_images frame ring, stores fresh detections in dets / nboxes, then clears the flag.
+// Returns 0 once flag_exit is set. `ptr` is unused.
+void *detect_in_thread(void *ptr)
+{
+    while (!custom_atomic_load_int(&flag_exit)) {
+        // Idle: spin (yielding) until a detection is requested or shutdown is signalled.
+        while (!custom_atomic_load_int(&run_detect_in_thread)) {
+            if (custom_atomic_load_int(&flag_exit)) return 0;
+            this_thread_yield();
+        }
+
+        // NOTE: `l` is only referenced by the commented-out NMS code further down.
+        layer l = net.layers[net.n - 1];
+        float *X = det_s.data;
+        //float *prediction =
+        network_predict(net, X);
+
+        // Park the current frame in the ring and continue with the slot
+        // avg_frames/2 + 1 positions ahead (modulo avg_frames), then advance the ring index.
+        cv_images[demo_index] = det_img;
+        det_img = cv_images[(demo_index + avg_frames / 2 + 1) % avg_frames];
+        demo_index = (demo_index + 1) % avg_frames;
+
+        // Box extraction differs only in the reference size and the trailing letter-box argument.
+        if (letter_box)
+            dets = get_network_boxes(&net, get_width_mat(in_img), get_height_mat(in_img), demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
+        else
+            dets = get_network_boxes(&net, net.w, net.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
+
+        //const float nms = .45;
+        //if (nms) {
+        //    if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms);
+        //    else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+        //}
+
+        // Signal completion to whoever raised the request flag.
+        custom_atomic_store_int(&run_detect_in_thread, 0);
+    }
+
+    return 0;
+}
+
+// Ask the detection worker for one inference pass and block until it reports completion.
+// `ptr` is unused; always returns 0.
+void *detect_in_thread_sync(void *ptr)
+{
+    // Raise the request flag; the worker clears it after dets / nboxes have been refreshed.
+    custom_atomic_store_int(&run_detect_in_thread, 1);
+
+    for (;;) {
+        if (!custom_atomic_load_int(&run_detect_in_thread)) break;
+        this_thread_sleep_for(thread_wait_ms);
+    }
+    return 0;
+}
+
+// Current wall-clock time in seconds (with microsecond fraction) as reported by gettimeofday().
+// Returns 0 when gettimeofday() reports failure.
+double get_wall_time()
+{
+    struct timeval now;
+    const int rc = gettimeofday(&now, NULL);
+    if (rc != 0) return 0;
+
+    const double seconds = (double)now.tv_sec;
+    const double micros = (double)now.tv_usec;
+    return seconds + micros * .000001;
+}
+
+// Run live detection on a video file / stream or a webcam and display, save or stream the results.
+//
+// Parameters:
+//   cfgfile, weightfile - network configuration and (optional) weights to load.
+//   thresh              - detection threshold (stored in demo_thresh).
+//   hier_thresh         - not used by this function.
+//   cam_index           - webcam index, used only when `filename` is NULL.
+//   filename            - video file or stream to open; NULL selects the webcam.
+//   names, classes      - class names and their count; `names` is freed before returning.
+//   avgframes           - size of the frame ring (clamped to at least 1).
+//   frame_skip          - number of loop iterations between display / FPS updates.
+//   prefix              - when set, frames are saved as "<prefix>_<count>.jpg" instead of being shown.
+//   out_filename        - when set, annotated frames are written to this video file.
+//   mjpeg_port          - > 0 enables the MJPEG stream on that port.
+//   dontdraw_bbox       - non-zero skips drawing detections on the frame.
+//   json_port           - > 0 enables sending detections as JSON on that port.
+//   dont_show           - non-zero disables the preview window.
+//   ext_output          - forwarded to the drawing and HTTP POST helpers.
+//   letter_box_in       - non-zero selects letter-box preprocessing (also forced on by net.letter_box).
+//   time_limit_sec      - > 0 stops the loop after roughly that many seconds.
+//   http_post_host      - when set, detections are sent with send_http_post_request().
+//   benchmark           - non-zero runs detection repeatedly without fetching or drawing new frames.
+//   benchmark_layers    - stored in net.benchmark_layers.
+//   json_file_output    - when set, detections of every frame are appended to this file as a JSON array.
+//
+// Two worker threads (fetch_in_thread / detect_in_thread) are driven through the
+// run_fetch_in_thread / run_detect_in_thread flags; the main loop post-processes the
+// previous detections while the workers produce the next frame and detections.
+void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int avgframes,
+    int frame_skip, char *prefix, char *out_filename, int mjpeg_port, int dontdraw_bbox, int json_port, int dont_show, int ext_output, int letter_box_in, int time_limit_sec, char *http_post_host,
+    int benchmark, int benchmark_layers, char *json_file_output)
+{
+    if (avgframes < 1) avgframes = 1;
+    avg_frames = avgframes;
+    letter_box = letter_box_in;
+    in_img = det_img = show_img = NULL;
+    //skip = frame_skip;
+    image **alphabet = load_alphabet();
+    int delay = frame_skip;
+    demo_names = names;
+    demo_alphabet = alphabet;
+    demo_classes = classes;
+    demo_thresh = thresh;
+    demo_ext_output = ext_output;
+    demo_json_port = json_port;
+    char *json_buf = NULL;
+    FILE* json_file = NULL;
+    // Tracks whether a JSON record has already been written, so that a separator is
+    // emitted before every record except the first (instead of testing a freed pointer).
+    int json_frames_written = 0;
+
+    if (json_file_output) {
+        json_file = fopen(json_file_output, "wb");
+        if (!json_file) {
+            // Fail with a clear message instead of passing a NULL FILE* to fwrite().
+            printf("\n Couldn't open JSON output file: %s \n", json_file_output);
+            error("Couldn't open JSON output file.", DARKNET_LOC);
+        }
+        char *tmp = "[\n";
+        fwrite(tmp, sizeof(char), strlen(tmp), json_file);
+    }
+
+    printf("Demo\n");
+    net = parse_network_cfg_custom(cfgfile, 1, 1);    // set batch=1
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    if (net.letter_box) letter_box = 1;
+    net.benchmark_layers = benchmark_layers;
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+    srand(2222222);
+
+    if(filename){
+        printf("video file: %s\n", filename);
+        cap = get_capture_video_stream(filename);
+        demo_skip_frame = is_live_stream(filename);
+    }else{
+        printf("Webcam index: %d\n", cam_index);
+        cap = get_capture_webcam(cam_index);
+        demo_skip_frame = true;
+    }
+
+    if (!cap) {
+#ifdef WIN32
+        printf("Check that you have copied file opencv_ffmpeg340_64.dll to the same directory where is darknet.exe \n");
+#endif
+        error("Couldn't connect to webcam.", DARKNET_LOC);
+    }
+
+    layer l = net.layers[net.n-1];
+    int j;
+
+    // The ring stores frame handles, so each element has pointer type mat_cv*.
+    cv_images = (mat_cv**)xcalloc(avg_frames, sizeof(mat_cv*));
+
+    // Use the last YOLO layer (if any) as the reference layer for classes / NMS settings.
+    // NOTE: `lc` is a by-value copy, so mean_alpha is only set on the local copy kept in `l`.
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        layer lc = net.layers[i];
+        if (lc.type == YOLO) {
+            lc.mean_alpha = 1.0 / avg_frames;
+            l = lc;
+        }
+    }
+
+    if (l.classes != demo_classes) {
+        printf("\n Parameters don't match: in cfg-file classes=%d, in data-file classes=%d \n", l.classes, demo_classes);
+        error("Error!", DARKNET_LOC);
+    }
+
+    flag_exit = 0;
+
+    custom_thread_t fetch_thread = NULL;
+    custom_thread_t detect_thread = NULL;
+    if (custom_create_thread(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed", DARKNET_LOC);
+    if (custom_create_thread(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed", DARKNET_LOC);
+
+    // Prime the pipeline: fetch a first frame, then fetch + detect enough times
+    // to populate the frame ring before entering the main loop.
+    fetch_in_thread_sync(0); //fetch_in_thread(0);
+    det_img = in_img;
+    det_s = in_s;
+
+    fetch_in_thread_sync(0); //fetch_in_thread(0);
+    detect_in_thread_sync(0); //fetch_in_thread(0);
+    det_img = in_img;
+    det_s = in_s;
+
+    for (j = 0; j < avg_frames / 2; ++j) {
+        free_detections(dets, nboxes);
+        fetch_in_thread_sync(0); //fetch_in_thread(0);
+        detect_in_thread_sync(0); //fetch_in_thread(0);
+        det_img = in_img;
+        det_s = in_s;
+    }
+
+    int count = 0;
+    if(!prefix && !dont_show){
+        int full_screen = 0;
+        create_window_cv("Demo", full_screen, 1352, 1013);
+    }
+
+
+    write_cv* output_video_writer = NULL;
+    if (out_filename && !flag_exit)
+    {
+        int src_fps = 25;
+        src_fps = get_stream_fps_cpp_cv(cap);
+        output_video_writer =
+            create_video_writer(out_filename, 'D', 'I', 'V', 'X', src_fps, get_width_mat(det_img), get_height_mat(det_img), 1);
+
+        //'H', '2', '6', '4'
+        //'D', 'I', 'V', 'X'
+        //'M', 'J', 'P', 'G'
+        //'M', 'P', '4', 'V'
+        //'M', 'P', '4', '2'
+        //'X', 'V', 'I', 'D'
+        //'W', 'M', 'V', '2'
+    }
+
+    int send_http_post_once = 0;
+    const double start_time_lim = get_time_point();
+    double before = get_time_point();
+    double start_time = get_time_point();
+    float avg_fps = 0;
+    int frame_counter = 0;
+    int global_frame_counter = 0;
+
+    while(1){
+        ++count;
+        {
+            const float nms = .45;    // 0.4F
+            // Snapshot the detections of the previous pass before the detect thread overwrites the globals.
+            int local_nboxes = nboxes;
+            detection *local_dets = dets;
+            this_thread_yield();
+
+            // Kick off the next fetch / detect while this thread post-processes the snapshot.
+            if (!benchmark) custom_atomic_store_int(&run_fetch_in_thread, 1); // if (custom_create_thread(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed", DARKNET_LOC);
+            custom_atomic_store_int(&run_detect_in_thread, 1); // if (custom_create_thread(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed", DARKNET_LOC);
+
+            //if (nms) do_nms_obj(local_dets, local_nboxes, l.classes, nms);    // bad results
+            if (nms) {
+                if (l.nms_kind == DEFAULT_NMS) do_nms_sort(local_dets, local_nboxes, l.classes, nms);
+                else diounms_sort(local_dets, local_nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+            }
+
+            if (l.embedding_size) set_track_id(local_dets, local_nboxes, demo_thresh, l.sim_thresh, l.track_ciou_norm, l.track_history_size, l.dets_for_track, l.dets_for_show);
+
+            // ANSI escape sequence: move the cursor home and clear the terminal.
+            printf("\033[H\033[J");
+            //printf("\nFPS:%.1f\n", fps);
+            printf("Objects:\n\n");
+
+            ++frame_id;
+            if (demo_json_port > 0) {
+                int timeout = 400000;
+                send_json(local_dets, local_nboxes, l.classes, demo_names, frame_id, demo_json_port, timeout);
+            }
+
+            if (json_file_output) {
+                if (json_frames_written) {
+                    char *tmp = ", \n";
+                    fwrite(tmp, sizeof(char), strlen(tmp), json_file);
+                }
+                json_buf = detection_to_json(local_dets, local_nboxes, l.classes, demo_names, frame_id, NULL);
+                fwrite(json_buf, sizeof(char), strlen(json_buf), json_file);
+                free(json_buf);
+                json_buf = NULL;
+                json_frames_written = 1;
+            }
+
+            //char *http_post_server = "webhook.site/898bbd9b-0ddd-49cf-b81d-1f56be98d870";
+            if (http_post_host && !send_http_post_once) {
+                int timeout = 3;            // 3 seconds
+                int http_post_port = 80;    // 443 https, 80 http
+                // Pass the snapshot count: the global nboxes may already belong to the next
+                // detection pass and would not match the length of local_dets.
+                if (send_http_post_request(http_post_host, http_post_port, filename,
+                    local_dets, local_nboxes, classes, names, frame_id, ext_output, timeout))
+                {
+                    if (time_limit_sec > 0) send_http_post_once = 1;
+                }
+            }
+
+            if (!benchmark && !dontdraw_bbox) draw_detections_cv_v3(show_img, local_dets, local_nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);
+            free_detections(local_dets, local_nboxes);
+
+            printf("\nFPS:%.1f \t AVG_FPS:%.1f\n", fps, avg_fps);
+
+            if(!prefix){
+                if (!dont_show) {
+                    // Limit window refreshes to roughly 60 per second at high frame rates.
+                    const int each_frame = max_val_cmp(1, avg_fps / 60);
+                    if(global_frame_counter % each_frame == 0) show_image_mat(show_img, "Demo");
+                    int c = wait_key_cv(1);
+                    // Key code 10 cycles frame_skip through 0 -> 60 -> 4 -> 0.
+                    if (c == 10) {
+                        if (frame_skip == 0) frame_skip = 60;
+                        else if (frame_skip == 4) frame_skip = 0;
+                        else if (frame_skip == 60) frame_skip = 4;
+                        else frame_skip = 0;
+                    }
+                    else if (c == 27 || c == 1048603) // ESC - exit (OpenCV 2.x / 3.x)
+                    {
+                        flag_exit = 1;
+                    }
+                }
+            }else{
+                char buff[256];
+                // Bounded formatting: a long prefix is truncated instead of overflowing buff.
+                snprintf(buff, sizeof(buff), "%s_%08d.jpg", prefix, count);
+                if(show_img) save_cv_jpg(show_img, buff);
+            }
+
+            // if you run it with param -mjpeg_port 8090  then open URL in your web-browser: http://localhost:8090
+            if (mjpeg_port > 0 && show_img) {
+                int port = mjpeg_port;
+                int timeout = 400000;
+                int jpeg_quality = 40;    // 1 - 100
+                send_mjpeg(show_img, port, timeout, jpeg_quality);
+            }
+
+            // save video file
+            if (output_video_writer && show_img) {
+                write_frame_cv(output_video_writer, show_img);
+                printf("\n cvWriteFrame \n");
+            }
+
+            // Wait for the workers to finish the pass started at the top of this iteration.
+            while (custom_atomic_load_int(&run_detect_in_thread)) {
+                if(avg_fps > 50) this_thread_yield();
+                else this_thread_sleep_for(thread_wait_ms);   // custom_join(detect_thread, 0);
+            }
+            if (!benchmark) {
+                while (custom_atomic_load_int(&run_fetch_in_thread)) {
+                    if (avg_fps > 50) this_thread_yield();
+                    else this_thread_sleep_for(thread_wait_ms);   // custom_join(fetch_thread, 0);
+                }
+                free_image(det_s);
+            }
+
+            // The elapsed time is divided by 1000000 before being compared with seconds,
+            // i.e. get_time_point() is treated as microseconds here.
+            if (time_limit_sec > 0 && (get_time_point() - start_time_lim)/1000000 > time_limit_sec) {
+                printf(" start_time_lim = %f, get_time_point() = %f, time spent = %f \n", start_time_lim, get_time_point(), get_time_point() - start_time_lim);
+                break;
+            }
+
+            if (flag_exit == 1) break;
+
+            // Advance the pipeline: detected frame -> shown frame, fetched frame -> frame to detect.
+            if(delay == 0){
+                if(!benchmark) release_mat(&show_img);
+                show_img = det_img;
+            }
+            det_img = in_img;
+            det_s = in_s;
+        }
+        --delay;
+        if(delay < 0){
+            delay = frame_skip;
+
+            //double after = get_wall_time();
+            //float curr = 1./(after - before);
+            double after = get_time_point();    // more accurate time measurements
+            float curr = 1000000. / (after - before);
+            // Exponential moving average of the instantaneous frame rate.
+            fps = fps*0.9 + curr*0.1;
+            before = after;
+
+            // Recompute the average FPS over windows of at least 3 seconds.
+            float spent_time = (get_time_point() - start_time) / 1000000;
+            frame_counter++;
+            global_frame_counter++;
+            if (spent_time >= 3.0f) {
+                //printf(" spent_time = %f \n", spent_time);
+                avg_fps = frame_counter / spent_time;
+                frame_counter = 0;
+                start_time = get_time_point();
+            }
+        }
+    }
+    printf("input video stream closed. \n");
+    if (output_video_writer) {
+        release_video_writer(&output_video_writer);
+        printf("output_video_writer closed. \n");
+    }
+
+    if (json_file_output) {
+        char *tmp = "\n]";
+        fwrite(tmp, sizeof(char), strlen(tmp), json_file);
+        fclose(json_file);
+    }
+    this_thread_sleep_for(thread_wait_ms);
+
+    custom_join(detect_thread, 0);
+    custom_join(fetch_thread, 0);
+
+    // free memory
+    free_image(in_s);
+    free_detections(dets, nboxes);
+
+    demo_index = (avg_frames + demo_index - 1) % avg_frames;
+    for (j = 0; j < avg_frames; ++j) {
+            release_mat(&cv_images[j]);
+    }
+    free(cv_images);
+
+    free_ptrs((void **)names, net.layers[net.n - 1].classes);
+
+    free_alphabet(alphabet);
+    free_network(net);
+    //cudaProfilerStop();
+}
+#else
+// Fallback used when the build has no OpenCV support: the demo cannot run,
+// so all arguments are ignored and a notice is written to stderr.
+void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int avgframes,
+    int frame_skip, char *prefix, char *out_filename, int mjpeg_port, int dontdraw_bbox, int json_port, int dont_show, int ext_output, int letter_box_in, int time_limit_sec, char *http_post_host,
+    int benchmark, int benchmark_layers, char *json_file_output)
+{
+    fputs("Demo needs OpenCV for webcam images.\n", stderr);
+}
+#endif
diff --git a/darknet-master/src/demo.h b/darknet-master/src/demo.h
new file mode 100644
index 0000000..15f359d
--- /dev/null
+++ b/darknet-master/src/demo.h
@@ -0,0 +1,14 @@
+#ifndef DEMO_H
+#define DEMO_H
+
+#include "image.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Entry point of the live detection demo (video file / stream or webcam).
+// demo.c provides two definitions: the real one when OPENCV is defined, and a stub
+// that only prints a notice to stderr otherwise.
+void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int avgframes,
+    int frame_skip, char *prefix, char *out_filename, int mjpeg_port, int dontdraw_bbox, int json_port, int dont_show, int ext_output, int letter_box_in, int time_limit_sec, char *http_post_host, int benchmark, int benchmark_layers, char *json_file_output);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/detection_layer.c b/darknet-master/src/detection_layer.c
new file mode 100644
index 0000000..d3b9af8
--- /dev/null
+++ b/darknet-master/src/detection_layer.c
@@ -0,0 +1,315 @@
+#include "detection_layer.h"
+#include "activations.h"
+#include "softmax_layer.h"
+#include "blas.h"
+#include "box.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Build a DETECTION layer for a side x side grid with n boxes per cell.
+// `inputs` must equal side*side*((1 + coords)*n + classes); this is asserted.
+// Allocates the cost scalar plus zeroed output / delta buffers of batch*inputs floats
+// (and their GPU mirrors when GPU is defined), prints a banner to stderr and reseeds rand().
+detection_layer make_detection_layer(int batch, int inputs, int n, int side, int classes, int coords, int rescore)
+{
+    detection_layer l = { (LAYER_TYPE)0 };
+    l.type = DETECTION;
+
+    // Geometry and bookkeeping copied straight from the arguments.
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.n = n;
+    l.classes = classes;
+    l.coords = coords;
+    l.rescore = rescore;
+    l.side = side;
+    l.w = side;
+    l.h = side;
+
+    // Per cell: n boxes of (coords + objectness) plus one shared class vector.
+    assert(side*side*((1 + l.coords)*l.n + l.classes) == inputs);
+
+    // Per cell in the truth tensor: object flag, class vector and one box.
+    l.truths = l.side*l.side*(1+l.coords+l.classes);
+
+    l.cost = (float*)xcalloc(1, sizeof(float));
+    l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
+
+    l.forward = forward_detection_layer;
+    l.backward = backward_detection_layer;
+#ifdef GPU
+    l.forward_gpu = forward_detection_layer_gpu;
+    l.backward_gpu = backward_detection_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+
+    fprintf(stderr, "Detection Layer\n");
+    srand(time(0));
+
+    return l;
+}
+
+// CPU forward pass of the detection layer.
+//
+// Copies state.input to l.output, optionally applies a softmax to the class scores of
+// every grid cell, and - when state.train is set - fills l.delta and l.cost from state.truth.
+//
+// Per batch item the output is laid out as three consecutive sections (see the index math below):
+//   [ locations * classes ]      class scores, one vector per cell
+//   [ locations * n ]            objectness, one value per predicted box
+//   [ locations * n * coords ]   box coordinates
+// Per cell the truth holds: object flag, `classes` class values, then the box.
+void forward_detection_layer(const detection_layer l, network_state state)
+{
+    int locations = l.side*l.side;
+    int i,j;
+    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    //if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
+    int b;
+    if (l.softmax){
+        // Normalise the class scores of each cell in place.
+        for(b = 0; b < l.batch; ++b){
+            int index = b*l.inputs;
+            for (i = 0; i < locations; ++i) {
+                int offset = i*l.classes;
+                softmax(l.output + index + offset, l.classes, 1,
+                        l.output + index + offset, 1);
+            }
+        }
+    }
+    if(state.train){
+        // Running sums used only for the statistics line printed at the end.
+        float avg_iou = 0;
+        float avg_cat = 0;
+        float avg_allcat = 0;
+        float avg_obj = 0;
+        float avg_anyobj = 0;
+        int count = 0;
+        *(l.cost) = 0;
+        int size = l.inputs * l.batch;
+        memset(l.delta, 0, size * sizeof(float));
+        for (b = 0; b < l.batch; ++b){
+            int index = b*l.inputs;
+            for (i = 0; i < locations; ++i) {
+                int truth_index = (b*locations + i)*(1+l.coords+l.classes);
+                int is_obj = state.truth[truth_index];
+                // Start by pushing the objectness of every box in this cell towards 0;
+                // the responsible box is corrected further down when the cell holds an object.
+                for (j = 0; j < l.n; ++j) {
+                    int p_index = index + locations*l.classes + i*l.n + j;
+                    l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
+                    *(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
+                    avg_anyobj += l.output[p_index];
+                }
+
+                int best_index = -1;
+                float best_iou = 0;
+                float best_rmse = 20;
+
+                // Cells without an object only contribute the no-object term above.
+                if (!is_obj){
+                    continue;
+                }
+
+                // Class term: difference between truth and predicted class scores.
+                int class_index = index + i*l.classes;
+                for(j = 0; j < l.classes; ++j) {
+                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
+                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
+                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
+                    avg_allcat += l.output[class_index+j];
+                }
+
+                // x / y are divided by the grid size before boxes are compared.
+                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
+                truth.x /= l.side;
+                truth.y /= l.side;
+
+                // Choose the responsible box: highest IoU once any box overlaps the truth,
+                // otherwise the lowest RMSE.
+                for(j = 0; j < l.n; ++j){
+                    int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
+                    box out = float_to_box(l.output + box_index);
+                    out.x /= l.side;
+                    out.y /= l.side;
+
+                    // With l.sqrt the network output for w / h is squared before comparison.
+                    if (l.sqrt){
+                        out.w = out.w*out.w;
+                        out.h = out.h*out.h;
+                    }
+
+                    float iou  = box_iou(out, truth);
+                    //iou = 0;
+                    float rmse = box_rmse(out, truth);
+                    if(best_iou > 0 || iou > 0){
+                        if(iou > best_iou){
+                            best_iou = iou;
+                            best_index = j;
+                        }
+                    }else{
+                        if(rmse < best_rmse){
+                            best_rmse = rmse;
+                            best_index = j;
+                        }
+                    }
+                }
+
+                // Optional overrides of the selection: by truth box area (forced) or at random
+                // while fewer than 64000 images have been seen.
+                if(l.forced){
+                    if(truth.w*truth.h < .1){
+                        best_index = 1;
+                    }else{
+                        best_index = 0;
+                    }
+                }
+                if(l.random && *(state.net.seen) < 64000){
+                    best_index = rand()%l.n;
+                }
+
+                // NOTE(review): best_index is still -1 here if no box passed either test above
+                // (e.g. every rmse >= 20 with zero IoU); the indices below would then be out of range.
+                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
+                int tbox_index = truth_index + 1 + l.classes;
+
+                box out = float_to_box(l.output + box_index);
+                out.x /= l.side;
+                out.y /= l.side;
+                if (l.sqrt) {
+                    out.w = out.w*out.w;
+                    out.h = out.h*out.h;
+                }
+                float iou  = box_iou(out, truth);
+
+                // Replace the no-object term of the responsible box with the object term.
+                //printf("%d,", best_index);
+                int p_index = index + locations*l.classes + i*l.n + best_index;
+                *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
+                *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
+                avg_obj += l.output[p_index];
+                l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);
+
+                // With rescore the objectness target is the IoU instead of 1.
+                if(l.rescore){
+                    l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
+                }
+
+                // Coordinate term; with l.sqrt the w / h targets are the square roots of the truth.
+                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
+                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
+                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
+                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
+                if(l.sqrt){
+                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
+                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
+                }
+
+                *(l.cost) += pow(1-iou, 2);
+                avg_iou += iou;
+                ++count;
+            }
+        }
+
+        // Disabled code path (condition is the constant 0): would keep only the
+        // objectness deltas whose squared value reaches the 100th largest.
+        if(0){
+            float* costs = (float*)xcalloc(l.batch * locations * l.n, sizeof(float));
+            for (b = 0; b < l.batch; ++b) {
+                int index = b*l.inputs;
+                for (i = 0; i < locations; ++i) {
+                    for (j = 0; j < l.n; ++j) {
+                        int p_index = index + locations*l.classes + i*l.n + j;
+                        costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
+                    }
+                }
+            }
+            int indexes[100];
+            top_k(costs, l.batch*locations*l.n, 100, indexes);
+            float cutoff = costs[indexes[99]];
+            for (b = 0; b < l.batch; ++b) {
+                int index = b*l.inputs;
+                for (i = 0; i < locations; ++i) {
+                    for (j = 0; j < l.n; ++j) {
+                        int p_index = index + locations*l.classes + i*l.n + j;
+                        if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
+                    }
+                }
+            }
+            free(costs);
+        }
+
+
+        // The reported cost is the squared magnitude of the delta vector; this overwrites
+        // the value accumulated term by term in the loops above.
+        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+
+
+        // NOTE: when no cell contained an object (count == 0) the per-object averages divide by zero.
+        printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
+        //if(l.reorg) reorg(l.delta, l.w*l.h, size*l.n, l.batch, 0);
+    }
+}
+
+// CPU backward pass: accumulate this layer's delta into the delta buffer of the network state.
+void backward_detection_layer(const detection_layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    // state.delta += 1 * l.delta, element-wise over the whole batch.
+    axpy_cpu(total, 1, l.delta, 1, state.delta, 1);
+}
+
+// Decode the layer output into boxes and per-class probabilities.
+// For every grid cell and each of its l.n boxes, writes boxes[cell*l.n + k] (scaled to w x h)
+// and probs[cell*l.n + k][class] = objectness * class score, zeroed when not above `thresh`.
+// When only_objectness is set, probs[...][0] is overwritten with the raw objectness.
+void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness)
+{
+    float *predictions = l.output;
+    //int per_cell = 5*num+classes;
+    const int locations = l.side*l.side;
+    // Start offsets of the objectness and box sections inside the output vector.
+    const int obj_base = locations*l.classes;
+    const int box_base = locations*(l.classes + l.n);
+    int cell, k, c;
+
+    for (cell = 0; cell < locations; ++cell) {
+        const int row = cell / l.side;
+        const int col = cell % l.side;
+        const int class_index = cell*l.classes;
+
+        for (k = 0; k < l.n; ++k) {
+            const int index = cell*l.n + k;
+            const int box_index = box_base + index*4;
+            const float scale = predictions[obj_base + index];
+
+            // x / y are cell-relative offsets; w / h are squared first when l.sqrt is set.
+            boxes[index].x = (predictions[box_index + 0] + col) / l.side * w;
+            boxes[index].y = (predictions[box_index + 1] + row) / l.side * h;
+            boxes[index].w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
+            boxes[index].h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
+
+            for (c = 0; c < l.classes; ++c) {
+                const float prob = scale*predictions[class_index+c];
+                probs[index][c] = (prob > thresh) ? prob : 0;
+            }
+            if (only_objectness) probs[index][0] = scale;
+        }
+    }
+}
+
+#ifdef GPU
+
+// GPU forward pass.
+// Inference: copy the input to l.output_gpu on the device and return.
+// Training: pull input (and truth, if present) to host buffers, run the CPU
+// forward pass, then push the resulting output and delta back to the device.
+void forward_detection_layer_gpu(const detection_layer l, network_state state)
+{
+    const int total_inputs = l.batch*l.inputs;
+
+    if (!state.train) {
+        copy_ongpu(total_inputs, state.input, 1, l.output_gpu, 1);
+        return;
+    }
+
+    // Host-side staging buffers for the device input and truth.
+    float *host_input = (float*)xcalloc(l.batch * l.inputs, sizeof(float));
+    float *host_truth = 0;
+    if (state.truth) {
+        const int truth_count = l.batch*l.side*l.side*(1+l.coords+l.classes);
+        host_truth = (float*)xcalloc(truth_count, sizeof(float));
+        cuda_pull_array(state.truth, host_truth, truth_count);
+    }
+    cuda_pull_array(state.input, host_input, total_inputs);
+
+    // Same state as the caller's, but pointing at the host copies.
+    network_state host_state = state;
+    host_state.truth = host_truth;
+    host_state.input = host_input;
+    forward_detection_layer(l, host_state);
+
+    cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
+    cuda_push_array(l.delta_gpu, l.delta, total_inputs);
+
+    free(host_input);
+    if (host_truth) free(host_truth);
+}
+
+// GPU backward pass: accumulate this layer's device delta into the delta buffer of the network state.
+void backward_detection_layer_gpu(detection_layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    // state.delta += 1 * l.delta_gpu, element-wise over the whole batch.
+    axpy_ongpu(total, 1, l.delta_gpu, 1, state.delta, 1);
+    //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
+
+// Decode the layer output into `detection` records.
+// For every grid cell and each of its l.n boxes, fills dets[cell*l.n + k] with the box
+// (scaled to w x h), the raw objectness, and prob[class] = objectness * class score,
+// zeroed when not above `thresh`.
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets)
+{
+    float *predictions = l.output;
+    //int per_cell = 5*num+classes;
+    const int locations = l.side*l.side;
+    // Start offsets of the objectness and box sections inside the output vector.
+    const int obj_base = locations*l.classes;
+    const int box_base = locations*(l.classes + l.n);
+    int cell, k, c;
+
+    for (cell = 0; cell < locations; ++cell) {
+        const int row = cell / l.side;
+        const int col = cell % l.side;
+        const int class_index = cell*l.classes;
+
+        for (k = 0; k < l.n; ++k) {
+            const int index = cell*l.n + k;
+            const int box_index = box_base + index * 4;
+            const float scale = predictions[obj_base + index];
+
+            // x / y are cell-relative offsets; w / h are squared first when l.sqrt is set.
+            box b;
+            b.x = (predictions[box_index + 0] + col) / l.side * w;
+            b.y = (predictions[box_index + 1] + row) / l.side * h;
+            b.w = pow(predictions[box_index + 2], (l.sqrt ? 2 : 1)) * w;
+            b.h = pow(predictions[box_index + 3], (l.sqrt ? 2 : 1)) * h;
+
+            dets[index].bbox = b;
+            dets[index].objectness = scale;
+            for (c = 0; c < l.classes; ++c) {
+                const float prob = scale*predictions[class_index + c];
+                dets[index].prob[c] = (prob > thresh) ? prob : 0;
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/detection_layer.h b/darknet-master/src/detection_layer.h
new file mode 100644
index 0000000..f97bc39
--- /dev/null
+++ b/darknet-master/src/detection_layer.h
@@ -0,0 +1,26 @@
+#ifndef DETECTION_LAYER_H
+#define DETECTION_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+// A detection layer is represented by the generic `layer` struct.
+typedef layer detection_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Build a detection layer for a side x side grid with n boxes per cell.
+// The fourth parameter is named `side` to match the definition in detection_layer.c.
+detection_layer make_detection_layer(int batch, int inputs, int n, int side, int classes, int coords, int rescore);
+// CPU forward / backward passes.
+void forward_detection_layer(const detection_layer l, network_state state);
+void backward_detection_layer(const detection_layer l, network_state state);
+// Decode the layer output into boxes + probabilities, or into `detection` records,
+// scaled to a w x h image and thresholded with `thresh`.
+void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness);
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets);
+
+// GPU forward / backward passes; only declared when the build defines GPU.
+#ifdef GPU
+void forward_detection_layer_gpu(const detection_layer l, network_state state);
+void backward_detection_layer_gpu(detection_layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/detector.c b/darknet-master/src/detector.c
new file mode 100644
index 0000000..0fc3614
--- /dev/null
+++ b/darknet-master/src/detector.c
@@ -0,0 +1,2048 @@
+#include <stdlib.h>
+#include "darknet.h"
+#include "network.h"
+#include "region_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+#include "demo.h"
+#include "option_list.h"
+
+/* Fallback declaration of the qsort-style comparison callback type, used only
+ * when no earlier header has already defined __COMPAR_FN_T. */
+#ifndef __COMPAR_FN_T
+#define __COMPAR_FN_T
+typedef int (*__compar_fn_t)(const void*, const void*);
+#ifdef __USE_GNU
+typedef __compar_fn_t comparison_fn_t;
+#endif
+#endif
+
+#include "http_stream.h"
+
+/* Lookup table (80 entries) indexed by the network's class index; the value is
+ * written as "category_id" by print_cocos(). */
+static int coco_ids[] = { 1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90 };
+
+/*
+ * Train a detector network.
+ *
+ *   datacfg          data configuration file (keys read here: train, valid,
+ *                    backup, names, saveweights, savelast)
+ *   cfgfile          network configuration file
+ *   weightfile       optional initial weights (may be NULL)
+ *   gpus, ngpus      GPU indices to train on and their count
+ *   clear            when non-zero, reset the `seen` and `cur_iteration` counters
+ *   dont_show        forwarded to the chart / training helpers to suppress windows
+ *   calc_map         when non-zero, periodically evaluate mAP on the validation
+ *                    list with a second network and keep the best weights
+ *   thresh,
+ *   iou_thresh       forwarded to validate_detector_map()
+ *   mjpeg_port       forwarded to draw_train_loss()
+ *   show_imgs        forwarded to the data loader
+ *   benchmark_layers stored in every network's benchmark_layers field
+ *   chart_path       forwarded to draw_train_chart()
+ *   mAP_epochs       number of epochs between two mAP evaluations
+ *
+ * Weights are written to the backup directory as <base>_<iter>.weights,
+ * <base>_last.weights, <base>_best.weights (with calc_map), <base>_ema.weights
+ * (when EMA is enabled and initialised) and <base>_final.weights.
+ */
+void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dont_show, int calc_map, float thresh, float iou_thresh, int mjpeg_port, int show_imgs, int benchmark_layers, char* chart_path, int mAP_epochs)
+{
+    list *options = read_data_cfg(datacfg);
+    char *train_images = option_find_str(options, "train", "data/train.txt");
+    char *valid_images = option_find_str(options, "valid", train_images);
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+
+
+    network net_map;
+    if (calc_map) {
+        FILE* valid_file = fopen(valid_images, "r");
+        if (!valid_file) {
+            printf("\n Error: There is no %s file for mAP calculation!\n Don't use -map flag.\n Or set valid=%s in your %s file. \n", valid_images, train_images, datacfg);
+            error("Error!", DARKNET_LOC);
+        }
+        else fclose(valid_file);
+
+        cuda_set_device(gpus[0]);
+        printf(" Prepare additional network for mAP calculation...\n");
+        net_map = parse_network_cfg_custom(cfgfile, 1, 1);
+        net_map.benchmark_layers = benchmark_layers;
+        const int net_classes = net_map.layers[net_map.n - 1].classes;
+
+        int k;  // free memory unnecessary arrays
+        for (k = 0; k < net_map.n - 1; ++k) free_layer_custom(net_map.layers[k], 1);
+
+        char *name_list = option_find_str(options, "names", "data/names.list");
+        int names_size = 0;
+        char **names = get_labels_custom(name_list, &names_size);
+        if (net_classes != names_size) {
+            printf("\n Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n",
+                name_list, names_size, net_classes, cfgfile);
+        }
+        // Free exactly as many entries as were loaded: the class count of the
+        // network may differ from the number of names (see the check above).
+        free_ptrs((void**)names, names_size);
+    }
+
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    float avg_contrastive_acc = 0;
+    network* nets = (network*)xcalloc(ngpus, sizeof(network));
+
+    srand(time(0));
+    int seed = rand();
+    int k;
+    for (k = 0; k < ngpus; ++k) {
+        // Same seed for every GPU so that all replicas are parsed identically.
+        srand(seed);
+#ifdef GPU
+        cuda_set_device(gpus[k]);
+#endif
+        nets[k] = parse_network_cfg(cfgfile);
+        nets[k].benchmark_layers = benchmark_layers;
+        if (weightfile) {
+            load_weights(&nets[k], weightfile);
+        }
+        if (clear) {
+            *nets[k].seen = 0;
+            *nets[k].cur_iteration = 0;
+        }
+        nets[k].learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network net = nets[0];
+
+    const int actual_batch_size = net.batch * net.subdivisions;
+    if (actual_batch_size == 1) {
+        error("Error: You set incorrect value batch=1 for Training! You should set batch=64 subdivision=64", DARKNET_LOC);
+    }
+    else if (actual_batch_size < 8) {
+        printf("\n Warning: You set batch=%d lower than 64! It is recommended to set batch=64 subdivision=64 \n", actual_batch_size);
+    }
+
+    int save_after_iterations = option_find_int(options, "saveweights", (net.max_batches < 10000) ? 1000 : 10000 );  // configure when to write weights. Very useful for smaller datasets!
+    int save_last_weights_after = option_find_int(options, "savelast", 100);
+    printf("Weights are saved after: %d iterations. Last weights (*_last.weight) are stored every %d iterations. \n", save_after_iterations, save_last_weights_after );
+
+
+    int imgs = net.batch * net.subdivisions * ngpus;
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    data train, buffer;
+
+    // Use the last YOLO / GAUSSIAN_YOLO / REGION layer as the detection layer,
+    // falling back to the very last layer of the network.
+    layer l = net.layers[net.n - 1];
+    for (k = 0; k < net.n; ++k) {
+        layer lk = net.layers[k];
+        if (lk.type == YOLO || lk.type == GAUSSIAN_YOLO || lk.type == REGION) {
+            l = lk;
+            printf(" Detection layer: %d - type = %d \n", k, l.type);
+        }
+    }
+
+    int classes = l.classes;
+
+    list *plist = get_paths(train_images);
+    int train_images_num = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    const int init_w = net.w;
+    const int init_h = net.h;
+    const int init_b = net.batch;
+    int iter_save, iter_save_last, iter_map;
+    iter_save = get_current_iteration(net);
+    iter_save_last = get_current_iteration(net);
+    iter_map = get_current_iteration(net);
+    float mean_average_precision = -1;
+    float best_map = mean_average_precision;
+
+    load_args args = { 0 };
+    args.w = net.w;
+    args.h = net.h;
+    args.c = net.c;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.classes = classes;
+    args.flip = net.flip;
+    args.jitter = l.jitter;
+    args.resize = l.resize;
+    args.num_boxes = l.max_boxes;
+    args.truth_size = l.truth_size;
+    net.num_boxes = args.num_boxes;
+    net.train_images_num = train_images_num;
+    args.d = &buffer;
+    args.type = DETECTION_DATA;
+    args.threads = 64;    // 16 or 64
+
+    args.angle = net.angle;
+    args.gaussian_noise = net.gaussian_noise;
+    args.blur = net.blur;
+    args.mixup = net.mixup;
+    args.exposure = net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+    args.letter_box = net.letter_box;
+    args.mosaic_bound = net.mosaic_bound;
+    args.contrastive = net.contrastive;
+    args.contrastive_jit_flip = net.contrastive_jit_flip;
+    args.contrastive_color = net.contrastive_color;
+    if (dont_show && show_imgs) show_imgs = 2;
+    args.show_imgs = show_imgs;
+
+#ifdef OPENCV
+    //int num_threads = get_num_threads();
+    //if(num_threads > 2) args.threads = get_num_threads() - 2;
+    args.threads = 6 * ngpus;   // 3 for - Amazon EC2 Tesla V100: p3.2xlarge (8 logical cores) - p3.16xlarge
+    //args.threads = 12 * ngpus;    // Ryzen 7 2700X (16 logical cores)
+    mat_cv* img = NULL;
+    float max_img_loss = net.max_chart_loss;
+    int number_of_lines = 100;
+    int img_size = 1000;
+    char windows_name[100];
+    // Bounded formatting: `base` comes from the cfg file name and may be long.
+    snprintf(windows_name, sizeof(windows_name), "chart_%s.png", base);
+    img = draw_train_chart(windows_name, max_img_loss, net.max_batches, number_of_lines, img_size, dont_show, chart_path);
+#endif    //OPENCV
+    if (net.contrastive && args.threads > net.batch/2) args.threads = net.batch / 2;
+    if (net.track) {
+        args.track = net.track;
+        args.augment_speed = net.augment_speed;
+        if (net.sequential_subdivisions) args.threads = net.sequential_subdivisions * ngpus;
+        else args.threads = net.subdivisions * ngpus;
+        args.mini_batch = net.batch / net.time_steps;
+        printf("\n Tracking! batch = %d, subdiv = %d, time_steps = %d, mini_batch = %d \n", net.batch, net.subdivisions, net.time_steps, args.mini_batch);
+    }
+    //printf(" imgs = %d \n", imgs);
+
+    // Start loading the first batch in the background.
+    pthread_t load_thread = load_data(args);
+
+    int count = 0;
+    double time_remaining, avg_time = -1, alpha_time = 0.01;
+
+    //while(i*imgs < N*120){
+    while (get_current_iteration(net) < net.max_batches) {
+        // Multi-scale training: pick a new input resolution every 10th pass.
+        if (l.random && count++ % 10 == 0) {
+            float rand_coef = 1.4;
+            if (l.random != 1.0) rand_coef = l.random;
+            printf("Resizing, random_coef = %.2f \n", rand_coef);
+            float random_val = rand_scale(rand_coef);    // *x or /x
+            int dim_w = roundl(random_val*init_w / net.resize_step + 1) * net.resize_step;
+            int dim_h = roundl(random_val*init_h / net.resize_step + 1) * net.resize_step;
+            if (random_val < 1 && (dim_w > init_w || dim_h > init_h)) dim_w = init_w, dim_h = init_h;
+
+            int max_dim_w = roundl(rand_coef*init_w / net.resize_step + 1) * net.resize_step;
+            int max_dim_h = roundl(rand_coef*init_h / net.resize_step + 1) * net.resize_step;
+
+            // at the beginning (check if enough memory) and at the end (calc rolling mean/variance)
+            if (avg_loss < 0 || get_current_iteration(net) > net.max_batches - 100) {
+                dim_w = max_dim_w;
+                dim_h = max_dim_h;
+            }
+
+            if (dim_w < net.resize_step) dim_w = net.resize_step;
+            if (dim_h < net.resize_step) dim_h = net.resize_step;
+            int dim_b = (init_b * max_dim_w * max_dim_h) / (dim_w * dim_h);
+            int new_dim_b = (int)(dim_b * 0.8);
+            if (new_dim_b > init_b) dim_b = new_dim_b;
+
+            args.w = dim_w;
+            args.h = dim_h;
+
+            if (net.dynamic_minibatch) {
+                for (k = 0; k < ngpus; ++k) {
+                    (*nets[k].seen) = init_b * net.subdivisions * get_current_iteration(net); // remove this line, when you will save to weights-file both: seen & cur_iteration
+                    nets[k].batch = dim_b;
+                    int j;
+                    for (j = 0; j < nets[k].n; ++j)
+                        nets[k].layers[j].batch = dim_b;
+                }
+                net.batch = dim_b;
+                imgs = net.batch * net.subdivisions * ngpus;
+                args.n = imgs;
+                printf("\n %d x %d  (batch = %d) \n", dim_w, dim_h, net.batch);
+            }
+            else
+                printf("\n %d x %d \n", dim_w, dim_h);
+
+            // The batch that is currently loading has the old size: wait for
+            // it, drop it and restart the loader with the new dimensions.
+            pthread_join(load_thread, 0);
+            train = buffer;
+            free_data(train);
+            load_thread = load_data(args);
+
+            for (k = 0; k < ngpus; ++k) {
+                resize_network(nets + k, dim_w, dim_h);
+            }
+            net = nets[0];
+        }
+        double time = what_time_is_it_now();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        if (net.track) {
+            net.sequential_subdivisions = get_current_seq_subdivisions(net);
+            args.threads = net.sequential_subdivisions * ngpus;
+            printf(" sequential_subdivisions = %d, sequence = %d \n", net.sequential_subdivisions, get_sequence_value(net));
+        }
+        // Prefetch the next batch while this one is being trained on.
+        load_thread = load_data(args);
+
+        const double load_time = (what_time_is_it_now() - time);
+        printf("Loaded: %lf seconds", load_time);
+        if (load_time > 0.1 && avg_loss > 0) printf(" - performance bottleneck on CPU or Disk HDD/SSD");
+        printf("\n");
+
+        time = what_time_is_it_now();
+        float loss = 0;
+#ifdef GPU
+        if (ngpus == 1) {
+            int wait_key = (dont_show) ? 0 : 1;
+            loss = train_network_waitkey(net, train, wait_key);
+        }
+        else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss;    // if(-inf or nan)
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        const int iteration = get_current_iteration(net);
+        //i = get_current_batch(net);
+
+        int calc_map_for_each = mAP_epochs * train_images_num / (net.batch * net.subdivisions);  // calculate mAP every mAP_epochs
+        calc_map_for_each = fmax(calc_map_for_each, 100);
+        int next_map_calc = iter_map + calc_map_for_each;
+        next_map_calc = fmax(next_map_calc, net.burn_in);
+        //next_map_calc = fmax(next_map_calc, 400);
+        if (calc_map) {
+            printf("\n (next mAP calculation at %d iterations) ", next_map_calc);
+            if (mean_average_precision > 0) printf("\n Last accuracy mAP@%0.2f = %2.2f %%, best = %2.2f %% ", iou_thresh, mean_average_precision * 100, best_map * 100);
+        }
+
+        // ANSI escape sequence: move the cursor home and clear the screen.
+        printf("\033[H\033[J");
+        if (mean_average_precision > 0.0) {
+            printf("%d/%d: loss=%0.1f map=%0.2f best=%0.2f hours left=%0.1f\007", iteration, net.max_batches, loss, mean_average_precision, best_map, avg_time);
+        }
+        else {
+            printf("%d/%d: loss=%0.1f hours left=%0.1f\007", iteration, net.max_batches, loss, avg_time);
+        }
+
+        if (net.cudnn_half) {
+            if (iteration < net.burn_in * 3) fprintf(stderr, "\n Tensor Cores are disabled until the first %d iterations are reached.\n", 3 * net.burn_in);
+            else fprintf(stderr, "\n Tensor Cores are used.\n");
+            fflush(stderr);
+        }
+        printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images, %f hours left\n", iteration, loss, avg_loss, get_current_rate(net), (what_time_is_it_now() - time), iteration*imgs, avg_time);
+        fflush(stdout);
+
+        int draw_precision = 0;
+        if (calc_map && (iteration >= next_map_calc || iteration == net.max_batches)) {
+            if (l.random) {
+                // Evaluate at the configured resolution, not at a random one.
+                printf("Resizing to initial size: %d x %d ", init_w, init_h);
+                args.w = init_w;
+                args.h = init_h;
+                if (net.dynamic_minibatch) {
+                    for (k = 0; k < ngpus; ++k) {
+                        nets[k].batch = init_b;
+                        int j;
+                        for (j = 0; j < nets[k].n; ++j)
+                            nets[k].layers[j].batch = init_b;
+                    }
+                    net.batch = init_b;
+                    imgs = init_b * net.subdivisions * ngpus;
+                    args.n = imgs;
+                    printf("\n %d x %d  (batch = %d) \n", init_w, init_h, init_b);
+                }
+                pthread_join(load_thread, 0);
+                free_data(train);
+                train = buffer;
+                load_thread = load_data(args);
+                for (k = 0; k < ngpus; ++k) {
+                    resize_network(nets + k, init_w, init_h);
+                }
+                net = nets[0];
+            }
+
+            copy_weights_net(net, &net_map);
+
+            // combine Training and Validation networks
+            //network net_combined = combine_train_valid_networks(net, net_map);
+
+            iter_map = iteration;
+            mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, thresh, iou_thresh, 0, net.letter_box, &net_map);// &net_combined);
+            printf("\n mean_average_precision (mAP@%0.2f) = %f \n", iou_thresh, mean_average_precision);
+            if (mean_average_precision >= best_map) {
+                best_map = mean_average_precision;
+                printf("New best mAP!\n");
+                char buff[256];
+                snprintf(buff, sizeof(buff), "%s/%s_best.weights", backup_directory, base);
+                save_weights(net, buff);
+            }
+
+            draw_precision = 1;
+        }
+        time_remaining = ((net.max_batches - iteration) / ngpus)*(what_time_is_it_now() - time + load_time) / 60 / 60;
+        // set initial value, even if resume training from 10000 iteration
+        if (avg_time < 0) avg_time = time_remaining;
+        else avg_time = alpha_time * time_remaining + (1 -  alpha_time) * avg_time;
+#ifdef OPENCV
+        if (net.contrastive) {
+            float cur_con_acc = -1;
+            for (k = 0; k < net.n; ++k)
+                if (net.layers[k].type == CONTRASTIVE) cur_con_acc = *net.layers[k].loss;
+            if (cur_con_acc >= 0) avg_contrastive_acc = avg_contrastive_acc*0.99 + cur_con_acc * 0.01;
+            printf("  avg_contrastive_acc = %f \n", avg_contrastive_acc);
+        }
+        draw_train_loss(windows_name, img, img_size, avg_loss, max_img_loss, iteration, net.max_batches, mean_average_precision, draw_precision, "mAP%", avg_contrastive_acc / 100, dont_show, mjpeg_port, avg_time);
+#endif    // OPENCV
+
+        // Periodic numbered checkpoint.
+        if ( (iteration >= (iter_save + save_after_iterations) || iteration % save_after_iterations == 0) )
+        {
+            iter_save = iteration;
+#ifdef GPU
+            if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, iteration);
+            save_weights(net, buff);
+        }
+
+        // More frequent "last" checkpoint (only when it is actually more frequent).
+        if ( (save_after_iterations > save_last_weights_after) && (iteration >= (iter_save_last + save_last_weights_after) || (iteration % save_last_weights_after == 0 && iteration > 1))) {
+            iter_save_last = iteration;
+#ifdef GPU
+            if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_last.weights", backup_directory, base);
+            save_weights(net, buff);
+
+            if (net.ema_alpha && is_ema_initialized(net)) {
+                snprintf(buff, sizeof(buff), "%s/%s_ema.weights", backup_directory, base);
+                save_weights_upto(net, buff, net.n, 1);
+                printf(" EMA weights are saved to the file: %s \n", buff);
+            }
+        }
+        free_data(train);
+    }
+#ifdef GPU
+    if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+    printf("If you want to train from the beginning, then use flag in the end of training command: -clear \n");
+
+#ifdef OPENCV
+    release_mat(&img);
+    destroy_all_windows_cv();
+#endif
+
+    // free memory
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+
+    free_load_threads(&args);
+
+    free(base);
+    free(paths);
+    free_list_contents(plist);
+    free_list(plist);
+
+    free_list_contents_kvp(options);
+    free_list(options);
+
+    for (k = 0; k < ngpus; ++k) free_network(nets[k]);
+    free(nets);
+    //free_network(net);
+
+    if (calc_map) {
+        // All layers except the last were already released right after parsing.
+        net_map.n = 0;
+        free_network(net_map);
+    }
+}
+
+
+/*
+ * Extract the numeric image id from a COCO-style file name.
+ *
+ * The number is parsed (atoi) from the text following the last '_' in
+ * `filename`, or, if there is no '_', following the last '/'.  When the name
+ * contains neither separator the whole string is parsed, instead of
+ * dereferencing a NULL pointer.
+ */
+static int get_coco_image_id(char *filename)
+{
+    char *p = strrchr(filename, '/');
+    char *c = strrchr(filename, '_');
+    if (c) p = c;
+    if (!p) return atoi(filename);
+    return atoi(p + 1);
+}
+
+/*
+ * Append the detections of one image to `fp` as COCO-style JSON records, one
+ * record (terminated by ",\n") per class with a probability above zero.
+ *
+ *   fp          output stream
+ *   image_path  path of the image; its base name (via basecfg) is parsed with
+ *               atoi() to obtain "image_id"
+ *   dets        detections, `num_boxes` entries
+ *   classes     number of class probabilities to inspect per detection
+ *   w, h        image size used to clip the boxes
+ *
+ * The box is written as [left, top, width, height] after clipping.
+ */
+static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h)
+{
+    const int known_ids = (int)(sizeof(coco_ids) / sizeof(coco_ids[0]));
+    int i, j;
+    //int image_id = get_coco_image_id(image_path);
+    char *p = basecfg(image_path);
+    int image_id = atoi(p);
+    free(p);    // basecfg() returns an allocated string (callers in this file free it)
+
+    // coco_ids only has `known_ids` entries; never index past its end.
+    if (classes > known_ids) classes = known_ids;
+
+    for (i = 0; i < num_boxes; ++i) {
+        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
+        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
+        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
+        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
+
+        if (xmin < 0) xmin = 0;
+        if (ymin < 0) ymin = 0;
+        if (xmax > w) xmax = w;
+        if (ymax > h) ymax = h;
+
+        float bx = xmin;
+        float by = ymin;
+        float bw = xmax - xmin;
+        float bh = ymax - ymin;
+
+        for (j = 0; j < classes; ++j) {
+            if (dets[i].prob[j] > 0) {
+                fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]);
+            }
+        }
+    }
+}
+
+/*
+ * Write the detections of one image to per-class result files.
+ *
+ * For every detection and every class whose probability is non-zero, one line
+ * "<id> <prob> <xmin> <ymin> <xmax> <ymax>" is appended to fps[class].
+ * Coordinates are shifted by +1 and clipped to [1, w] x [1, h].
+ */
+void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h)
+{
+    int d;
+    for (d = 0; d < total; ++d) {
+        const box b = dets[d].bbox;
+        float left   = b.x - b.w / 2. + 1;
+        float right  = b.x + b.w / 2. + 1;
+        float top    = b.y - b.h / 2. + 1;
+        float bottom = b.y + b.h / 2. + 1;
+        int cls;
+
+        // Clip to the 1-based image rectangle.
+        left   = (left < 1)   ? 1 : left;
+        top    = (top < 1)    ? 1 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+
+        for (cls = 0; cls < classes; ++cls) {
+            const float p = dets[d].prob[cls];
+            if (p == 0) continue;
+            fprintf(fps[cls], "%s %f %f %f %f %f\n", id, p,
+                left, top, right, bottom);
+        }
+    }
+}
+
+/*
+ * Write the detections of one image to a single result file.
+ *
+ * For every detection and every class with a probability above zero, one line
+ * "<id> <class+1> <prob> <xmin> <ymin> <xmax> <ymax>" is appended to `fp`.
+ * Coordinates are clipped to [0, w] x [0, h].
+ */
+void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h)
+{
+    int d;
+    for (d = 0; d < total; ++d) {
+        const box b = dets[d].bbox;
+        float left   = b.x - b.w / 2.;
+        float right  = b.x + b.w / 2.;
+        float top    = b.y - b.h / 2.;
+        float bottom = b.y + b.h / 2.;
+        int cls;
+
+        // Clip to the image rectangle.
+        left   = (left < 0)   ? 0 : left;
+        top    = (top < 0)    ? 0 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+
+        for (cls = 0; cls < classes; ++cls) {
+            const float p = dets[d].prob[cls];
+            if (!(p > 0)) continue;
+            // Class numbers in the output are 1-based.
+            fprintf(fp, "%d %d %f %f %f %f %f\n", id, cls + 1, p,
+                left, top, right, bottom);
+        }
+    }
+}
+
+/*
+ * Write the detections of one image to "<prefix>/<outfile>/data/<id>.txt",
+ * one line per class with a non-zero probability.
+ *
+ *   fps      unused here (kept for signature compatibility with callers)
+ *   id       base name of the image, used as the output file name
+ *   dets     detections, `total` entries
+ *   classes  number of class probabilities to inspect per detection; only the
+ *            first three have a name in kitti_ids
+ *   w, h     image size used to clip the boxes
+ *
+ * If the output file cannot be created a message is printed to stderr and the
+ * image is skipped.
+ */
+static void print_kitti_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h, char *outfile, char *prefix)
+{
+    char *kitti_ids[] = { "car", "pedestrian", "cyclist" };
+    const int known_ids = (int)(sizeof(kitti_ids) / sizeof(kitti_ids[0]));
+    FILE *fpd = 0;
+    char buffd[1024];
+    int i, j;
+
+    (void)fps;
+    snprintf(buffd, 1024, "%s/%s/data/%s.txt", prefix, outfile, id);
+
+    fpd = fopen(buffd, "w");
+    if (!fpd) {
+        // Writing to / closing a NULL stream would crash; report and skip.
+        fprintf(stderr, "Couldn't open file for writing: %s\n", buffd);
+        return;
+    }
+
+    // kitti_ids only names `known_ids` classes; never index past its end.
+    if (classes > known_ids) classes = known_ids;
+
+    for (i = 0; i < total; ++i)
+    {
+        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
+        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
+        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
+        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
+
+        if (xmin < 0) xmin = 0;
+        if (ymin < 0) ymin = 0;
+        if (xmax > w) xmax = w;
+        if (ymax > h) ymax = h;
+
+        for (j = 0; j < classes; ++j)
+        {
+            //if (dets[i].prob[j]) fprintf(fpd, "%s 0 0 0 %f %f %f %f -1 -1 -1 -1 0 0 0 %f\n", kitti_ids[j], xmin, ymin, xmax, ymax, dets[i].prob[j]);
+            if (dets[i].prob[j]) fprintf(fpd, "%s -1 -1 -10 %f %f %f %f -1 -1 -1 -1000 -1000 -1000 -10 %f\n", kitti_ids[j], xmin, ymin, xmax, ymax, dets[i].prob[j]);
+        }
+    }
+    fclose(fpd);
+}
+
+/*
+ * Remove every occurrence of the substring `a` from `buf`, in place.
+ *
+ * The string is scanned once from left to right; each non-overlapping match
+ * of `a` is dropped and the remaining characters are compacted towards the
+ * front.  A NULL argument or an empty pattern leaves `buf` untouched.
+ */
+static void eliminate_bdd(char *buf, char *a)
+{
+    size_t len;
+    char *src;
+    char *dst;
+
+    if (!buf || !a) return;
+    len = strlen(a);
+    if (len == 0) return;
+
+    src = buf;
+    dst = buf;
+    while (*src != '\0')
+    {
+        // strncmp stops at the terminator of `src`, so a pattern that would
+        // run past the end of the string simply does not match.
+        if (strncmp(src, a, len) == 0) src += len;
+        else *dst++ = *src++;
+    }
+    *dst = '\0';
+}
+
+/*
+ * Reduce `filename`, in place, to the image name used in BDD results: the part
+ * after the last '/' with every ".jpg" removed.
+ *
+ * A name without any '/' is processed as a whole (previously a NULL pointer
+ * was passed on).  The result is moved to the front of the buffer with
+ * memmove because source and destination overlap.
+ */
+static void get_bdd_image_id(char *filename)
+{
+    char *p = strrchr(filename, '/');
+    if (!p) p = filename;
+    eliminate_bdd(p, ".jpg");
+    eliminate_bdd(p, "/");
+    if (p != filename) memmove(filename, p, strlen(p) + 1);
+}
+
+/*
+ * Append the detections of one image to `fp` as BDD-style JSON records, one
+ * record (terminated by ",\n") per class with a non-zero probability.
+ *
+ *   fp          output stream
+ *   image_path  path of the image; NOTE: it is rewritten in place to the bare
+ *               image name by get_bdd_image_id()
+ *   dets        detections, `num_boxes` entries
+ *   classes     number of class probabilities to inspect per detection; only
+ *               the first ten have a name in bdd_ids
+ *   w, h        image size used to clip the boxes
+ *
+ * The box is written as [xmin, ymin, xmax, ymax] after clipping.
+ */
+static void print_bdd_detections(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h)
+{
+    char *bdd_ids[] = { "bike" , "bus" , "car" , "motor" ,"person", "rider", "traffic light", "traffic sign", "train", "truck" };
+    const int known_ids = (int)(sizeof(bdd_ids) / sizeof(bdd_ids[0]));
+    int i, j;
+
+    get_bdd_image_id(image_path);
+
+    // bdd_ids only names `known_ids` classes; never index past its end.
+    if (classes > known_ids) classes = known_ids;
+
+    for (i = 0; i < num_boxes; ++i)
+    {
+        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
+        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
+        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
+        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
+
+        if (xmin < 0) xmin = 0;
+        if (ymin < 0) ymin = 0;
+        if (xmax > w) xmax = w;
+        if (ymax > h) ymax = h;
+
+        float bx1 = xmin;
+        float by1 = ymin;
+        float bx2 = xmax;
+        float by2 = ymax;
+
+        for (j = 0; j < classes; ++j)
+        {
+            if (dets[i].prob[j])
+            {
+                fprintf(fp, "\t{\n\t\t\"name\":\"%s\",\n\t\t\"category\":\"%s\",\n\t\t\"bbox\":[%f, %f, %f, %f],\n\t\t\"score\":%f\n\t},\n", image_path, bdd_ids[j], bx1, by1, bx2, by2, dets[i].prob[j]);
+            }
+        }
+    }
+}
+
+/* Open a results file for writing; print a message and exit on failure so
+ * that a NULL stream is never written to or closed later on. */
+static FILE *open_results_file(const char *path)
+{
+    FILE *f = fopen(path, "w");
+    if (!f) {
+        fprintf(stderr, "Couldn't open results file for writing: %s\n", path);
+        exit(EXIT_FAILURE);
+    }
+    return f;
+}
+
+/*
+ * Run the detector over the validation list and write the detections in the
+ * format selected by the "eval" key of the data file.
+ *
+ *   datacfg     data configuration file (keys read here: valid, names,
+ *               results, map, eval)
+ *   cfgfile     network configuration file (parsed with batch = 1)
+ *   weightfile  optional weights file (may be NULL)
+ *   outfile     base name of the output; a format specific default is used
+ *               when NULL
+ *
+ * Output by "eval" value:
+ *   coco      <results>/<outfile>.json
+ *   bdd       <results>/<outfile>.json
+ *   kitti     <results>/<outfile>/data/<image>.txt
+ *   imagenet  <results>/<outfile>.txt
+ *   other     <results>/<outfile><class name>.txt, one file per class
+ *
+ * Images are loaded by up to four background threads while the previous group
+ * is being processed.
+ */
+void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
+{
+    int j;
+    list *options = read_data_cfg(datacfg);
+    char *valid_images = option_find_str(options, "valid", "data/train.list");
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    char *prefix = option_find_str(options, "results", "results");
+    char **names = get_labels(name_list);
+    char *mapf = option_find_str(options, "map", 0);
+    int *map = 0;
+    if (mapf) map = read_map(mapf);
+
+    network net = parse_network_cfg_custom(cfgfile, 1, 1);    // set batch=1
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    //set_batch_network(&net, 1);
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    srand(time(0));
+
+    list *plist = get_paths(valid_images);
+    char **paths = (char **)list_to_array(plist);
+
+    // Use the last YOLO / GAUSSIAN_YOLO / REGION layer as the detection layer,
+    // falling back to the very last layer of the network.
+    layer l = net.layers[net.n - 1];
+    int k;
+    for (k = 0; k < net.n; ++k) {
+        layer lk = net.layers[k];
+        if (lk.type == YOLO || lk.type == GAUSSIAN_YOLO || lk.type == REGION) {
+            l = lk;
+            printf(" Detection layer: %d - type = %d \n", k, l.type);
+        }
+    }
+    int classes = l.classes;
+
+    char buff[1024];
+    char *type = option_find_str(options, "eval", "voc");
+    FILE *fp = 0;
+    FILE **fps = 0;
+    int coco = 0;
+    int imagenet = 0;
+    int bdd = 0;
+    int kitti = 0;
+    long json_start = 0;    // file position right after the JSON "[" header
+
+    if (0 == strcmp(type, "coco")) {
+        if (!outfile) outfile = "coco_results";
+        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
+        fp = open_results_file(buff);
+        fprintf(fp, "[\n");
+        json_start = ftell(fp);
+        coco = 1;
+    }
+    else if (0 == strcmp(type, "bdd")) {
+        if (!outfile) outfile = "bdd_results";
+        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
+        fp = open_results_file(buff);
+        fprintf(fp, "[\n");
+        json_start = ftell(fp);
+        bdd = 1;
+    }
+    else if (0 == strcmp(type, "kitti")) {
+        char buff2[1024];
+        if (!outfile) outfile = "kitti_results";
+        printf("%s\n", outfile);
+        snprintf(buff, 1024, "%s/%s", prefix, outfile);
+        (void)make_directory(buff, 0777);
+        snprintf(buff2, 1024, "%s/%s/data", prefix, outfile);
+        (void)make_directory(buff2, 0777);
+        kitti = 1;
+    }
+    else if (0 == strcmp(type, "imagenet")) {
+        if (!outfile) outfile = "imagenet-detection";
+        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
+        fp = open_results_file(buff);
+        imagenet = 1;
+        // NOTE(review): assumes the network really has 200 classes; prob[] is
+        // indexed up to this value when printing - confirm against the cfg.
+        classes = 200;
+    }
+    else {
+        if (!outfile) outfile = "comp4_det_test_";
+        fps = (FILE**) xcalloc(classes, sizeof(FILE *));
+        for (j = 0; j < classes; ++j) {
+            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
+            fps[j] = open_results_file(buff);
+        }
+    }
+
+
+    int m = plist->size;
+    int i = 0;
+    int t;
+
+    float thresh = .001;
+    float nms = .6;
+
+    int nthreads = 4;
+    if (m < 4) nthreads = m;
+    image* val = (image*)xcalloc(nthreads, sizeof(image));
+    image* val_resized = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf_resized = (image*)xcalloc(nthreads, sizeof(image));
+    pthread_t* thr = (pthread_t*)xcalloc(nthreads, sizeof(pthread_t));
+
+    load_args args = { 0 };
+    args.w = net.w;
+    args.h = net.h;
+    args.c = net.c;
+    args.type = IMAGE_DATA;
+    const int letter_box = net.letter_box;
+    if (letter_box) args.type = LETTERBOX_DATA;
+
+    // Kick off loading of the first group of images.
+    for (t = 0; t < nthreads; ++t) {
+        args.path = paths[i + t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    time_t start = time(0);
+    for (i = nthreads; i < m + nthreads; i += nthreads) {
+        fprintf(stderr, "%d\n", i);
+        // Collect the group that finished loading ...
+        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        // ... start loading the next group ...
+        for (t = 0; t < nthreads && i + t < m; ++t) {
+            args.path = paths[i + t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        // ... and run detection on the collected group meanwhile.
+        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
+            char *path = paths[i + t - nthreads];
+            char *id = basecfg(path);
+            float *X = val_resized[t].data;
+            network_predict(net, X);
+            int w = val[t].w;
+            int h = val[t].h;
+            int nboxes = 0;
+            detection *dets = get_network_boxes(&net, w, h, thresh, .5, map, 0, &nboxes, letter_box);
+            if (nms) {
+                if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms);
+                else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+            }
+
+            if (coco) {
+                print_cocos(fp, path, dets, nboxes, classes, w, h);
+            }
+            else if (imagenet) {
+                print_imagenet_detections(fp, i + t - nthreads + 1, dets, nboxes, classes, w, h);
+            }
+            else if (bdd) {
+                print_bdd_detections(fp, path, dets, nboxes, classes, w, h);
+            }
+            else if (kitti) {
+                print_kitti_detections(fps, id, dets, nboxes, classes, w, h, outfile, prefix);
+            }
+            else {
+                print_detector_detections(fps, id, dets, nboxes, classes, w, h);
+            }
+
+            free_detections(dets, nboxes);
+            free(id);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+    if (fps) {
+        for (j = 0; j < classes; ++j) {
+            if (fps[j]) fclose(fps[j]);
+        }
+        free(fps);
+    }
+    if (coco || bdd) {
+        // Every record ends with ",\n": step back over it so that the array
+        // can be closed.  Skip this when no record was written, otherwise the
+        // opening "[" itself would be overwritten.
+        if (ftell(fp) > json_start) {
+#ifdef WIN32
+            fseek(fp, -3, SEEK_CUR);
+#else
+            fseek(fp, -2, SEEK_CUR);
+#endif
+        }
+        fprintf(fp, "\n]\n");
+    }
+
+    // Single close for all single-file formats (coco, bdd, imagenet).
+    if (fp) fclose(fp);
+
+    if (val) free(val);
+    if (val_resized) free(val_resized);
+    if (thr) free(thr);
+    if (buf) free(buf);
+    if (buf_resized) free(buf_resized);
+
+    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)time(0) - start);
+}
+
+/*
+ * Report proposal recall and average best-IoU of a network over the images
+ * listed under "valid" in the data config.
+ *
+ * datacfg    - data config file; its "valid" entry names the image list
+ *              (falls back to "data/train.txt").
+ * cfgfile    - network definition, parsed with batch forced to 1.
+ * weightfile - optional weights to load; may be NULL.
+ *
+ * After every image one progress line is written to stderr: image index,
+ * cumulative matched / total truth boxes, proposals per image, mean best IoU
+ * and recall (a truth box counts as recalled when its best IoU exceeds .5).
+ */
+void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg_custom(cfgfile, 1, 1);    // set batch=1
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    //set_batch_network(&net, 1);
+    fuse_conv_batchnorm(net);
+    srand(time(0));
+
+    //list *plist = get_paths("data/coco_val_5k.list");
+    list *options = read_data_cfg(datacfg);
+    char *valid_images = option_find_str(options, "valid", "data/train.txt");
+    list *plist = get_paths(valid_images);
+    char **paths = (char **)list_to_array(plist);
+
+    //layer l = net.layers[net.n - 1];
+
+    int j, k;
+
+    int m = plist->size;
+    int i = 0;
+
+    float thresh = .001;        // minimum objectness for a box to count as a proposal
+    float iou_thresh = .5;      // minimum best IoU for a truth box to count as recalled
+    float nms = .4;
+
+    int total = 0;              // truth boxes seen so far
+    int correct = 0;            // truth boxes whose best IoU exceeded iou_thresh
+    int proposals = 0;          // detections above thresh seen so far
+    float avg_iou = 0;          // running sum of best IoU per truth box
+
+    for (i = 0; i < m; ++i) {
+        char *path = paths[i];
+        image orig = load_image(path, 0, 0, net.c);
+        image sized = resize_image(orig, net.w, net.h);
+        char *id = basecfg(path);
+        network_predict(net, sized.data);
+        int nboxes = 0;
+        int letterbox = 0;
+        detection *dets = get_network_boxes(&net, sized.w, sized.h, thresh, .5, 0, 1, &nboxes, letterbox);
+        if (nms) do_nms_obj(dets, nboxes, 1, nms);
+
+        char labelpath[4096];
+        replace_image_to_label(path, labelpath);
+
+        int num_labels = 0;
+        box_label *truth = read_boxes(labelpath, &num_labels);
+        for (k = 0; k < nboxes; ++k) {
+            if (dets[k].objectness > thresh) {
+                ++proposals;
+            }
+        }
+        // For every truth box keep the best IoU reached by any proposal.
+        for (j = 0; j < num_labels; ++j) {
+            ++total;
+            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
+            float best_iou = 0;
+            for (k = 0; k < nboxes; ++k) {
+                float iou = box_iou(dets[k].bbox, t);
+                if (dets[k].objectness > thresh && iou > best_iou) {
+                    best_iou = iou;
+                }
+            }
+            avg_iou += best_iou;
+            if (best_iou > iou_thresh) {
+                ++correct;
+            }
+        }
+        //fprintf(stderr, " %s - %s - ", paths[i], labelpath);
+        // total stays 0 until the first labelled image: report 0 instead of dividing by zero.
+        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals / (i + 1),
+            total ? avg_iou * 100 / total : 0.f, total ? 100.*correct / total : 0.);
+
+        // The detections are allocated per image and were previously never released.
+        free_detections(dets, nboxes);
+        free(truth);
+        free(id);
+        free_image(orig);
+        free_image(sized);
+    }
+
+    // Release everything this function created itself.
+    free(paths);
+    free_list_contents(plist);
+    free_list(plist);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_network(net);
+}
+
+// One scored detection collected by validate_detector_map() for the PR curve.
+typedef struct {
+    box b;                  // predicted bounding box
+    float p;                // probability of class_id for this box
+    int class_id;           // class the probability refers to
+    int image_index;        // index of the source image in the validation list
+    int truth_flag;         // 1 when the box was matched to a ground-truth box, else 0
+    int unique_truth_index; // index of the matched truth box counted across all images, -1 if none
+} box_prob;
+
+// qsort() callback: orders box_prob records by descending probability `p`.
+int detections_comparator(const void *pa, const void *pb)
+{
+    const box_prob *lhs = (const box_prob *)pa;
+    const box_prob *rhs = (const box_prob *)pb;
+    const float delta = lhs->p - rhs->p;
+
+    // A lower probability must sort later: negative delta -> 1, positive -> -1, equal -> 0.
+    return (delta < 0) - (delta > 0);
+}
+
+/*
+ * Evaluate a detector on the validation set and return its mean average
+ * precision (mAP) as a fraction (it is printed both as-is and multiplied by 100).
+ *
+ * datacfg             - data config file; the "valid", "difficult", "names"
+ *                       (and, with existing_net, "train") entries are read.
+ * cfgfile, weightfile - network definition / weights, loaded only when
+ *                       existing_net is NULL.
+ * thresh_calc_avg_iou - confidence threshold for the TP / FP / average IoU /
+ *                       precision / recall / F1 summary.
+ * iou_thresh          - minimum IoU for a detection to match a truth box of
+ *                       the same class.
+ * map_points          - 0 selects area-under-curve AP, any other value the
+ *                       number of equally spaced recall points (11, 101, ...).
+ * letter_box          - ignored on input: overwritten with net.letter_box.
+ * existing_net        - when non-NULL this network is evaluated instead of
+ *                       loading one; remember_/restore_network_recurrent_state
+ *                       are called around the evaluation.
+ *
+ * Progress and the final report are printed to stdout / stderr.
+ */
+float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float thresh_calc_avg_iou, const float iou_thresh, const int map_points, int letter_box, network *existing_net)
+{
+    int j;
+    list *options = read_data_cfg(datacfg);
+    char *valid_images = option_find_str(options, "valid", "data/train.txt");
+    char *difficult_valid_images = option_find_str(options, "difficult", NULL);
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    int names_size = 0;
+    char **names = get_labels_custom(name_list, &names_size); //get_labels(name_list);
+    //char *mapf = option_find_str(options, "map", 0);
+    //int *map = 0;
+    //if (mapf) map = read_map(mapf);
+    FILE* reinforcement_fd = NULL;
+
+    network net;
+    //int initial_batch;
+    if (existing_net) {
+        // Evaluating a network that is being trained: fall back to the training list when "valid" is absent.
+        char *train_images = option_find_str(options, "train", "data/train.txt");
+        valid_images = option_find_str(options, "valid", train_images);
+        net = *existing_net;
+        remember_network_recurrent_state(*existing_net);
+        free_network_recurrent_state(*existing_net);
+    }
+    else {
+        net = parse_network_cfg_custom(cfgfile, 1, 1);    // set batch=1
+        if (weightfile) {
+            load_weights(&net, weightfile);
+        }
+        //set_batch_network(&net, 1);
+        fuse_conv_batchnorm(net);
+        calculate_binary_weights(net);
+    }
+    if (net.layers[net.n - 1].classes != names_size) {
+        printf("\n Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n",
+            name_list, names_size, net.layers[net.n - 1].classes, cfgfile);
+        error("Error!", DARKNET_LOC);
+    }
+    srand(time(0));
+    printf("\n calculation mAP (mean average precision)...\n");
+
+    list *plist = get_paths(valid_images);
+    char **paths = (char **)list_to_array(plist);
+
+    list *plist_dif = NULL;
+    char **paths_dif = NULL;
+    if (difficult_valid_images) {
+        plist_dif = get_paths(difficult_valid_images);
+        paths_dif = (char **)list_to_array(plist_dif);
+    }
+
+    // Use the last YOLO / GAUSSIAN_YOLO / REGION layer as the detection layer (defaults to the final layer).
+    layer l = net.layers[net.n - 1];
+    int k;
+    for (k = 0; k < net.n; ++k) {
+        layer lk = net.layers[k];
+        if (lk.type == YOLO || lk.type == GAUSSIAN_YOLO || lk.type == REGION) {
+            l = lk;
+            printf(" Detection layer: %d - type = %d \n", k, l.type);
+        }
+    }
+    int classes = l.classes;
+
+    int m = plist->size;
+    int i = 0;
+    int t;
+
+    const float thresh = .005;
+    const float nms = .45;
+    //const float iou_thresh = 0.5;
+
+    // Images are loaded by up to 4 background threads, one batch ahead of inference.
+    int nthreads = 4;
+    if (m < 4) nthreads = m;
+    image* val = (image*)xcalloc(nthreads, sizeof(image));
+    image* val_resized = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf_resized = (image*)xcalloc(nthreads, sizeof(image));
+    pthread_t* thr = (pthread_t*)xcalloc(nthreads, sizeof(pthread_t));
+
+    load_args args = { 0 };
+    args.w = net.w;
+    args.h = net.h;
+    args.c = net.c;
+    letter_box = net.letter_box;
+    if (letter_box) args.type = LETTERBOX_DATA;
+    else args.type = IMAGE_DATA;
+
+    //const float thresh_calc_avg_iou = 0.24;
+    float avg_iou = 0;
+    int tp_for_thresh = 0;
+    int fp_for_thresh = 0;
+
+    // Growing array of every (box, class) pair with a positive probability, over all images.
+    box_prob* detections = (box_prob*)xcalloc(1, sizeof(box_prob));
+    int detections_count = 0;
+    int unique_truth_count = 0;
+
+    int* truth_classes_count = (int*)xcalloc(classes, sizeof(int));
+
+    // For multi-class precision and recall computation
+    float *avg_iou_per_class = (float*)xcalloc(classes, sizeof(float));
+    int *tp_for_thresh_per_class = (int*)xcalloc(classes, sizeof(int));
+    int *fp_for_thresh_per_class = (int*)xcalloc(classes, sizeof(int));
+
+    // Kick off loading of the first batch.
+    for (t = 0; t < nthreads; ++t) {
+        args.path = paths[i + t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    time_t start = time(0);
+    for (i = nthreads; i < m + nthreads; i += nthreads) {
+        fprintf(stderr, "\r%d", i);
+        // Collect the batch that finished loading ...
+        for (t = 0; t < nthreads && (i + t - nthreads) < m; ++t) {
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        // ... start loading the next one ...
+        for (t = 0; t < nthreads && (i + t) < m; ++t) {
+            args.path = paths[i + t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        // ... and run inference on the collected batch meanwhile.
+        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
+            const int image_index = i + t - nthreads;
+            char *path = paths[image_index];
+            char *id = basecfg(path);
+            float *X = val_resized[t].data;
+            network_predict(net, X);
+
+            int nboxes = 0;
+            float hier_thresh = 0;
+            detection *dets;
+            if (args.type == LETTERBOX_DATA) {
+                dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letter_box);
+            }
+            else {
+                dets = get_network_boxes(&net, 1, 1, thresh, hier_thresh, 0, 0, &nboxes, letter_box);
+            }
+            //detection *dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letter_box); // for letter_box=1
+            if (nms) {
+                if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms);
+                else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+            }
+
+            //if (l.embedding_size) set_track_id(dets, nboxes, thresh, l.sim_thresh, l.track_ciou_norm, l.track_history_size, l.dets_for_track, l.dets_for_show);
+
+            char labelpath[4096];
+            replace_image_to_label(path, labelpath);
+            int num_labels = 0;
+            box_label *truth = read_boxes(labelpath, &num_labels);
+            int j;  // shadows the function-level j inside this image's scope
+            for (j = 0; j < num_labels; ++j) {
+                truth_classes_count[truth[j].id]++;
+            }
+
+            // difficult
+            box_label *truth_dif = NULL;
+            int num_labels_dif = 0;
+            // The "difficult" list may be shorter than the validation list: never index past its end.
+            if (paths_dif && image_index < plist_dif->size)
+            {
+                char *path_dif = paths_dif[image_index];
+
+                char labelpath_dif[4096];
+                replace_image_to_label(path_dif, labelpath_dif);
+
+                truth_dif = read_boxes(labelpath_dif, &num_labels_dif);
+            }
+
+            // First entry of `detections` that belongs to the current image.
+            const int checkpoint_detections_count = detections_count;
+
+            int i;  // shadows the batch counter; image_index was captured above
+            for (i = 0; i < nboxes; ++i) {
+
+                int class_id;
+                for (class_id = 0; class_id < classes; ++class_id) {
+                    float prob = dets[i].prob[class_id];
+                    if (prob > 0) {
+                        detections_count++;
+                        detections = (box_prob*)xrealloc(detections, detections_count * sizeof(box_prob));
+                        detections[detections_count - 1].b = dets[i].bbox;
+                        detections[detections_count - 1].p = prob;
+                        detections[detections_count - 1].image_index = image_index;
+                        detections[detections_count - 1].class_id = class_id;
+                        detections[detections_count - 1].truth_flag = 0;
+                        detections[detections_count - 1].unique_truth_index = -1;
+
+                        // Find the same-class truth box with the highest IoU above iou_thresh.
+                        int truth_index = -1;
+                        float max_iou = 0;
+                        for (j = 0; j < num_labels; ++j)
+                        {
+                            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
+                            //printf(" IoU = %f, prob = %f, class_id = %d, truth[j].id = %d \n",
+                            //    box_iou(dets[i].bbox, t), prob, class_id, truth[j].id);
+                            float current_iou = box_iou(dets[i].bbox, t);
+                            if (current_iou > iou_thresh && class_id == truth[j].id) {
+                                if (current_iou > max_iou) {
+                                    max_iou = current_iou;
+                                    truth_index = unique_truth_count + j;
+                                }
+                            }
+                        }
+
+                        // best IoU
+                        if (truth_index > -1) {
+                            detections[detections_count - 1].truth_flag = 1;
+                            detections[detections_count - 1].unique_truth_index = truth_index;
+                        }
+                        else {
+                            // if object is difficult then remove detection
+                            for (j = 0; j < num_labels_dif; ++j) {
+                                box t = { truth_dif[j].x, truth_dif[j].y, truth_dif[j].w, truth_dif[j].h };
+                                float current_iou = box_iou(dets[i].bbox, t);
+                                if (current_iou > iou_thresh && class_id == truth_dif[j].id) {
+                                    --detections_count;
+                                    break;
+                                }
+                            }
+                        }
+
+                        // calc avg IoU, true-positives, false-positives for required Threshold
+                        if (prob > thresh_calc_avg_iou) {
+                            // A truth box already claimed by an earlier detection of this image counts only once.
+                            int z, found = 0;
+                            for (z = checkpoint_detections_count; z < detections_count - 1; ++z) {
+                                if (detections[z].unique_truth_index == truth_index) {
+                                    found = 1; break;
+                                }
+                            }
+
+                            if (truth_index > -1 && found == 0) {
+                                avg_iou += max_iou;
+                                ++tp_for_thresh;
+                                avg_iou_per_class[class_id] += max_iou;
+                                tp_for_thresh_per_class[class_id]++;
+                            }
+                            else{
+                                fp_for_thresh++;
+                                fp_for_thresh_per_class[class_id]++;
+                            }
+                        }
+                    }
+                }
+            }
+
+            unique_truth_count += num_labels;
+
+            //static int previous_errors = 0;
+            //int total_errors = fp_for_thresh + (unique_truth_count - tp_for_thresh);
+            //int errors_in_this_image = total_errors - previous_errors;
+            //previous_errors = total_errors;
+            //if(reinforcement_fd == NULL) reinforcement_fd = fopen("reinforcement.txt", "wb");
+            //char buff[1000];
+            //sprintf(buff, "%s\n", path);
+            //if(errors_in_this_image > 0) fwrite(buff, sizeof(char), strlen(buff), reinforcement_fd);
+
+            free_detections(dets, nboxes);
+            free(truth);
+            free(truth_dif);
+            free(id);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+
+    //for (t = 0; t < nthreads; ++t) {
+    //    pthread_join(thr[t], 0);
+    //}
+
+    if ((tp_for_thresh + fp_for_thresh) > 0)
+        avg_iou = avg_iou / (tp_for_thresh + fp_for_thresh);
+
+    int class_id;
+    for(class_id = 0; class_id < classes; class_id++){
+        if ((tp_for_thresh_per_class[class_id] + fp_for_thresh_per_class[class_id]) > 0)
+            avg_iou_per_class[class_id] = avg_iou_per_class[class_id] / (tp_for_thresh_per_class[class_id] + fp_for_thresh_per_class[class_id]);
+    }
+
+    // SORT(detections)
+    qsort(detections, detections_count, sizeof(box_prob), detections_comparator);
+
+    typedef struct {
+        double prob;
+        double precision;
+        double recall;
+        int tp, fp, fn;
+    } pr_t;
+
+    // for PR-curve
+    pr_t** pr = (pr_t**)xcalloc(classes, sizeof(pr_t*));
+    for (i = 0; i < classes; ++i) {
+        pr[i] = (pr_t*)xcalloc(detections_count, sizeof(pr_t));
+    }
+    printf("\n detections_count = %d, unique_truth_count = %d  \n", detections_count, unique_truth_count);
+
+
+    int* detection_per_class_count = (int*)xcalloc(classes, sizeof(int));
+    for (j = 0; j < detections_count; ++j) {
+        detection_per_class_count[detections[j].class_id]++;
+    }
+
+    // One flag per truth box: set once the box has been credited to a detection.
+    int* truth_flags = (int*)xcalloc(unique_truth_count, sizeof(int));
+
+    // Walk the detections from most to least confident, accumulating TP / FP per class.
+    int rank;
+    for (rank = 0; rank < detections_count; ++rank) {
+        if (rank % 100 == 0)
+            printf(" rank = %d of ranks = %d \r", rank, detections_count);
+
+        if (rank > 0) {
+            int class_id;
+            for (class_id = 0; class_id < classes; ++class_id) {
+                pr[class_id][rank].tp = pr[class_id][rank - 1].tp;
+                pr[class_id][rank].fp = pr[class_id][rank - 1].fp;
+            }
+        }
+
+        box_prob d = detections[rank];
+        pr[d.class_id][rank].prob = d.p;
+        // if (detected && isn't detected before)
+        if (d.truth_flag == 1) {
+            if (truth_flags[d.unique_truth_index] == 0)
+            {
+                truth_flags[d.unique_truth_index] = 1;
+                pr[d.class_id][rank].tp++;    // true-positive
+            } else
+                pr[d.class_id][rank].fp++;
+        }
+        else {
+            pr[d.class_id][rank].fp++;    // false-positive
+        }
+
+        for (i = 0; i < classes; ++i)
+        {
+            const int tp = pr[i][rank].tp;
+            const int fp = pr[i][rank].fp;
+            const int fn = truth_classes_count[i] - tp;    // false-negative = objects - true-positive
+            pr[i][rank].fn = fn;
+
+            if ((tp + fp) > 0) pr[i][rank].precision = (double)tp / (double)(tp + fp);
+            else pr[i][rank].precision = 0;
+
+            if ((tp + fn) > 0) pr[i][rank].recall = (double)tp / (double)(tp + fn);
+            else pr[i][rank].recall = 0;
+
+            if (rank == (detections_count - 1) && detection_per_class_count[i] != (tp + fp)) {    // check for last rank
+                    printf(" class_id: %d - detections = %d, tp+fp = %d, tp = %d, fp = %d \n", i, detection_per_class_count[i], tp+fp, tp, fp);
+            }
+        }
+    }
+
+    free(truth_flags);
+
+    double mean_average_precision = 0;
+
+    for (i = 0; i < classes; ++i) {
+        double avg_precision = 0;
+
+        // MS COCO - uses 101-Recall-points on PR-chart.
+        // PascalVOC2007 - uses 11-Recall-points on PR-chart.
+        // PascalVOC2010-2012 - uses Area-Under-Curve on PR-chart.
+        // ImageNet - uses Area-Under-Curve on PR-chart.
+
+        // correct mAP calculation: ImageNet, PascalVOC 2010-2012
+        if (map_points == 0)
+        {
+            // Without any detection there is no PR curve: leave AP at 0 instead of reading pr[i][-1].
+            if (detections_count > 0) {
+                double last_recall = pr[i][detections_count - 1].recall;
+                double last_precision = pr[i][detections_count - 1].precision;
+                for (rank = detections_count - 2; rank >= 0; --rank)
+                {
+                    double delta_recall = last_recall - pr[i][rank].recall;
+                    last_recall = pr[i][rank].recall;
+
+                    if (pr[i][rank].precision > last_precision) {
+                        last_precision = pr[i][rank].precision;
+                    }
+
+                    avg_precision += delta_recall * last_precision;
+                }
+                //add remaining area of PR curve when recall isn't 0 at rank-1
+                double delta_recall = last_recall - 0;
+                avg_precision += delta_recall * last_precision;
+            }
+        }
+        // MSCOCO - 101 Recall-points, PascalVOC - 11 Recall-points
+        else
+        {
+            int point;
+            for (point = 0; point < map_points; ++point) {
+                double cur_recall = point * 1.0 / (map_points-1);
+                double cur_precision = 0;
+                double cur_prob = 0;
+                // Best precision among all ranks whose recall reaches this recall point.
+                for (rank = 0; rank < detections_count; ++rank)
+                {
+                    if (pr[i][rank].recall >= cur_recall) {    // > or >=
+                        if (pr[i][rank].precision > cur_precision) {
+                            cur_precision = pr[i][rank].precision;
+                            cur_prob = pr[i][rank].prob;
+                        }
+                    }
+                }
+                //printf("class_id = %d, point = %d, cur_prob = %.4f, cur_recall = %.4f, cur_precision = %.4f \n", i, point, cur_prob, cur_recall, cur_precision);
+
+                avg_precision += cur_precision;
+            }
+            avg_precision = avg_precision / map_points;
+        }
+
+        printf("class_id = %d, name = %s, ap = %2.2f%%   \t (TP = %d, FP = %d) \n",
+            i, names[i], avg_precision * 100, tp_for_thresh_per_class[i], fp_for_thresh_per_class[i]);
+
+        float class_precision = (float)tp_for_thresh_per_class[i] / ((float)tp_for_thresh_per_class[i] + (float)fp_for_thresh_per_class[i]);
+        float class_recall = (float)tp_for_thresh_per_class[i] / ((float)tp_for_thresh_per_class[i] + (float)(truth_classes_count[i] - tp_for_thresh_per_class[i]));
+        //printf("Precision = %1.2f, Recall = %1.2f, avg IOU = %2.2f%% \n\n", class_precision, class_recall, avg_iou_per_class[i]);
+
+        mean_average_precision += avg_precision;
+    }
+
+    // Guard the ratios: with nothing above the threshold, or no truth boxes at all,
+    // the denominators are zero and the summary would otherwise print NaN.
+    const float cur_precision = ((tp_for_thresh + fp_for_thresh) > 0)
+        ? (float)tp_for_thresh / ((float)tp_for_thresh + (float)fp_for_thresh) : 0;
+    const float cur_recall = (unique_truth_count > 0)
+        ? (float)tp_for_thresh / ((float)tp_for_thresh + (float)(unique_truth_count - tp_for_thresh)) : 0;
+    const float f1_score = ((cur_precision + cur_recall) > 0)
+        ? 2.F * cur_precision * cur_recall / (cur_precision + cur_recall) : 0;
+    printf("\n for conf_thresh = %1.2f, precision = %1.2f, recall = %1.2f, F1-score = %1.2f \n",
+        thresh_calc_avg_iou, cur_precision, cur_recall, f1_score);
+
+    printf(" for conf_thresh = %0.2f, TP = %d, FP = %d, FN = %d, average IoU = %2.2f %% \n",
+        thresh_calc_avg_iou, tp_for_thresh, fp_for_thresh, unique_truth_count - tp_for_thresh, avg_iou * 100);
+
+    mean_average_precision = mean_average_precision / classes;
+    printf("\n IoU threshold = %2.0f %%, ", iou_thresh * 100);
+    if (map_points) printf("used %d Recall-points \n", map_points);
+    else printf("used Area-Under-Curve for each unique Recall \n");
+
+    printf(" mean average precision (mAP@%0.2f) = %f, or %2.2f %% \n", iou_thresh, mean_average_precision, mean_average_precision * 100);
+
+    for (i = 0; i < classes; ++i) {
+        free(pr[i]);
+    }
+    free(pr);
+    free(detections);
+    free(truth_classes_count);
+    free(detection_per_class_count);
+    free(paths);
+    free(paths_dif);
+    free_list_contents(plist);
+    free_list(plist);
+    if (plist_dif) {
+        free_list_contents(plist_dif);
+        free_list(plist_dif);
+    }
+    free(avg_iou_per_class);
+    free(tp_for_thresh_per_class);
+    free(fp_for_thresh_per_class);
+
+    fprintf(stderr, "Total Detection Time: %d Seconds\n", (int)(time(0) - start));
+    printf("\nSet -points flag:\n");
+    printf(" `-points 101` for MS COCO \n");
+    printf(" `-points 11` for PascalVOC 2007 (uncomment `difficult` in voc.data) \n");
+    printf(" `-points 0` (AUC) for ImageNet, PascalVOC 2010-2012, your custom dataset\n");
+    if (reinforcement_fd != NULL) fclose(reinforcement_fd);
+
+    // free memory
+    free_ptrs((void**)names, net.layers[net.n - 1].classes);
+    free_list_contents_kvp(options);
+    free_list(options);
+
+    if (existing_net) {
+        //set_batch_network(&net, initial_batch);
+        //free_network_recurrent_state(*existing_net);
+        restore_network_recurrent_state(*existing_net);
+        //randomize_network_recurrent_state(*existing_net);
+    }
+    else {
+        // The network was created by this call, so it is released here.
+        free_network(net);
+    }
+    if (val) free(val);
+    if (val_resized) free(val_resized);
+    if (thr) free(thr);
+    if (buf) free(buf);
+    if (buf_resized) free(buf_resized);
+
+    return mean_average_precision;
+}
+
+// Width / height pair of one anchor box; ordered by area in anchors_comparator().
+typedef struct {
+    float w, h;
+} anchors_t;
+
+// qsort() callback: orders anchors_t records by ascending area (w * h).
+int anchors_comparator(const void *pa, const void *pb)
+{
+    const anchors_t *first = (const anchors_t *)pa;
+    const anchors_t *second = (const anchors_t *)pb;
+    const float delta = second->w * second->h - first->w * first->h;
+
+    // delta < 0 means `first` has the larger area and must sort after `second`.
+    return (delta < 0) - (delta > 0);
+}
+
+// Comparator over pointers to {width, height} float rows: ascending by area
+// (row[0] * row[1]). Each argument points at a row pointer, not at the row itself.
+int anchors_data_comparator(const float **pa, const float **pb)
+{
+    const float *first = *pa;
+    const float *second = *pb;
+    const float delta = second[0] * second[1] - first[0] * first[1];
+
+    // delta < 0 means `first` has the larger area and must sort after `second`.
+    return (delta < 0) - (delta > 0);
+}
+
+
+/*
+ * Estimate anchor boxes for a dataset by clustering the sizes of its
+ * ground-truth boxes with k-means.
+ *
+ * datacfg         - data config file; "train" (image list) and "classes" are read.
+ * num_of_clusters - number of anchors to compute.
+ * width, height   - network input size; relative label sizes are scaled by it.
+ *                   Negative values print the usage text and return.
+ * show            - when non-zero and built with OPENCV, show_acnhors() is called.
+ *
+ * Side effects: writes "counters_per_class.txt" and "anchors.txt" in the
+ * working directory, appends out-of-range labels to "bad_label.list", and
+ * prints progress, per-class counts, the average IoU and the anchors to stdout.
+ */
+void calc_anchors(char *datacfg, int num_of_clusters, int width, int height, int show)
+{
+    printf("\n num_of_clusters = %d, width = %d, height = %d \n", num_of_clusters, width, height);
+    if (width < 0 || height < 0) {
+        printf("Usage: darknet detector calc_anchors data/voc.data -num_of_clusters 9 -width 416 -height 416 \n");
+        printf("Error: set width and height \n");
+        return;
+    }
+
+    //float pointsdata[] = { 1,1, 2,2, 6,6, 5,5, 10,10 };
+    // Flat list of (width, height) pairs in network pixels, grown as boxes are read.
+    float* rel_width_height_array = (float*)xcalloc(1000, sizeof(float));
+
+
+    list *options = read_data_cfg(datacfg);
+    char *train_images = option_find_str(options, "train", "data/train.list");
+    list *plist = get_paths(train_images);
+    int number_of_images = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    int classes = option_find_int(options, "classes", 1);
+    int* counter_per_class = (int*)xcalloc(classes, sizeof(int));
+
+    srand(time(0));
+    int number_of_boxes = 0;
+    printf(" read labels from %d images \n", number_of_images);
+
+    int i, j;
+    for (i = 0; i < number_of_images; ++i) {
+        char *path = paths[i];
+        char labelpath[4096];
+        replace_image_to_label(path, labelpath);
+
+        int num_labels = 0;
+        box_label *truth = read_boxes(labelpath, &num_labels);
+        //printf(" new path: %s \n", labelpath);
+        for (j = 0; j < num_labels; ++j)
+        {
+            if (truth[j].x > 1 || truth[j].x <= 0 || truth[j].y > 1 || truth[j].y <= 0 ||
+                truth[j].w > 1 || truth[j].w <= 0 || truth[j].h > 1 || truth[j].h <= 0)
+            {
+                printf("\n\nWrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f \n",
+                    labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
+                // Append to the log directly. Building an `echo ... >> file` command for
+                // system() would let shell metacharacters in a label path be executed.
+                FILE *bad_labels = fopen("bad_label.list", "a");
+                if (bad_labels) {
+                    fprintf(bad_labels, "Wrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f\n",
+                        labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
+                    fclose(bad_labels);
+                }
+            }
+            if (truth[j].id >= classes) {
+                // A label uses a class id beyond "classes": grow the counters and zero the
+                // new tail (assumes xrealloc, like realloc, leaves it uninitialized).
+                const int old_classes = classes;
+                int c;
+                classes = truth[j].id + 1;
+                counter_per_class = (int*)xrealloc(counter_per_class, classes * sizeof(int));
+                for (c = old_classes; c < classes; ++c) counter_per_class[c] = 0;
+            }
+            counter_per_class[truth[j].id]++;
+
+            number_of_boxes++;
+            rel_width_height_array = (float*)xrealloc(rel_width_height_array, 2 * number_of_boxes * sizeof(float));
+
+            rel_width_height_array[number_of_boxes * 2 - 2] = truth[j].w * width;
+            rel_width_height_array[number_of_boxes * 2 - 1] = truth[j].h * height;
+            printf("\r loaded \t image: %d \t box: %d", i + 1, number_of_boxes);
+        }
+        free(truth);
+    }
+    printf("\n all loaded. \n");
+
+    // Nothing to cluster: stop before k-means runs on an empty matrix and the
+    // average IoU below becomes 0 / 0.
+    if (number_of_boxes == 0) {
+        printf(" Error: no labeled boxes were found, anchors can't be calculated \n");
+        free(rel_width_height_array);
+        free(counter_per_class);
+        free(paths);
+        free_list_contents(plist);
+        free_list(plist);
+        free_list_contents_kvp(options);
+        free_list(options);
+        return;
+    }
+
+    printf("\n calculating k-means++ ...");
+
+    matrix boxes_data;
+    model anchors_data;
+    boxes_data = make_matrix(number_of_boxes, 2);
+
+    printf("\n");
+    for (i = 0; i < number_of_boxes; ++i) {
+        boxes_data.vals[i][0] = rel_width_height_array[i * 2];
+        boxes_data.vals[i][1] = rel_width_height_array[i * 2 + 1];
+        //if (w > 410 || h > 410) printf("i:%d,  w = %f, h = %f \n", i, w, h);
+    }
+
+    // Is used: distance(box, centroid) = 1 - IoU(box, centroid)
+
+    // K-means
+    anchors_data = do_kmeans(boxes_data, num_of_clusters);
+
+    // The array being sorted holds row pointers (the comparator dereferences them), so the
+    // element size is the pointer size; 2 * sizeof(float) only matched on 64-bit targets.
+    qsort((void*)anchors_data.centers.vals, num_of_clusters, sizeof(anchors_data.centers.vals[0]), (__compar_fn_t)anchors_data_comparator);
+
+    //gen_anchors.py = 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66
+    //float orig_anch[] = { 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66 };
+
+    // Average, over all boxes, of the IoU with the closest anchor (boxes and anchors
+    // are compared as if they shared a corner, so only width and height matter).
+    printf("\n");
+    float avg_iou = 0;
+    for (i = 0; i < number_of_boxes; ++i) {
+        float box_w = rel_width_height_array[i * 2]; //points->data.fl[i * 2];
+        float box_h = rel_width_height_array[i * 2 + 1]; //points->data.fl[i * 2 + 1];
+                                                         //int cluster_idx = labels->data.i[i];
+        int cluster_idx = 0;
+        float min_dist = FLT_MAX;
+        float best_iou = 0;
+        for (j = 0; j < num_of_clusters; ++j) {
+            float anchor_w = anchors_data.centers.vals[j][0];   // centers->data.fl[j * 2];
+            float anchor_h = anchors_data.centers.vals[j][1];   // centers->data.fl[j * 2 + 1];
+            float min_w = (box_w < anchor_w) ? box_w : anchor_w;
+            float min_h = (box_h < anchor_h) ? box_h : anchor_h;
+            float box_intersect = min_w*min_h;
+            float box_union = box_w*box_h + anchor_w*anchor_h - box_intersect;
+            float iou = box_intersect / box_union;
+            float distance = 1 - iou;
+            if (distance < min_dist) {
+              min_dist = distance;
+              cluster_idx = j;
+              best_iou = iou;
+            }
+        }
+
+        float anchor_w = anchors_data.centers.vals[cluster_idx][0]; //centers->data.fl[cluster_idx * 2];
+        float anchor_h = anchors_data.centers.vals[cluster_idx][1]; //centers->data.fl[cluster_idx * 2 + 1];
+        if (best_iou > 1 || best_iou < 0) { // || box_w > width || box_h > height) {
+            printf(" Wrong label: i = %d, box_w = %f, box_h = %f, anchor_w = %f, anchor_h = %f, iou = %f \n",
+                i, box_w, box_h, anchor_w, anchor_h, best_iou);
+        }
+        else avg_iou += best_iou;
+    }
+
+    char buff[1024];
+    FILE* fwc = fopen("counters_per_class.txt", "wb");
+    if (fwc) {
+        sprintf(buff, "counters_per_class = ");
+        printf("\n%s", buff);
+        fwrite(buff, sizeof(char), strlen(buff), fwc);
+        for (i = 0; i < classes; ++i) {
+            sprintf(buff, "%d", counter_per_class[i]);
+            printf("%s", buff);
+            fwrite(buff, sizeof(char), strlen(buff), fwc);
+            if (i < classes - 1) {
+                fwrite(", ", sizeof(char), 2, fwc);
+                printf(", ");
+            }
+        }
+        printf("\n");
+        fclose(fwc);
+    }
+    else {
+        printf(" Error: file counters_per_class.txt can't be open \n");
+    }
+
+    avg_iou = 100 * avg_iou / number_of_boxes;
+    printf("\n avg IoU = %2.2f %% \n", avg_iou);
+
+
+    FILE* fw = fopen("anchors.txt", "wb");
+    if (fw) {
+        printf("\nSaving anchors to the file: anchors.txt \n");
+        printf("anchors = ");
+        for (i = 0; i < num_of_clusters; ++i) {
+            float anchor_w = anchors_data.centers.vals[i][0]; //centers->data.fl[i * 2];
+            float anchor_h = anchors_data.centers.vals[i][1]; //centers->data.fl[i * 2 + 1];
+            // Whole pixels for a pixel-sized input, four decimals for small (width <= 32) inputs.
+            if (width > 32) sprintf(buff, "%3.0f,%3.0f", anchor_w, anchor_h);
+            else sprintf(buff, "%2.4f,%2.4f", anchor_w, anchor_h);
+            printf("%s", buff);
+            fwrite(buff, sizeof(char), strlen(buff), fw);
+            if (i + 1 < num_of_clusters) {
+                fwrite(", ", sizeof(char), 2, fw);
+                printf(", ");
+            }
+        }
+        printf("\n");
+        fclose(fw);
+    }
+    else {
+        printf(" Error: file anchors.txt can't be open \n");
+    }
+
+    if (show) {
+#ifdef OPENCV
+        show_acnhors(number_of_boxes, num_of_clusters, rel_width_height_array, anchors_data, width, height);
+#endif // OPENCV
+    }
+    free(rel_width_height_array);
+    free(counter_per_class);
+
+    // Release the path list and the parsed options, which were previously leaked.
+    // NOTE(review): boxes_data and anchors_data are still not released here; the matching
+    // free routines are not visible in this part of the file.
+    free(paths);
+    free_list_contents(plist);
+    free_list(plist);
+    free_list_contents_kvp(options);
+    free_list(options);
+}
+
+
+// Run the detector on one image (`filename`) or, when `filename` is NULL, on
+// image paths read from stdin until EOF / read error.
+//
+// datacfg          - data config; its "names" option gives the labels file
+// cfgfile          - network cfg, parsed with batch=1
+// weightfile       - weights to load, may be NULL
+// filename         - image path, or NULL for the interactive prompt
+// thresh           - probability threshold used for boxes, drawing and pseudo-labels
+// hier_thresh      - forwarded to get_network_boxes()
+// dont_show        - non-zero: do not open the preview window / wait for a key
+// ext_output       - forwarded to draw_detections_v3()
+// save_labels      - non-zero: write the detections to the path produced by
+//                    replace_image_to_label() for the input image
+// outfile          - path of a JSON results file, may be NULL
+// letter_box       - non-zero: letterbox the input (also forced by net.letter_box)
+// benchmark_layers - stored in net.benchmark_layers
+void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh,
+    float hier_thresh, int dont_show, int ext_output, int save_labels, char *outfile, int letter_box, int benchmark_layers)
+{
+    list *options = read_data_cfg(datacfg);
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    int names_size = 0;
+    char **names = get_labels_custom(name_list, &names_size);
+
+    image **alphabet = load_alphabet();
+    network net = parse_network_cfg_custom(cfgfile, 1, 1); // set batch=1
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    if (net.letter_box) letter_box = 1;
+    net.benchmark_layers = benchmark_layers;
+    fuse_conv_batchnorm(net);
+    calculate_binary_weights(net);
+    if (net.layers[net.n - 1].classes != names_size) {
+        printf("\n Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n",
+            name_list, names_size, net.layers[net.n - 1].classes, cfgfile);
+    }
+    srand(2222222);
+    char buff[256];
+    char *input = buff;
+    int json_image_id = 0;
+    FILE* json_file = NULL;
+    if (outfile) {
+        json_file = fopen(outfile, "wb");
+        if(!json_file) {
+            error("fopen failed", DARKNET_LOC);
+        }
+        fputs("[\n", json_file);
+    }
+    int j;
+    const float nms = .45f;    // 0.4F
+    while (1) {
+        if (filename) {
+            // strncpy() leaves the buffer unterminated when the source is too
+            // long, so copy one byte less and terminate explicitly.
+            strncpy(buff, filename, sizeof(buff) - 1);
+            buff[sizeof(buff) - 1] = 0;
+            size_t len = strlen(buff);
+            if (len > 0 && buff[len - 1] == 0x0d) buff[len - 1] = 0;   // drop trailing CR
+        }
+        else {
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            if (!fgets(buff, sizeof(buff), stdin)) break;   // EOF or read error
+            buff[strcspn(buff, "\r\n")] = 0;                // cut at the line ending
+        }
+        image im = load_image(input, 0, 0, net.c);
+        image sized;
+        if(letter_box) sized = letterbox_image(im, net.w, net.h);
+        else sized = resize_image(im, net.w, net.h);
+
+        // Take the parameters of the last YOLO / Gaussian-YOLO / region layer;
+        // fall back to the final layer when the network has none.
+        layer l = net.layers[net.n - 1];
+        int k;
+        for (k = 0; k < net.n; ++k) {
+            layer lk = net.layers[k];
+            if (lk.type == YOLO || lk.type == GAUSSIAN_YOLO || lk.type == REGION) {
+                l = lk;
+                printf(" Detection layer: %d - type = %d \n", k, l.type);
+            }
+        }
+
+        float *X = sized.data;
+
+        double start_time = get_time_point();
+        network_predict(net, X);
+        printf("%s: Predicted in %lf milli-seconds.\n", input, ((double)get_time_point() - start_time) / 1000);
+
+        int nboxes = 0;
+        detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letter_box);
+        if (nms) {
+            if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms);
+            else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+        }
+        draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes, ext_output);
+        save_image(im, "predictions");
+        if (!dont_show) {
+            show_image(im, "predictions");
+        }
+
+        if (json_file) {
+            // Every record after the first one is preceded by a separator.
+            if (json_image_id > 0) fputs(", \n", json_file);
+            ++json_image_id;
+            char *json_buf = detection_to_json(dets, nboxes, l.classes, names, json_image_id, input);
+
+            fwrite(json_buf, sizeof(char), strlen(json_buf), json_file);
+            free(json_buf);
+        }
+
+        // pseudo labeling concept - fast.ai
+        if (save_labels)
+        {
+            char labelpath[4096];
+            replace_image_to_label(input, labelpath);
+
+            FILE* fw = fopen(labelpath, "wb");
+            if (!fw) {
+                // Keep processing images even if this label file cannot be created.
+                printf(" Error: file %s can't be open \n", labelpath);
+            }
+            else {
+                int i;
+                for (i = 0; i < nboxes; ++i) {
+                    // Pick the most probable class above the threshold, if any.
+                    int class_id = -1;
+                    float prob = 0;
+                    for (j = 0; j < l.classes; ++j) {
+                        if (dets[i].prob[j] > thresh && dets[i].prob[j] > prob) {
+                            prob = dets[i].prob[j];
+                            class_id = j;
+                        }
+                    }
+                    if (class_id >= 0) {
+                        fprintf(fw, "%d %2.4f %2.4f %2.4f %2.4f\n", class_id, dets[i].bbox.x, dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h);
+                    }
+                }
+                fclose(fw);
+            }
+        }
+
+        free_detections(dets, nboxes);
+        free_image(im);
+        free_image(sized);
+
+        if (!dont_show) {
+            wait_until_press_key_cv();
+            destroy_all_windows_cv();
+        }
+
+        if (filename) break;
+    }
+
+    if (json_file) {
+        fputs("\n]", json_file);
+        fclose(json_file);
+    }
+
+    // free memory
+    // names holds names_size entries, which may differ from the class count
+    // (see the warning printed above), so free by names_size.
+    free_ptrs((void**)names, names_size);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_alphabet(alphabet);
+    free_network(net);
+}
+
+#if defined(OPENCV) && defined(GPU)
+
+// adversarial attack dnn
+//
+// Interactive "draw" mode (OpenCV + GPU builds only). For each input image the
+// user supplies truth boxes through cv_draw_object(); the network is then run
+// forward/backward for the chosen number of iterations while the working image
+// is displayed, and finally detections are computed and drawn on the result.
+//
+// datacfg / cfgfile / weightfile - data config, network cfg, optional weights
+// filename         - image path, or NULL to read paths from stdin until EOF
+// thresh           - detection threshold for the final prediction
+// dont_show        - non-zero: skip the final preview window and key wait
+// it_num           - overwritten by the iteration count chosen in cv_draw_object()
+// letter_box       - non-zero: letterbox the input instead of plain resize
+// benchmark_layers - stored in net.benchmark_layers
+void draw_object(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, int dont_show, int it_num,
+    int letter_box, int benchmark_layers)
+{
+    list *options = read_data_cfg(datacfg);
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    int names_size = 0;
+    char **names = get_labels_custom(name_list, &names_size);
+
+    image **alphabet = load_alphabet();
+    network net = parse_network_cfg(cfgfile);
+    net.adversarial = 1;
+    set_batch_network(&net, 1);
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    net.benchmark_layers = benchmark_layers;
+    if (net.layers[net.n - 1].classes != names_size) {
+        printf("\n Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n",
+            name_list, names_size, net.layers[net.n - 1].classes, cfgfile);
+    }
+
+    srand(2222222);
+    char buff[256];
+    char *input = buff;
+
+    const float nms = .45f;    // 0.4F
+    while (1) {
+        if (filename) {
+            // strncpy() leaves the buffer unterminated when the source is too
+            // long, so copy one byte less and terminate explicitly.
+            strncpy(buff, filename, sizeof(buff) - 1);
+            buff[sizeof(buff) - 1] = 0;
+            size_t len = strlen(buff);
+            if (len > 0 && buff[len - 1] == 0x0d) buff[len - 1] = 0;   // drop trailing CR
+        }
+        else {
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            if (!fgets(buff, sizeof(buff), stdin)) break;   // EOF or read error
+            strtok(input, "\n");
+        }
+        image im = load_image(input, 0, 0, net.c);
+        image sized;
+        if (letter_box) sized = letterbox_image(im, net.w, net.h);
+        else sized = resize_image(im, net.w, net.h);
+
+        // Untouched copy of the network input, used to restore everything
+        // outside the truth box when "box only" mode is active.
+        image src_sized = copy_image(sized);
+
+        // Take the parameters of the last YOLO / Gaussian-YOLO / region layer;
+        // fall back to the final layer when the network has none.
+        layer l = net.layers[net.n - 1];
+        int k;
+        for (k = 0; k < net.n; ++k) {
+            layer lk = net.layers[k];
+            if (lk.type == YOLO || lk.type == GAUSSIAN_YOLO || lk.type == REGION) {
+                l = lk;
+                printf(" Detection layer: %d - type = %d \n", k, l.type);
+            }
+        }
+
+        net.num_boxes = l.max_boxes;
+        int num_truth = l.truths;
+        float *truth_cpu = (float *)xcalloc(num_truth, sizeof(float));
+
+        // Out-parameters filled in by the interactive drawing UI.
+        int *it_num_set = (int *)xcalloc(1, sizeof(int));
+        float *lr_set = (float *)xcalloc(1, sizeof(float));
+        int *boxonly = (int *)xcalloc(1, sizeof(int));
+
+        cv_draw_object(sized, truth_cpu, net.num_boxes, num_truth, it_num_set, lr_set, boxonly, l.classes, names);
+
+        net.learning_rate = *lr_set;
+        it_num = *it_num_set;
+
+        float *X = sized.data;
+
+        mat_cv* img = NULL;
+        float max_img_loss = 5;
+        int number_of_lines = 100;
+        int img_size = 1000;
+        char windows_name[100];
+        char *base = basecfg(cfgfile);
+        // Bounded formatting: a long cfg base name must not overflow windows_name.
+        snprintf(windows_name, sizeof(windows_name), "chart_%s.png", base);
+        // NOTE(review): `base` and the chart `img` are not released in this
+        // function; confirm ownership of the basecfg()/draw_train_chart() results.
+        img = draw_train_chart(windows_name, max_img_loss, it_num, number_of_lines, img_size, dont_show, NULL);
+
+        int iteration;
+        for (iteration = 0; iteration < it_num; ++iteration)
+        {
+            forward_backward_network_gpu(net, X, truth_cpu);
+
+            float avg_loss = get_network_cost(net);
+            draw_train_loss(windows_name, img, img_size, avg_loss, max_img_loss, iteration, it_num, 0, 0, "mAP%", 0, dont_show, 0, 0);
+
+            if (*boxonly) {
+                // Keep only the modified pixels inside the first truth box and
+                // restore the rest of the image from the pristine copy.
+                int dw = truth_cpu[2] * sized.w, dh = truth_cpu[3] * sized.h;
+                int dx = truth_cpu[0] * sized.w - dw / 2, dy = truth_cpu[1] * sized.h - dh / 2;
+                image crop = crop_image(sized, dx, dy, dw, dh);
+                copy_image_inplace(src_sized, sized);
+                embed_image(crop, sized, dx, dy);
+                free_image(crop);   // the crop is a temporary; it was leaked on every iteration
+            }
+
+            show_image_cv(sized, "image_optimization");
+            wait_key_cv(20);
+        }
+
+        net.train = 0;
+        quantize_image(sized);
+        network_predict(net, X);
+
+        save_image_png(sized, "drawn");
+
+        int nboxes = 0;
+        detection *dets = get_network_boxes(&net, sized.w, sized.h, thresh, 0, 0, 1, &nboxes, letter_box);
+        if (nms) {
+            if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms);
+            else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms);
+        }
+        draw_detections_v3(sized, dets, nboxes, thresh, names, alphabet, l.classes, 1);
+        save_image(sized, "pre_predictions");
+        if (!dont_show) {
+            show_image(sized, "pre_predictions");
+        }
+
+        free_detections(dets, nboxes);
+        free_image(im);
+        free_image(sized);
+        free_image(src_sized);
+
+        if (!dont_show) {
+            wait_until_press_key_cv();
+            destroy_all_windows_cv();
+        }
+
+        // Release every per-image scratch buffer allocated above.
+        free(truth_cpu);
+        free(boxonly);
+        free(lr_set);
+        free(it_num_set);
+
+        if (filename) break;
+    }
+
+    // free memory
+    // names holds names_size entries, which may differ from the class count
+    // (see the warning printed above), so free by names_size.
+    free_ptrs((void**)names, names_size);
+    free_list_contents_kvp(options);
+    free_list(options);
+    free_alphabet(alphabet);
+    free_network(net);
+}
+#else // defined(OPENCV) && defined(GPU)
+// Fallback used when the build lacks OpenCV or GPU support: "draw" mode is
+// unavailable, so report that through error() and do nothing else.
+void draw_object(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, int dont_show, int it_num,
+    int letter_box, int benchmark_layers)
+{
+    // None of the arguments are used in this configuration.
+    (void)datacfg;
+    (void)cfgfile;
+    (void)weightfile;
+    (void)filename;
+    (void)thresh;
+    (void)dont_show;
+    (void)it_num;
+    (void)letter_box;
+    (void)benchmark_layers;
+
+    error("darknet detector draw ... can't be used without OpenCV and CUDA", DARKNET_LOC);
+}
+#endif // defined(OPENCV) && defined(GPU)
+
+// Entry point of the `detector` sub-command: parse the optional flags, then
+// dispatch on argv[2] (train / test / valid / recall / map / calc_anchors /
+// draw / demo).
+//
+// Positional arguments: argv[3] = data cfg, argv[4] = network cfg,
+// argv[5] = weights (optional), argv[6] = input file (optional).
+void run_detector(int argc, char **argv)
+{
+    // NOTE(review): the find_*arg helpers presumably consume their flags from
+    // argv, so the flags are parsed before the positional arguments are read.
+    int dont_show = find_arg(argc, argv, "-dont_show");
+    int benchmark = find_arg(argc, argv, "-benchmark");
+    int benchmark_layers = find_arg(argc, argv, "-benchmark_layers");
+    //if (benchmark_layers) benchmark = 1;
+    if (benchmark) dont_show = 1;
+    int show = find_arg(argc, argv, "-show");
+    int letter_box = find_arg(argc, argv, "-letter_box");
+    int calc_map = find_arg(argc, argv, "-map");
+    int map_points = find_int_arg(argc, argv, "-points", 0);
+    int show_imgs = find_arg(argc, argv, "-show_imgs");
+    int mjpeg_port = find_int_arg(argc, argv, "-mjpeg_port", -1);
+    int avgframes = find_int_arg(argc, argv, "-avgframes", 3);
+    int dontdraw_bbox = find_arg(argc, argv, "-dontdraw_bbox");
+    int json_port = find_int_arg(argc, argv, "-json_port", -1);
+    char *http_post_host = find_char_arg(argc, argv, "-http_post_host", 0);
+    int time_limit_sec = find_int_arg(argc, argv, "-time_limit_sec", 0);
+    char *out_filename = find_char_arg(argc, argv, "-out_filename", 0);
+    char *json_file_output = find_char_arg(argc, argv, "-json_file_output", 0);
+    char *outfile = find_char_arg(argc, argv, "-out", 0);
+    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    float thresh = find_float_arg(argc, argv, "-thresh", .25);    // 0.24
+    float iou_thresh = find_float_arg(argc, argv, "-iou_thresh", .5);    // 0.5 for mAP
+    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int frame_skip = find_int_arg(argc, argv, "-s", 0);
+    int num_of_clusters = find_int_arg(argc, argv, "-num_of_clusters", 5);
+    int width = find_int_arg(argc, argv, "-width", -1);
+    int height = find_int_arg(argc, argv, "-height", -1);
+    // extended output in test mode (output of rect bound coords)
+    // and for recall mode (extended output table-like format with results for best_class fit)
+    int ext_output = find_arg(argc, argv, "-ext_output");
+    int save_labels = find_arg(argc, argv, "-save_labels");
+    char* chart_path = find_char_arg(argc, argv, "-chart", 0);
+    // While training, decide after how many epochs mAP will be calculated. Default value is 4 which means the mAP will be calculated after each 4 epochs
+    int mAP_epochs = find_int_arg(argc, argv, "-mAP_epochs", 4);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s %s [train/test/valid/demo/map] [data] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int *gpus_heap = 0;   // set only when the id array was heap-allocated
+    int gpu = 0;
+    int ngpus = 0;
+    if (gpu_list) {
+        printf("%s\n", gpu_list);
+        // One id per comma-separated field.
+        const char *cursor;
+        ngpus = 1;
+        for (cursor = gpu_list; *cursor; ++cursor) {
+            if (*cursor == ',') ++ngpus;
+        }
+        gpus_heap = (int*)xcalloc(ngpus, sizeof(int));
+        gpus = gpus_heap;
+        cursor = gpu_list;
+        int i;
+        for (i = 0; i < ngpus; ++i) {
+            gpus[i] = atoi(cursor);
+            // Advance past the next comma; after the last field there is none,
+            // and doing arithmetic on the NULL result would be undefined.
+            const char *comma = strchr(cursor, ',');
+            if (!comma) break;
+            cursor = comma + 1;
+        }
+    }
+    else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+
+    int clear = find_arg(argc, argv, "-clear");
+
+    char *datacfg = argv[3];
+    // calc_anchors only needs the data cfg, so the network cfg may be absent.
+    char *cfg = (argc > 4) ? argv[4] : 0;
+    char *weights = (argc > 5) ? argv[5] : 0;
+    if (weights)
+        if (strlen(weights) > 0)
+            if (weights[strlen(weights) - 1] == 0x0d) weights[strlen(weights) - 1] = 0;
+    char *filename = (argc > 6) ? argv[6] : 0;
+    if (0 == strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, dont_show, ext_output, save_labels, outfile, letter_box, benchmark_layers);
+    else if (0 == strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show, calc_map, thresh, iou_thresh, mjpeg_port, show_imgs, benchmark_layers, chart_path, mAP_epochs);
+    else if (0 == strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
+    else if (0 == strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
+    else if (0 == strcmp(argv[2], "map")) validate_detector_map(datacfg, cfg, weights, thresh, iou_thresh, map_points, letter_box, NULL);
+    else if (0 == strcmp(argv[2], "calc_anchors")) calc_anchors(datacfg, num_of_clusters, width, height, show);
+    else if (0 == strcmp(argv[2], "draw")) {
+        int it_num = 100;
+        draw_object(datacfg, cfg, weights, filename, thresh, dont_show, it_num, letter_box, benchmark_layers);
+    }
+    else if (0 == strcmp(argv[2], "demo")) {
+        list *options = read_data_cfg(datacfg);
+        int classes = option_find_int(options, "classes", 20);
+        char *name_list = option_find_str(options, "names", "data/names.list");
+        char **names = get_labels(name_list);
+        if (filename)
+            if (strlen(filename) > 0)
+                if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0;
+        demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, avgframes, frame_skip, prefix, out_filename,
+            mjpeg_port, dontdraw_bbox, json_port, dont_show, ext_output, letter_box, time_limit_sec, http_post_host, benchmark, benchmark_layers, json_file_output);
+
+        free_list_contents_kvp(options);
+        free_list(options);
+    }
+    else printf(" There isn't such command: %s", argv[2]);
+
+    // free(NULL) is a no-op, so this also covers the default (stack) case.
+    free(gpus_heap);
+}
diff --git a/darknet-master/src/dice.c b/darknet-master/src/dice.c
new file mode 100644
index 0000000..bb5d643
--- /dev/null
+++ b/darknet-master/src/dice.c
@@ -0,0 +1,117 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+
+// Label strings for the six dice classes; passed as the label set to
+// load_data_old() and used to print predictions.
+char *dice_labels[] = {"face1","face2","face3","face4","face5","face6"};
+
+// Train the dice classifier described by `cfgfile`, optionally starting from
+// `weightfile` (may be NULL). Loads 1024 images per step from
+// data/dice/dice.train.list. Every 100 steps the learning rate is multiplied
+// by 0.1 and a weights snapshot is written under backup/. The loop never
+// terminates on its own.
+void train_dice(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    char* backup_directory = "backup/";
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1024;
+    int i = *net.seen/imgs;   // resume the step counter from the images already seen
+    char **labels = dice_labels;
+    list *plist = get_paths("data/dice/dice.train.list");
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    clock_t start;   // not named `time`, which would shadow time() inside this scope
+    while(1){
+        ++i;
+        start=clock();
+        data train = load_data_old(paths, imgs, plist->size, labels, 6, net.w, net.h);
+        printf("Loaded: %lf seconds\n", sec(clock()-start));
+
+        start=clock();
+        float loss = train_network(net, train);
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%d: %f, %f avg, %lf seconds, %" PRIu64 " images\n", i, loss, avg_loss, sec(clock()-start), *net.seen);
+        free_data(train);
+        if(i%100==0){
+            net.learning_rate *= .1;
+            char buff[256];
+            // Bounded formatting: a long cfg base name must not overflow buff.
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, i);
+            save_weights(net, buff);
+        }
+    }
+}
+
+// Evaluate the network from cfg `filename` (with optional `weightfile`) on the
+// images listed in data/dice/dice.val.list and print the first accuracy value
+// returned by network_accuracies().
+void validate_dice(char *filename, char *weightfile)
+{
+    network net = parse_network_cfg(filename);
+    if (weightfile) load_weights(&net, weightfile);
+    srand(time(0));
+
+    // Turn the path list into an array; only the array and the count are
+    // needed afterwards, so the list itself is released immediately.
+    list *path_list = get_paths("data/dice/dice.val.list");
+    char **image_paths = (char **)list_to_array(path_list);
+    const int num_images = path_list->size;
+    free_list(path_list);
+
+    data val = load_data_old(image_paths, num_images, 0, dice_labels, 6, net.w, net.h);
+    float *accuracies = network_accuracies(net, val, 2);
+    printf("Validation Accuracy: %f, %d images\n", accuracies[0], num_images);
+    free_data(val);
+}
+
+// Classify one image (`filename`) or, when `filename` is NULL, images whose
+// paths are read from stdin until EOF. For each image the six class scores
+// are printed in the order given by top_predictions().
+void test_dice(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+    int i = 0;
+    char **names = dice_labels;
+    char buff[256];
+    char *input = buff;
+    int indexes[6];
+    while(1){
+        if(filename){
+            // strncpy() leaves the buffer unterminated when the source is too
+            // long, so copy one byte less and terminate explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = 0;
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;   // EOF or read error ends the interactive session
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, net.w, net.h);
+        float *X = im.data;
+        float *predictions = network_predict(net, X);
+        top_predictions(net, 6, indexes);
+        for(i = 0; i < 6; ++i){
+            int index = indexes[i];
+            printf("%s: %f\n", names[index], predictions[index]);
+        }
+        free_image(im);
+        if (filename) break;
+    }
+}
+
+// Entry point of the `dice` sub-command: argv[2] selects test / train / valid,
+// argv[3] is the network cfg, argv[4] the optional weights file and argv[5]
+// the optional image path (test only).
+void run_dice(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *filename = (argc > 5) ? argv[5]: 0;
+    if(0==strcmp(argv[2], "test")) test_dice(cfg, weights, filename);
+    else if(0==strcmp(argv[2], "train")) train_dice(cfg, weights);
+    else if(0==strcmp(argv[2], "valid")) validate_dice(cfg, weights);
+    // Report an unknown sub-command instead of silently doing nothing.
+    else fprintf(stderr, " There isn't such command: %s\n", argv[2]);
+}
diff --git a/darknet-master/src/dropout_layer.c b/darknet-master/src/dropout_layer.c
new file mode 100644
index 0000000..744c708
--- /dev/null
+++ b/darknet-master/src/dropout_layer.c
@@ -0,0 +1,88 @@
+#include "dropout_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+// Construct a dropout layer, or a DropBlock layer when `dropblock` is non-zero.
+//
+// batch, inputs      - batch size and number of values per batch item
+// probability        - drop probability; the layer scale is 1 / (1 - probability)
+// dropblock          - non-zero selects DropBlock; w, h, c must then be positive
+// dropblock_size_rel - DropBlock size setting, stored as-is on the layer
+// dropblock_size_abs - DropBlock size setting, stored as-is on the layer
+// w, h, c            - stored as both input and output dimensions (DropBlock only)
+//
+// Returns the initialised layer; a one-line summary is written to stderr.
+dropout_layer make_dropout_layer(int batch, int inputs, float probability, int dropblock, float dropblock_size_rel, int dropblock_size_abs, int w, int h, int c)
+{
+    const int total = inputs * batch;   // one random value per input element
+
+    dropout_layer drop = { (LAYER_TYPE)0 };
+    drop.type = DROPOUT;
+    drop.probability = probability;
+    drop.dropblock = dropblock;
+    drop.dropblock_size_rel = dropblock_size_rel;
+    drop.dropblock_size_abs = dropblock_size_abs;
+
+    if (drop.dropblock) {
+        drop.w = w;
+        drop.out_w = w;
+        drop.h = h;
+        drop.out_h = h;
+        drop.c = c;
+        drop.out_c = c;
+
+        if (w <= 0 || h <= 0 || c <= 0) {
+            printf(" Error: DropBlock - there must be positive values for: l.w=%d, l.h=%d, l.c=%d \n", drop.w, drop.h, drop.c);
+            error("Error!", DARKNET_LOC);
+        }
+    }
+
+    drop.batch = batch;
+    drop.inputs = inputs;
+    drop.outputs = inputs;
+    drop.rand = (float*)xcalloc(total, sizeof(float));
+    drop.scale = 1./(1.0 - probability);
+    drop.forward = forward_dropout_layer;
+    drop.backward = backward_dropout_layer;
+#ifdef GPU
+    drop.forward_gpu = forward_dropout_layer_gpu;
+    drop.backward_gpu = backward_dropout_layer_gpu;
+    drop.rand_gpu = cuda_make_array(drop.rand, total);
+    if (drop.dropblock) {
+        // Per-batch-item scale buffers: one pinned host copy, one device copy.
+        drop.drop_blocks_scale = cuda_make_array_pinned(drop.rand, drop.batch);
+        drop.drop_blocks_scale_gpu = cuda_make_array(drop.rand, drop.batch);
+    }
+#endif
+
+    // Layer summary line.
+    if (!drop.dropblock) {
+        fprintf(stderr, "dropout    p = %.3f        %4d  ->   %4d\n", probability, inputs, inputs);
+    }
+    else if (drop.dropblock_size_abs) {
+        fprintf(stderr, "dropblock    p = %.3f   l.dropblock_size_abs = %d    %4d  ->   %4d\n", probability, drop.dropblock_size_abs, inputs, inputs);
+    }
+    else {
+        fprintf(stderr, "dropblock    p = %.3f   l.dropblock_size_rel = %.2f    %4d  ->   %4d\n", probability, drop.dropblock_size_rel, inputs, inputs);
+    }
+    return drop;
+}
+
+// Resize the layer to `inputs` values per batch item and reallocate the
+// random-value buffers (host, and device when built with GPU) to match.
+void resize_dropout_layer(dropout_layer *l, int inputs)
+{
+    l->outputs = inputs;
+    l->inputs = inputs;
+
+    const int count = l->inputs * l->batch;
+    l->rand = (float*)xrealloc(l->rand, count * sizeof(float));
+#ifdef GPU
+    // Device buffers are released and recreated rather than resized in place.
+    cuda_free(l->rand_gpu);
+    l->rand_gpu = cuda_make_array(l->rand, count);
+
+    if (l->dropblock) {
+        cudaFreeHost(l->drop_blocks_scale);
+        l->drop_blocks_scale = cuda_make_array_pinned(l->rand, l->batch);
+
+        cuda_free(l->drop_blocks_scale_gpu);
+        l->drop_blocks_scale_gpu = cuda_make_array(l->rand, l->batch);
+    }
+#endif
+}
+
+// CPU forward pass: does nothing outside training. During training each input
+// element gets a fresh uniform sample (saved in l.rand for the backward pass);
+// the element is zeroed when the sample is below l.probability and multiplied
+// by l.scale otherwise. state.input is modified in place.
+void forward_dropout_layer(dropout_layer l, network_state state)
+{
+    if (!state.train) return;
+
+    const int total = l.batch * l.inputs;
+    int idx;
+    for (idx = 0; idx < total; ++idx) {
+        const float sample = rand_uniform(0, 1);
+        l.rand[idx] = sample;
+        if (sample >= l.probability) {
+            state.input[idx] *= l.scale;
+        }
+        else {
+            state.input[idx] = 0;
+        }
+    }
+}
+
+// CPU backward pass: applies the mask recorded in l.rand to state.delta in
+// place: zero where the saved sample is below l.probability, multiplied by
+// l.scale elsewhere. Does nothing when state.delta is NULL.
+void backward_dropout_layer(dropout_layer l, network_state state)
+{
+    if (!state.delta) return;
+
+    const int total = l.batch * l.inputs;
+    int idx;
+    for (idx = 0; idx < total; ++idx) {
+        if (l.rand[idx] >= l.probability) {
+            state.delta[idx] *= l.scale;
+        }
+        else {
+            state.delta[idx] = 0;
+        }
+    }
+}
diff --git a/darknet-master/src/dropout_layer.h b/darknet-master/src/dropout_layer.h
new file mode 100644
index 0000000..fa02300
--- /dev/null
+++ b/darknet-master/src/dropout_layer.h
@@ -0,0 +1,26 @@
+// Public interface of the dropout / DropBlock layer.
+#ifndef DROPOUT_LAYER_H
+#define DROPOUT_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+// A dropout layer is represented by the generic `layer` struct.
+typedef layer dropout_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Create a dropout layer, or a DropBlock layer when `dropblock` is non-zero
+// (w, h, c are only used in that case).
+dropout_layer make_dropout_layer(int batch, int inputs, float probability, int dropblock, float dropblock_size_rel, int dropblock_size_abs, int w, int h, int c);
+
+// CPU forward / backward passes; both modify the buffers in `state` in place.
+void forward_dropout_layer(dropout_layer l, network_state state);
+void backward_dropout_layer(dropout_layer l, network_state state);
+// Change the number of inputs/outputs and reallocate the layer's buffers.
+void resize_dropout_layer(dropout_layer *l, int inputs);
+
+#ifdef GPU
+// GPU counterparts of the forward / backward passes.
+void forward_dropout_layer_gpu(dropout_layer l, network_state state);
+void backward_dropout_layer_gpu(dropout_layer l, network_state state);
+
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/dropout_layer_kernels.cu b/darknet-master/src/dropout_layer_kernels.cu
new file mode 100644
index 0000000..3ed7a18
--- /dev/null
+++ b/darknet-master/src/dropout_layer_kernels.cu
@@ -0,0 +1,311 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <cstring>
+
+#include "dropout_layer.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+
+#include "image_opencv.h"
+#include "image.h"
+
+
+// DropBlock selection kernel. Each CUDA block handles one (batch item, channel)
+// plane, decoded from blockIdx.x, and is expected to be launched with BLOCK
+// threads. Within the plane, the element with the smallest random value below
+// `prob` is chosen as the seed of a block_size x block_size square; that square
+// is zeroed in `output` and in `rand`, and the number of dropped elements is
+// accumulated per batch item in `drop_blocks_scale` (when non-NULL).
+//
+// rand     - random values, one per element; used as the keep/drop mask afterwards
+//            (candidates below `prob` are reset to 1, dropped elements set to 0)
+// spatial  - number of elements in one plane (the caller passes w*h)
+// filters  - number of channels per batch item
+__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int batch, int block_size, float *drop_blocks_scale, float *output)
+{
+    const int threads = BLOCK;
+    const int id = threadIdx.x;
+    const int f = blockIdx.x % filters;
+    const int b = blockIdx.x / filters;
+
+    // Smallest candidate value seen so far (fixed point, x1e6) and its position.
+    __shared__ int prob_block;
+    __shared__ int index_block;
+
+    if (id == 0) {
+        prob_block = 1.0 * 1000000;
+        index_block = -1;
+    }
+    __syncthreads();
+
+    int i;
+    for (i = id; i < spatial; i += threads) {
+        // The batch stride is filters*spatial, matching new_index below.
+        // The previous b*spatial*f made different CUDA blocks read and
+        // overwrite each other's random values.
+        int index = b*filters*spatial + f*spatial + i;
+
+        if (rand[index] < prob) {
+            //Chose with the lowest rand[i]
+            int new_val = rand[index] * 1000000;
+            rand[index] = 1;
+            int old_val = atomicMin(&prob_block, new_val);
+            // NOTE(review): the index store is not atomic with the atomicMin
+            // above, so with concurrent candidates the recorded index may not
+            // belong to the true minimum. Any candidate is a valid seed.
+            if (new_val < old_val) {
+                index_block = i;
+            }
+        }
+
+    }
+    __syncthreads();
+    if (index_block == -1) return;   // nothing to drop in this plane
+
+
+    int b_x = index_block % w;
+    int b_y = index_block / w;
+
+    // Move the square's origin so that it fits inside the plane.
+    if (b_x > (w - block_size)) b_x = b_x - (w - block_size);
+    if (b_y > (h - block_size)) b_y = b_y - (h - block_size);
+
+    b_x = max(0, min(b_x, w - block_size));
+    b_y = max(0, min(b_y, h - block_size));
+
+    int block_square_size = block_size * block_size;
+
+    for (i = id; i < block_square_size; i += threads)
+    {
+        int i_x = i % block_size;
+        int i_y = i / block_size;
+
+        int x = b_x + i_x;
+        int y = b_y + i_y;
+
+        if (x >= 0 && x < w && y >= 0 && y < h) {
+            int new_index = b*filters*spatial + f*spatial + y*w + x;
+
+            output[new_index] = 0;
+            rand[new_index] = 0;
+        }
+    }
+
+    // One thread per plane adds the dropped-element count for its batch item.
+    if (id == 0 && drop_blocks_scale) {
+        atomicAdd(&drop_blocks_scale[b], block_square_size);
+    }
+
+}
+
+// For each batch item, convert the value stored in drop_blocks_scale[] into a
+// scale factor: with p = value / outputs, the entry is replaced by 1 / (1 - p).
+// block_size_w and block_size_h are not used.
+__global__ void set_scales_dropblock_kernel(float *drop_blocks_scale, int block_size_w, int block_size_h, int outputs, int batch)
+{
+    const int b = blockIdx.x*blockDim.x + threadIdx.x;
+    if (b < batch) {
+        const float dropped_fraction = drop_blocks_scale[b] / (float)outputs;
+        drop_blocks_scale[b] = 1.0f / (1.0f - dropped_fraction);
+    }
+}
+
+// Multiply every element of `output` (size elements, `outputs` per batch item)
+// by the scale factor of the batch item it belongs to.
+__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
+{
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    if (idx < size) {
+        const int batch_item = idx / outputs;
+        output[idx] *= drop_blocks_scale[batch_item];
+    }
+}
+
+
+// Zero delta[] wherever the mask `pass` is exactly 0; other entries are left
+// untouched.
+__global__ void backward_dropblock_kernel(float *pass, float *delta, int size)
+{
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    if (idx < size && pass[idx] == 0) {
+        delta[idx] = 0;
+    }
+}
+
+
+// Plain dropout kernel: element `id` of `input` is zeroed when rand[id] < prob
+// and multiplied by `scale` otherwise. The launch grid may be two-dimensional;
+// blocks are linearised as blockIdx.x + blockIdx.y*gridDim.x.
+__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
+{
+    const int block_linear = blockIdx.x + blockIdx.y*gridDim.x;
+    const int id = block_linear * blockDim.x + threadIdx.x;
+    if (id >= size) return;
+
+    if (rand[id] < prob) input[id] = 0;
+    else input[id] = input[id]*scale;
+}
+
+
+// GPU forward pass of the dropout / DropBlock layer. Does nothing outside
+// training. state.input is modified in place on the device.
+//
+// DropBlock path: fresh random values are generated, dropblock_fast_kernel
+// zeroes one square per (batch item, channel) plane, set_scales_dropblock_kernel
+// turns the per-batch-item dropped counts into scale factors, and
+// scale_dropblock_kernel applies them.
+// Dropout path: fresh random values are generated and each element is either
+// zeroed or multiplied by l.scale.
+void forward_dropout_layer_gpu(dropout_layer l, network_state state)
+{
+    if (!state.train) return;
+    int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    //if (iteration_num < state.net.burn_in) return;
+
+    // Ramp the DropBlock size and probability linearly from 0 to their
+    // configured values over the first 85% of max_batches.
+    float multiplier = 1.0;
+    if(iteration_num < (state.net.max_batches*0.85))
+        multiplier = (iteration_num / (float)(state.net.max_batches*0.85));
+
+    // dropblock
+    if (l.dropblock) {
+        const float cur_prob = l.probability * multiplier;
+
+        int block_width = l.dropblock_size_abs *multiplier;
+        int block_height = l.dropblock_size_abs *multiplier;
+
+        // A relative size, when set, takes precedence over the absolute one.
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        // Clamp the block to [1, layer dimension].
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        block_width = min_val_cmp(l.w, block_width);
+        block_height = min_val_cmp(l.h, block_height);
+
+        const int block_size = min_val_cmp(block_width, block_height);
+        // Per-element seed probability: the target drop probability divided
+        // by the area of one dropped square.
+        const float block_prob = cur_prob / (block_size*block_size);
+        assert(block_size <= l.w && block_size <= l.h);
+
+        const int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        // Reset the per-batch-item dropped-element counters.
+        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
+
+        // One CUDA block per (batch item, channel) plane.
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>> (l.rand_gpu, block_prob, l.w, l.h, l.w*l.h, l.c, l.batch, block_size, l.drop_blocks_scale_gpu, state.input);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.batch, BLOCK);
+        set_scales_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>> (l.drop_blocks_scale_gpu, block_size, block_size, l.outputs, l.batch);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>> (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+    }
+    // dropout
+    else {
+        int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        yoloswag420blazeit360noscope <<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>> (state.input, size, l.rand_gpu, l.probability, l.scale);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}
+
+// Backward pass of the dropout / DropBlock layer on the GPU.
+//
+// Operates in place on state.delta and returns immediately when state.delta
+// is NULL.
+//   - DropBlock (l.dropblock != 0): launches backward_dropblock_kernel with
+//     l.rand_gpu, then scale_dropblock_kernel with the per-image scales in
+//     l.drop_blocks_scale_gpu.
+//   - Plain dropout: launches the same kernel as the forward pass, with the
+//     same l.rand_gpu, l.probability and l.scale, but on state.delta.
+//
+// The DropBlock schedule (current probability, block size, ...) is not
+// recomputed here: none of the kernels launched below takes it as an
+// argument, they only consume the buffers l.rand_gpu and
+// l.drop_blocks_scale_gpu.
+void backward_dropout_layer_gpu(dropout_layer l, network_state state)
+{
+    if(!state.delta) return;
+
+    // dropblock
+    if (l.dropblock) {
+        const int total = l.outputs * l.batch;
+        const int num_blocks = get_number_of_blocks(total, BLOCK);
+
+        backward_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(l.rand_gpu, state.delta, total);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        scale_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream() >>> (state.delta, total, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+    // dropout
+    else {
+        const int size = l.inputs*l.batch;
+
+        yoloswag420blazeit360noscope <<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>> (state.delta, size, l.rand_gpu, l.probability, l.scale);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}
diff --git a/darknet-master/src/gaussian_yolo_layer.c b/darknet-master/src/gaussian_yolo_layer.c
new file mode 100644
index 0000000..f94f4a6
--- /dev/null
+++ b/darknet-master/src/gaussian_yolo_layer.c
@@ -0,0 +1,896 @@
+// Gaussian YOLOv3 implementation
+// Author: Jiwoong Choi
+// ICCV 2019 Paper: http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Gaussian_YOLOv3_An_Accurate_and_Fast_Object_Detector_Using_Localization_ICCV_2019_paper.html
+// arxiv.org: https://arxiv.org/abs/1904.04620v2
+// source code: https://github.com/jwchoi384/Gaussian_YOLOv3
+
+#include "gaussian_yolo_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "dark_cuda.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifndef M_PI
+#define M_PI 3.141592
+#endif
+
+// Allocates and initializes a Gaussian YOLO layer.
+//
+//   batch     - number of images per batch
+//   w, h      - width and height of the layer's grid
+//   n         - number of anchors handled by this layer (length of mask)
+//   total     - total number of anchors; l.biases holds total*2 floats
+//   mask      - anchor indices used by this layer. The pointer is stored
+//               as-is (not copied). When NULL, a mask {0, 1, ..., n-1} is
+//               allocated instead.
+//   classes   - number of classes
+//   max_boxes - maximum number of ground-truth boxes per image
+//
+// Every anchor owns (classes + 8 + 1) channels: 8 box entries, 1 objectness
+// entry and one entry per class, so l.outputs = h*w*n*(classes + 8 + 1).
+// All biases start at 0.5. With GPU enabled the host-side output/delta
+// buffers are re-allocated as pinned memory when possible; l.output_pinned /
+// l.delta_pinned record whether that succeeded.
+//
+// Note: this also reseeds the C RNG with srand(time(0)).
+layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes)
+{
+    int i;
+    layer l = { (LAYER_TYPE)0 };
+    l.type = GAUSSIAN_YOLO;
+
+    l.n = n;
+    l.total = total;
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = n*(classes + 8 + 1);
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.classes = classes;
+    l.cost = (float*)calloc(1, sizeof(float));
+    l.biases = (float*)calloc(total*2, sizeof(float));
+    if(mask) l.mask = mask;
+    else{
+        // no mask given: use the first n anchors
+        l.mask = (int*)calloc(n, sizeof(int));
+        for(i = 0; i < n; ++i){
+            l.mask[i] = i;
+        }
+    }
+    l.bias_updates = (float*)calloc(n*2, sizeof(float));
+    l.outputs = h*w*n*(classes + 8 + 1);
+    l.inputs = l.outputs;
+    l.max_boxes = max_boxes;
+    l.truth_size = 4 + 2;   // floats stored per ground-truth box
+    l.truths = l.max_boxes*l.truth_size;
+    l.delta = (float*)calloc(batch*l.outputs, sizeof(float));
+    l.output = (float*)calloc(batch*l.outputs, sizeof(float));
+    for(i = 0; i < total*2; ++i){
+        l.biases[i] = .5;
+    }
+
+    l.forward = forward_gaussian_yolo_layer;
+    l.backward = backward_gaussian_yolo_layer;
+#ifdef GPU
+    l.forward_gpu = forward_gaussian_yolo_layer_gpu;
+    l.backward_gpu = backward_gaussian_yolo_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+
+    // Swap the calloc'ed host buffers for pinned ones; fall back to plain
+    // heap memory if the pinned allocation fails.
+    // cudaHostAllocMapped is the flag that belongs to cudaHostAlloc();
+    // cudaHostRegisterMapped is the cudaHostRegister() counterpart.
+    free(l.output);
+    if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs * sizeof(float), cudaHostAllocMapped)) l.output_pinned = 1;
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.output = (float*)calloc(batch * l.outputs, sizeof(float));
+    }
+
+    free(l.delta);
+    if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs * sizeof(float), cudaHostAllocMapped)) l.delta_pinned = 1;
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.delta = (float*)calloc(batch * l.outputs, sizeof(float));
+    }
+
+#endif
+
+    srand(time(0));
+
+    return l;
+}
+
+// Resizes the layer to a new grid of w x h cells.
+//
+// Updates l->w, l->h, l->outputs and l->inputs and re-allocates the host
+// output/delta buffers for the new size:
+//   - buffers that are not pinned are grown/shrunk with realloc();
+//   - (GPU only) pinned buffers are freed and re-allocated as pinned memory;
+//     if that fails they fall back to calloc() and the corresponding
+//     *_pinned flag is cleared.
+// With GPU enabled the device buffers delta_gpu / output_gpu are freed and
+// re-created from the new host buffers.
+void resize_gaussian_yolo_layer(layer *l, int w, int h)
+{
+    l->w = w;
+    l->h = h;
+
+    l->outputs = h*w*l->n*(l->classes + 8 + 1);
+    l->inputs = l->outputs;
+
+    if (!l->output_pinned) l->output = (float*)realloc(l->output, l->batch*l->outputs * sizeof(float));
+    if (!l->delta_pinned) l->delta = (float*)realloc(l->delta, l->batch*l->outputs * sizeof(float));
+
+#ifdef GPU
+
+    // cudaHostAllocMapped is the flag that belongs to cudaHostAlloc();
+    // cudaHostRegisterMapped is the cudaHostRegister() counterpart.
+    if (l->output_pinned) {
+        CHECK_CUDA(cudaFreeHost(l->output));
+        if (cudaSuccess != cudaHostAlloc(&l->output, l->batch*l->outputs * sizeof(float), cudaHostAllocMapped)) {
+            cudaGetLastError(); // reset CUDA-error
+            l->output = (float*)calloc(l->batch * l->outputs, sizeof(float));
+            l->output_pinned = 0;
+        }
+    }
+
+    if (l->delta_pinned) {
+        CHECK_CUDA(cudaFreeHost(l->delta));
+        if (cudaSuccess != cudaHostAlloc(&l->delta, l->batch*l->outputs * sizeof(float), cudaHostAllocMapped)) {
+            cudaGetLastError(); // reset CUDA-error
+            l->delta = (float*)calloc(l->batch * l->outputs, sizeof(float));
+            l->delta_pinned = 0;
+        }
+    }
+
+
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+// Decodes one predicted box from the layer output x.
+//
+// The box entries are read at index + k*stride: k = 0 and 2 give the x / y
+// offsets inside cell (i, j) of the lw x lh grid, k = 4 and 6 give the
+// log-scale factors applied to anchor n (biases[2n], biases[2n+1]), which are
+// then normalized by the network size w x h.
+// yolo_point selects what the decoded point means: the box center
+// (YOLO_CENTER), its left-top corner or its right-bottom corner; in the two
+// corner modes the returned x/y is shifted by half the box size so that it
+// is always the center.
+box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride, YOLO_POINT yolo_point)
+{
+    box out;
+    const float point_x = (i + x[index + 0 * stride]) / lw;
+    const float point_y = (j + x[index + 2 * stride]) / lh;
+
+    out.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
+    out.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;
+
+    if (yolo_point == YOLO_LEFT_TOP) {
+        out.x = point_x + out.w / 2;
+        out.y = point_y + out.h / 2;
+    }
+    else if (yolo_point == YOLO_RIGHT_BOTTOM) {
+        out.x = point_x - out.w / 2;
+        out.y = point_y - out.h / 2;
+    }
+    else {
+        // YOLO_CENTER (and any other value): the point already is the center
+        out.x = point_x;
+        out.y = point_y;
+    }
+
+    return out;
+}
+
+// Returns val unchanged when it is a finite number, 0 when it is NaN or
+// +/-infinity.
+static inline float fix_nan_inf(float val)
+{
+    if (isnan(val)) return 0;
+    if (isinf(val)) return 0;
+    return val;
+}
+
+// Clamps val to the symmetric range [-max_val, max_val].
+// The upper bound is tested first; a NaN input fails both comparisons and is
+// returned unchanged.
+static inline float clip_value(float val, const float max_val)
+{
+    if (val > max_val) return max_val;
+    if (val < -max_val) return -max_val;
+    return val;
+}
+
+// Computes the box gradients of one prediction against one ground-truth box
+// and adds them to delta.
+//
+// The eight box entries of the prediction live at index + k*stride in both x
+// and delta. Entries k = 0, 2, 4, 6 are the x, y, w, h values; entries
+// k = 1, 3, 5, 7 are used as the matching standard deviations (sigma).
+//
+//   truth          - ground-truth box
+//   x, biases, n, index, i, j, lw, lh, w, h, stride, yolo_point
+//                  - forwarded to get_gaussian_yolo_box() to decode the
+//                    prediction (anchor n, grid cell (i, j))
+//   delta          - gradient buffer that receives the result
+//   scale          - factor applied to the Gaussian terms (temp_*)
+//   iou_normalizer - weight applied to the x/y/w/h deltas
+//   iou_loss       - MSE keeps the Gaussian coordinate deltas; any other
+//                    value replaces them with the gradients returned by
+//                    dx_box_iou()
+//   uc_normalizer  - weight applied to the sigma (uncertainty) deltas
+//   accumulate     - when 0 the eight delta entries are reset to 0 before the
+//                    new values are added; otherwise values are added on top
+//   max_delta      - clip limit for every delta, ignored when equal to FLT_MAX
+//
+// Returns box_iou(pred, truth) when iou_loss == MSE, box_giou(pred, truth)
+// for every other iou_loss value.
+float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta,
+    float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, float uc_normalizer, int accumulate, YOLO_POINT yolo_point, float max_delta)
+{
+    box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride, yolo_point);
+
+    float iou;
+    ious all_ious = { 0 };
+    all_ious.iou = box_iou(pred, truth);
+    all_ious.giou = box_giou(pred, truth);
+    all_ious.diou = box_diou(pred, truth);
+    all_ious.ciou = box_ciou(pred, truth);
+    // Applied after the IoU values above were computed, so this only affects
+    // the later dx_box_iou() call.
+    if (pred.w == 0) { pred.w = 1.0; }
+    if (pred.h == 0) { pred.h = 1.0; }
+
+    // sigma_const is added to sigma in the density's normalization term;
+    // epsi guards the division by the density in temp_* below.
+    float sigma_const = 0.3;
+    float epsi = pow(10,-9);
+
+    float dx, dy, dw, dh;
+
+    iou = all_ious.iou;
+
+    float tx, ty, tw, th;
+
+    // Targets expressed in the same space as the raw outputs: cell-relative
+    // offsets for x/y, log of the size relative to anchor n for w/h.
+    tx = (truth.x*lw - i);
+    ty = (truth.y*lh - j);
+    tw = log(truth.w*w / biases[2 * n]);
+    th = log(truth.h*h / biases[2 * n + 1]);
+
+    if (yolo_point == YOLO_CENTER) {
+    }
+    else if (yolo_point == YOLO_LEFT_TOP) {
+        tx = ((truth.x - truth.w / 2)*lw - i);
+        ty = ((truth.y - truth.h / 2)*lh - j);
+    }
+    else if (yolo_point == YOLO_RIGHT_BOTTOM) {
+        tx = ((truth.x + truth.w / 2)*lw - i);
+        ty = ((truth.y + truth.h / 2)*lh - j);
+    }
+
+    // Residuals: target minus predicted value.
+    dx = (tx - x[index + 0 * stride]);
+    dy = (ty - x[index + 2 * stride]);
+    dw = (tw - x[index + 4 * stride]);
+    dh = (th - x[index + 6 * stride]);
+
+    // Gaussian
+    // For each coordinate: in_exp = residual / sigma (no sigma_const here),
+    // normal_dist = exp(-in_exp^2 / 2) / (sqrt(2*pi) * (sigma + sigma_const)).
+    float in_exp_x = dx / x[index+1*stride];
+    float in_exp_x_2 = pow(in_exp_x, 2);
+    float normal_dist_x = exp(in_exp_x_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+1*stride]+sigma_const));
+
+    float in_exp_y = dy / x[index+3*stride];
+    float in_exp_y_2 = pow(in_exp_y, 2);
+    float normal_dist_y = exp(in_exp_y_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+3*stride]+sigma_const));
+
+    float in_exp_w = dw / x[index+5*stride];
+    float in_exp_w_2 = pow(in_exp_w, 2);
+    float normal_dist_w = exp(in_exp_w_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+5*stride]+sigma_const));
+
+    float in_exp_h = dh / x[index+7*stride];
+    float in_exp_h_2 = pow(in_exp_h, 2);
+    float normal_dist_h = exp(in_exp_h_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+7*stride]+sigma_const));
+
+    // temp = 0.5 * normal_dist / (normal_dist + epsi) * scale
+    float temp_x = (1./2.) * 1./(normal_dist_x+epsi) * normal_dist_x * scale;
+    float temp_y = (1./2.) * 1./(normal_dist_y+epsi) * normal_dist_y * scale;
+    float temp_w = (1./2.) * 1./(normal_dist_w+epsi) * normal_dist_w * scale;
+    float temp_h = (1./2.) * 1./(normal_dist_h+epsi) * normal_dist_h * scale;
+
+    if (!accumulate) {
+        delta[index + 0 * stride] = 0;
+        delta[index + 1 * stride] = 0;
+        delta[index + 2 * stride] = 0;
+        delta[index + 3 * stride] = 0;
+        delta[index + 4 * stride] = 0;
+        delta[index + 5 * stride] = 0;
+        delta[index + 6 * stride] = 0;
+        delta[index + 7 * stride] = 0;
+    }
+
+    // Deltas for the coordinate values (entries 0, 2, 4, 6).
+    float delta_x = temp_x * in_exp_x  * (1. / x[index + 1 * stride]);
+    float delta_y = temp_y * in_exp_y  * (1. / x[index + 3 * stride]);
+    float delta_w = temp_w * in_exp_w  * (1. / x[index + 5 * stride]);
+    float delta_h = temp_h * in_exp_h  * (1. / x[index + 7 * stride]);
+
+    // Deltas for the sigma entries (1, 3, 5, 7).
+    float delta_ux = temp_x * (in_exp_x_2 / x[index + 1 * stride] - 1. / (x[index + 1 * stride] + sigma_const));
+    float delta_uy = temp_y * (in_exp_y_2 / x[index + 3 * stride] - 1. / (x[index + 3 * stride] + sigma_const));
+    float delta_uw = temp_w * (in_exp_w_2 / x[index + 5 * stride] - 1. / (x[index + 5 * stride] + sigma_const));
+    float delta_uh = temp_h * (in_exp_h_2 / x[index + 7 * stride] - 1. / (x[index + 7 * stride] + sigma_const));
+
+    if (iou_loss != MSE) {
+        // GIoU
+        // The return value becomes the GIoU for every non-MSE loss kind.
+        iou = all_ious.giou;
+
+        // https://github.com/generalized-iou/g-darknet
+        // https://arxiv.org/abs/1902.09630v2
+        // https://giou.stanford.edu/
+        // https://arxiv.org/abs/1911.08287v1
+        // https://github.com/Zzh-tju/DIoU-darknet
+        all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss);
+
+        // NOTE: these locals shadow the residuals dx/dy/dw/dh declared above.
+        float dx, dy, dw, dh;
+
+        // The dt/db/dl/dr fields are used here as the x/y/w/h gradients.
+        dx = all_ious.dx_iou.dt;
+        dy = all_ious.dx_iou.db;
+        dw = all_ious.dx_iou.dl;
+        dh = all_ious.dx_iou.dr;
+
+        if (yolo_point == YOLO_CENTER) {
+        }
+        else if (yolo_point == YOLO_LEFT_TOP) {
+            dx = dx - dw/2;
+            dy = dy - dh/2;
+        }
+        else if (yolo_point == YOLO_RIGHT_BOTTOM) {
+            dx = dx + dw / 2;
+            dy = dy + dh / 2;
+        }
+
+        // jacobian^t (transpose)
+        //float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
+        //float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
+        //float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
+        //float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));
+
+        // predict exponential, apply gradient of e^delta_t ONLY for w,h
+        dw *= exp(x[index + 4 * stride]);
+        dh *= exp(x[index + 6 * stride]);
+
+        // The IoU-based gradients replace the Gaussian coordinate deltas;
+        // the sigma deltas (delta_u*) computed above are kept.
+        delta_x = dx;
+        delta_y = dy;
+        delta_w = dw;
+        delta_h = dh;
+    }
+
+    // normalize iou weight, for GIoU
+    delta_x *= iou_normalizer;
+    delta_y *= iou_normalizer;
+    delta_w *= iou_normalizer;
+    delta_h *= iou_normalizer;
+
+    // normalize Uncertainty weight
+    delta_ux *= uc_normalizer;
+    delta_uy *= uc_normalizer;
+    delta_uw *= uc_normalizer;
+    delta_uh *= uc_normalizer;
+
+    // Replace NaN / infinite deltas (e.g. from a division by a zero sigma)
+    // with 0.
+    delta_x = fix_nan_inf(delta_x);
+    delta_y = fix_nan_inf(delta_y);
+    delta_w = fix_nan_inf(delta_w);
+    delta_h = fix_nan_inf(delta_h);
+
+    delta_ux = fix_nan_inf(delta_ux);
+    delta_uy = fix_nan_inf(delta_uy);
+    delta_uw = fix_nan_inf(delta_uw);
+    delta_uh = fix_nan_inf(delta_uh);
+
+    // FLT_MAX acts as the "no clipping" sentinel.
+    if (max_delta != FLT_MAX) {
+        delta_x = clip_value(delta_x, max_delta);
+        delta_y = clip_value(delta_y, max_delta);
+        delta_w = clip_value(delta_w, max_delta);
+        delta_h = clip_value(delta_h, max_delta);
+
+        delta_ux = clip_value(delta_ux, max_delta);
+        delta_uy = clip_value(delta_uy, max_delta);
+        delta_uw = clip_value(delta_uw, max_delta);
+        delta_uh = clip_value(delta_uh, max_delta);
+    }
+
+    // Always added; with accumulate == 0 the entries were zeroed above.
+    delta[index + 0 * stride] += delta_x;
+    delta[index + 2 * stride] += delta_y;
+    delta[index + 4 * stride] += delta_w;
+    delta[index + 6 * stride] += delta_h;
+
+    delta[index + 1 * stride] += delta_ux;
+    delta[index + 3 * stride] += delta_uy;
+    delta[index + 5 * stride] += delta_uw;
+    delta[index + 7 * stride] += delta_uh;
+    return iou;
+}
+
+// Averages the accumulated box deltas of one prediction.
+//
+// Counts how many of the `classes` class deltas (delta[class_index + stride*c])
+// are strictly positive and, when that count is non-zero, divides each of the
+// eight box deltas (delta[box_index + k*stride], k = 0..7) by it.
+void averages_gaussian_yolo_deltas(int class_index, int box_index, int stride, int classes, float *delta)
+{
+    int positive_classes = 0;
+    int k;
+
+    for (k = 0; k < classes; ++k) {
+        if (delta[class_index + stride*k] > 0) ++positive_classes;
+    }
+
+    if (positive_classes <= 0) return;
+
+    for (k = 0; k < 8; ++k) {
+        delta[box_index + k * stride] /= positive_classes;
+    }
+}
+
+// Writes the class deltas (target - output) for one prediction.
+//
+// Class c of the prediction lives at index + stride*c in output and delta.
+//   - If delta[index] is already non-zero, only the entry of class_id is
+//     rewritten (target 1, optionally label-smoothed, scaled by
+//     classes_multipliers[class_id] when given) and the function returns.
+//   - Otherwise every class gets target 1 (class_id) or 0 (others), both
+//     label-smoothed when label_smooth_eps is non-zero; the class_id entry is
+//     additionally scaled by classes_multipliers[class_id] * cls_normalizer
+//     when classes_multipliers is given.
+// When avg_cat is non-NULL the output of class_id is added to *avg_cat.
+void delta_gaussian_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat, float label_smooth_eps, float *classes_multipliers, float cls_normalizer)
+{
+    int k;
+
+    if (delta[index] != 0) {
+        const int target = index + stride*class_id;
+        float y_true = 1;
+        if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+
+        delta[target] = y_true - output[target];
+        if (classes_multipliers) delta[target] *= classes_multipliers[class_id];
+        if (avg_cat) *avg_cat += output[target];
+        return;
+    }
+
+    for (k = 0; k < classes; ++k) {
+        const int pos = index + stride*k;
+        const int is_target = (k == class_id);
+        float y_true = (is_target ? 1 : 0);
+        if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+
+        delta[pos] = y_true - output[pos];
+
+        if (is_target) {
+            if (classes_multipliers) delta[pos] *= classes_multipliers[class_id] * cls_normalizer;
+            if (avg_cat) *avg_cat += output[pos];
+        }
+    }
+}
+
+// Returns 1 if any of the `classes` class scores of a prediction
+// (output[class_index + stride*c]) is greater than conf_thresh, 0 otherwise.
+// objectness and class_id are accepted but not used.
+int compare_gaussian_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id, float conf_thresh)
+{
+    int c = 0;
+    while (c < classes) {
+        const float score = output[class_index + stride*c];
+        if (score > conf_thresh) return 1;
+        ++c;
+    }
+    return 0;
+}
+
+// Returns the flat index of one entry of one prediction in the layer's
+// output/delta buffers.
+//   batch    - image index inside the batch
+//   location - anchor_index * (l.w*l.h) + cell_index
+//   entry    - channel inside the anchor's (8 + classes + 1) block
+static int entry_gaussian_index(layer l, int batch, int location, int entry)
+{
+    const int cells = l.w*l.h;
+    const int anchor = location / cells;
+    const int cell = location % cells;
+    const int per_anchor = cells*(8+l.classes+1);
+
+    return batch*l.outputs + anchor*per_anchor + entry*cells + cell;
+}
+
+void forward_gaussian_yolo_layer(const layer l, network_state state)
+{
+    int i,j,b,t,n;
+    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            // x : mu, sigma
+            int index = entry_gaussian_index(l, b, n*l.w*l.h, 0);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
+            scal_add_cpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1);    // scale x
+            // y : mu, sigma
+            index = entry_gaussian_index(l, b, n*l.w*l.h, 2);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
+            scal_add_cpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1);    // scale y
+            // w : sigma
+            index = entry_gaussian_index(l, b, n*l.w*l.h, 5);
+            activate_array(l.output + index, l.w*l.h, LOGISTIC);
+            // h : sigma
+            index = entry_gaussian_index(l, b, n*l.w*l.h, 7);
+            activate_array(l.output + index, l.w*l.h, LOGISTIC);
+            // objectness & class
+            index = entry_gaussian_index(l, b, n*l.w*l.h, 8);
+            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
+        }
+    }
+#endif
+
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if (!state.train) return;
+    float avg_iou = 0;
+    float recall = 0;
+    float recall75 = 0;
+    float avg_cat = 0;
+    float avg_obj = 0;
+    float avg_anyobj = 0;
+    int count = 0;
+    int class_count = 0;
+    *(l.cost) = 0;
+    for (b = 0; b < l.batch; ++b) {
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    const int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9);
+                    const int obj_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 8);
+                    const int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    const int stride = l.w*l.h;
+                    box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h, l.yolo_point);
+                    float best_match_iou = 0;
+                    int best_match_t = 0;
+                    float best_iou = 0;
+                    int best_t = 0;
+                    for(t = 0; t < l.max_boxes; ++t){
+                        box truth = float_to_box_stride(state.truth + t*l.truth_size + b*l.truths, 1);
+                        int class_id = state.truth[t*l.truth_size + b*l.truths + 4];
+                        if (class_id >= l.classes) {
+                            printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
+                            printf(" truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
+                            continue; // if label contains class_id more than number of classes in the cfg-file
+                        }
+                        if(!truth.x) break;
+
+
+                        float objectness = l.output[obj_index];
+                        int class_id_match = compare_gaussian_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);
+
+                        float iou = box_iou(pred, truth);
+                        if (iou > best_match_iou && class_id_match == 1) {
+                            best_match_iou = iou;
+                            best_match_t = t;
+                        }
+                        if (iou > best_iou) {
+                            best_iou = iou;
+                            best_t = t;
+                        }
+                    }
+
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = l.obj_normalizer * (0 - l.output[obj_index]);
+                    if (best_match_iou > l.ignore_thresh) {
+                        const float iou_multiplier = best_match_iou*best_match_iou;// (best_match_iou - l.ignore_thresh) / (1.0 - l.ignore_thresh);
+                        if (l.objectness_smooth) {
+                            l.delta[obj_index] = l.obj_normalizer * (iou_multiplier - l.output[obj_index]);
+
+                            int class_id = state.truth[best_match_t*l.truth_size + b*l.truths + 4];
+                            if (l.map) class_id = l.map[class_id];
+                            delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+                        }
+                        else l.delta[obj_index] = 0;
+                    }
+                    else if (state.net.adversarial) {
+                        float scale = pred.w * pred.h;
+                        if (scale > 0) scale = sqrt(scale);
+                        l.delta[obj_index] = scale * l.obj_normalizer * (0 - l.output[obj_index]);
+                        int cl_id;
+                        for (cl_id = 0; cl_id < l.classes; ++cl_id) {
+                            if (l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
+                                l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index + stride*cl_id]);
+                        }
+                    }
+                    if (best_iou > l.truth_thresh) {
+                        const float iou_multiplier = best_iou*best_iou;// (best_iou - l.truth_thresh) / (1.0 - l.truth_thresh);
+                        if (l.objectness_smooth) l.delta[obj_index] = l.obj_normalizer * (iou_multiplier - l.output[obj_index]);
+                        else l.delta[obj_index] = l.obj_normalizer * (1 - l.output[obj_index]);
+                        //l.delta[obj_index] = l.obj_normalizer * (1 - l.output[obj_index]);
+
+                        int class_id = state.truth[best_t*l.truth_size + b*l.truths + 4];
+                        if (l.map) class_id = l.map[class_id];
+                        delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                        if (l.objectness_smooth) l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
+                        box truth = float_to_box_stride(state.truth + best_t*l.truth_size + b*l.truths, 1);
+                        delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, l.uc_normalizer, 1, l.yolo_point, l.max_delta);
+                    }
+                }
+            }
+        }
+        for(t = 0; t < l.max_boxes; ++t){
+            box truth = float_to_box_stride(state.truth + t*l.truth_size + b*l.truths, 1);
+
+            if(!truth.x) break;
+            float best_iou = 0;
+            int best_n = 0;
+            i = (truth.x * l.w);
+            j = (truth.y * l.h);
+
+            if (l.yolo_point == YOLO_CENTER) {
+            }
+            else if (l.yolo_point == YOLO_LEFT_TOP) {
+                i = min_val_cmp(l.w-1, max_val_cmp(0, ((truth.x - truth.w / 2) * l.w)));
+                j = min_val_cmp(l.h-1, max_val_cmp(0, ((truth.y - truth.h / 2) * l.h)));
+            }
+            else if (l.yolo_point == YOLO_RIGHT_BOTTOM) {
+                i = min_val_cmp(l.w-1, max_val_cmp(0, ((truth.x + truth.w / 2) * l.w)));
+                j = min_val_cmp(l.h-1, max_val_cmp(0, ((truth.y + truth.h / 2) * l.h)));
+            }
+
+            box truth_shift = truth;
+            truth_shift.x = truth_shift.y = 0;
+            for(n = 0; n < l.total; ++n){
+                box pred = {0};
+                pred.w = l.biases[2*n]/ state.net.w;
+                pred.h = l.biases[2*n+1]/ state.net.h;
+                float iou = box_iou(pred, truth_shift);
+                if (iou > best_iou){
+                    best_iou = iou;
+                    best_n = n;
+                }
+            }
+
+            int mask_n = int_index(l.mask, best_n, l.n);
+            if(mask_n >= 0){
+                int class_id = state.truth[t*l.truth_size + b*l.truths + 4];
+                if (l.map) class_id = l.map[class_id];
+
+                int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
+                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, l.uc_normalizer, 1, l.yolo_point, l.max_delta);
+
+                int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8);
+                avg_obj += l.output[obj_index];
+                l.delta[obj_index] = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+
+                int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9);
+                delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+
+                ++count;
+                ++class_count;
+                if(iou > .5) recall += 1;
+                if(iou > .75) recall75 += 1;
+                avg_iou += iou;
+            }
+
+
+            // iou_thresh
+            for (n = 0; n < l.total; ++n) {
+                int mask_n = int_index(l.mask, n, l.n);
+                if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
+                    box pred = { 0 };
+                    pred.w = l.biases[2 * n] / state.net.w;
+                    pred.h = l.biases[2 * n + 1] / state.net.h;
+                    float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU, GIOU, MSE, DIOU, CIOU
+                    // iou, n
+
+                    if (iou > l.iou_thresh) {
+                        int class_id = state.truth[t*l.truth_size + b*l.truths + 4];
+                        if (l.map) class_id = l.map[class_id];
+
+                        int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
+                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                        float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, l.uc_normalizer, 1, l.yolo_point, l.max_delta);
+
+                        int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8);
+                        avg_obj += l.output[obj_index];
+                        l.delta[obj_index] = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+
+                        int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9);
+                        delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+
+                        ++count;
+                        ++class_count;
+                        if (iou > .5) recall += 1;
+                        if (iou > .75) recall75 += 1;
+                        avg_iou += iou;
+                    }
+                }
+            }
+        }
+
+        // averages the deltas obtained by the function: delta_yolo_box()_accumulate
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9);
+                    const int stride = l.w*l.h;
+
+                    averages_gaussian_yolo_deltas(class_index, box_index, stride, l.classes, l.delta);
+                }
+            }
+        }
+    }
+
+
+    // calculate: Classification-loss, IoU-loss and Uncertainty-loss
+    const int stride = l.w*l.h;
+    float* classification_lost = (float *)calloc(l.batch * l.outputs, sizeof(float));
+    memcpy(classification_lost, l.delta, l.batch * l.outputs * sizeof(float));
+
+
+    for (b = 0; b < l.batch; ++b) {
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+
+                    classification_lost[box_index + 0 * stride] = 0;
+                    classification_lost[box_index + 1 * stride] = 0;
+                    classification_lost[box_index + 2 * stride] = 0;
+                    classification_lost[box_index + 3 * stride] = 0;
+                    classification_lost[box_index + 4 * stride] = 0;
+                    classification_lost[box_index + 5 * stride] = 0;
+                    classification_lost[box_index + 6 * stride] = 0;
+                    classification_lost[box_index + 7 * stride] = 0;
+                }
+            }
+        }
+    }
+    float class_loss = pow(mag_array(classification_lost, l.outputs * l.batch), 2);
+    free(classification_lost);
+
+
+    float* except_uncertainty_lost = (float *)calloc(l.batch * l.outputs, sizeof(float));
+    memcpy(except_uncertainty_lost, l.delta, l.batch * l.outputs * sizeof(float));
+    for (b = 0; b < l.batch; ++b) {
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    except_uncertainty_lost[box_index + 4 * stride] = 0;
+                    except_uncertainty_lost[box_index + 5 * stride] = 0;
+                    except_uncertainty_lost[box_index + 6 * stride] = 0;
+                    except_uncertainty_lost[box_index + 7 * stride] = 0;
+                }
+            }
+        }
+    }
+    float except_uc_loss = pow(mag_array(except_uncertainty_lost, l.outputs * l.batch), 2);
+    free(except_uncertainty_lost);
+
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+
+    float loss = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+    float uc_loss = loss - except_uc_loss;
+    float iou_loss = except_uc_loss - class_loss;
+
+    loss /= l.batch;
+    class_loss /= l.batch;
+    uc_loss /= l.batch;
+    iou_loss /= l.batch;
+
+    fprintf(stderr, "Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d, class_loss = %.2f, iou_loss = %.2f, uc_loss = %.2f, total_loss = %.2f \n",
+        state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count,
+        class_loss, iou_loss, uc_loss, loss);
+}
+
+// CPU backward pass: feeds this layer's delta buffer into state.delta through
+// axpy_cpu (alpha = 1, unit strides) over batch * inputs elements.
+void backward_gaussian_yolo_layer(const layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    axpy_cpu(total, 1, l.delta, 1, state.delta, 1);
+}
+
+// Rescales the first n detection boxes from network-input coordinates to the
+// coordinates of a w x h image.
+//   netw, neth : network input size
+//   relative   : when zero, the boxes are additionally multiplied by w and h
+//   letter     : when non-zero, the image is assumed to occupy an
+//                aspect-preserving new_w x new_h area centred in the input;
+//                otherwise it fills the whole input
+void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter)
+{
+    int idx;
+    // Default: the image covers the full network input.
+    int new_w = netw;
+    int new_h = neth;
+
+    if (letter) {
+        // Shrink whichever side is not the limiting one.
+        if (((float)netw / w) < ((float)neth / h)) {
+            new_h = (h * netw) / w;
+        }
+        else {
+            new_w = (w * neth) / h;
+        }
+    }
+
+    for (idx = 0; idx < n; ++idx) {
+        box *b = &dets[idx].bbox;
+
+        // Remove the padding offset, then undo the scale, on each axis.
+        b->x = (b->x - (netw - new_w)/2./netw) / ((float)new_w/netw);
+        b->y = (b->y - (neth - new_h)/2./neth) / ((float)new_h/neth);
+        b->w *= (float)netw/new_w;
+        b->h *= (float)neth/new_h;
+
+        if (!relative) {
+            b->x *= w;
+            b->w *= w;
+            b->y *= h;
+            b->h *= h;
+        }
+    }
+}
+
+// Counts the predictions of batch item 0 whose objectness entry (entry 8)
+// is strictly greater than thresh.
+int gaussian_yolo_num_detections(layer l, float thresh)
+{
+    int cell, anchor;
+    int found = 0;
+    for (cell = 0; cell < l.w*l.h; ++cell) {
+        for (anchor = 0; anchor < l.n; ++anchor) {
+            const int obj_index = entry_gaussian_index(l, 0, anchor*l.w*l.h + cell, 8);
+            found += (l.output[obj_index] > thresh) ? 1 : 0;
+        }
+    }
+    return found;
+}
+
+/*
+void avg_flipped_gaussian_yolo(layer l)
+{
+    int i,j,n,z;
+    float *flip = l.output + l.outputs;
+    for (j = 0; j < l.h; ++j) {
+        for (i = 0; i < l.w/2; ++i) {
+            for (n = 0; n < l.n; ++n) {
+                for(z = 0; z < l.classes + 8 + 1; ++z){
+                    int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                    int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                    float swap = flip[i1];
+                    flip[i1] = flip[i2];
+                    flip[i2] = swap;
+                    if(z == 0){
+                        flip[i1] = -flip[i1];
+                        flip[i2] = -flip[i2];
+                    }
+                }
+            }
+        }
+    }
+    for(i = 0; i < l.outputs; ++i){
+        l.output[i] = (l.output[i] + flip[i])/2.;
+    }
+}
+*/
+
+// Collects the detections of batch item 0 into dets and returns how many were
+// written. A prediction is kept when its objectness is strictly greater than
+// thresh (NaN objectness is skipped). For every kept prediction the box, the
+// four coordinate uncertainties and the per-class scores are stored; a class
+// score is objectness * class probability * (1 - mean uncertainty) and is
+// zeroed unless it exceeds thresh. The boxes are finally mapped to image
+// coordinates by correct_gaussian_yolo_boxes().
+// `map` is not used by this function.
+int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter)
+{
+    float *predictions = l.output;
+    int cell, anchor, cls;
+    int count = 0;
+
+    for (cell = 0; cell < l.w*l.h; ++cell) {
+        const int row = cell / l.w;
+        const int col = cell % l.w;
+
+        for (anchor = 0; anchor < l.n; ++anchor) {
+            const int location = anchor*l.w*l.h + cell;
+            const float objectness = predictions[entry_gaussian_index(l, 0, location, 8)];
+
+            // Written as a negated '>' so that a NaN objectness is rejected too.
+            if (!(objectness > thresh)) continue;
+
+            detection *det = &dets[count];
+            const int box_index = entry_gaussian_index(l, 0, location, 0);
+            det->bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[anchor], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h, l.yolo_point);
+            det->objectness = objectness;
+            det->classes = l.classes;
+
+            det->uc[0] = predictions[entry_gaussian_index(l, 0, location, 1)]; // tx uncertainty
+            det->uc[1] = predictions[entry_gaussian_index(l, 0, location, 3)]; // ty uncertainty
+            det->uc[2] = predictions[entry_gaussian_index(l, 0, location, 5)]; // tw uncertainty
+            det->uc[3] = predictions[entry_gaussian_index(l, 0, location, 7)]; // th uncertainty
+
+            det->points = l.yolo_point;
+
+            // The mean uncertainty is the same for every class of this box.
+            const float uc_aver = (det->uc[0] + det->uc[1] + det->uc[2] + det->uc[3]) / 4.0;
+            for (cls = 0; cls < l.classes; ++cls) {
+                const int class_index = entry_gaussian_index(l, 0, location, 9 + cls);
+                const float prob = objectness*predictions[class_index] * (1.0 - uc_aver);
+                det->prob[cls] = (prob > thresh) ? prob : 0;
+            }
+            ++count;
+        }
+    }
+
+    correct_gaussian_yolo_boxes(dets, count, w, h, netw, neth, relative, letter);
+    return count;
+}
+
+#ifdef GPU
+
+// GPU forward pass.
+// 1. Copies the input to l.output_gpu and applies the LOGISTIC activation to
+//    the x/y entries (mu and sigma), the w/h sigma entries and the
+//    objectness + class entries; x and y mu are then rescaled with scale_x_y.
+// 2. Inference (or l.onlyforward): pulls the result to l.output asynchronously
+//    and returns.
+// 3. Training: pulls output and truth to host memory, runs the CPU
+//    forward_gaussian_yolo_layer() on them and pushes l.delta to the GPU.
+void forward_gaussian_yolo_layer_gpu(const layer l, network_state state)
+{
+    const int plane = l.w*l.h;
+    int b, n;
+
+    copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
+
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            int index;
+
+            // x : mu, sigma
+            index = entry_gaussian_index(l, b, n*plane, 0);
+            activate_array_ongpu(l.output_gpu + index, 2*plane, LOGISTIC);
+            scal_add_ongpu(plane, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output_gpu + index, 1);      // scale x
+
+            // y : mu, sigma
+            index = entry_gaussian_index(l, b, n*plane, 2);
+            activate_array_ongpu(l.output_gpu + index, 2*plane, LOGISTIC);
+            scal_add_ongpu(plane, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output_gpu + index, 1);      // scale y
+
+            // w : sigma
+            index = entry_gaussian_index(l, b, n*plane, 5);
+            activate_array_ongpu(l.output_gpu + index, plane, LOGISTIC);
+
+            // h : sigma
+            index = entry_gaussian_index(l, b, n*plane, 7);
+            activate_array_ongpu(l.output_gpu + index, plane, LOGISTIC);
+
+            // objectness & class
+            index = entry_gaussian_index(l, b, n*plane, 8);
+            activate_array_ongpu(l.output_gpu + index, (1 + l.classes)*plane, LOGISTIC);
+        }
+    }
+
+    if (!state.train || l.onlyforward) {
+        cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
+        CHECK_CUDA(cudaPeekAtLastError());
+        return;
+    }
+
+    // NOTE(review): in_cpu is sized with l.inputs but filled with l.outputs
+    // elements; this is only safe while inputs == outputs for this layer.
+    float *in_cpu = (float *)calloc(l.batch*l.inputs, sizeof(float));
+    cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+    memcpy(in_cpu, l.output, l.batch*l.outputs * sizeof(float));
+
+    float *truth_cpu = 0;
+    if (state.truth) {
+        const int num_truth = l.batch*l.truths;
+        truth_cpu = (float *)calloc(num_truth, sizeof(float));
+        cuda_pull_array(state.truth, truth_cpu, num_truth);
+    }
+
+    // The struct copy already carries net, index and train; only the two
+    // device pointers have to be replaced by their host copies.
+    network_state cpu_state = state;
+    cpu_state.truth = truth_cpu;
+    cpu_state.input = in_cpu;
+    forward_gaussian_yolo_layer(l, cpu_state);
+
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+    free(in_cpu);
+    free(truth_cpu);    // free(NULL) is a no-op
+}
+
+// GPU backward pass: feeds l.delta_gpu, weighted by l.delta_normalizer, into
+// state.delta through axpy_ongpu over batch * inputs elements.
+void backward_gaussian_yolo_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    axpy_ongpu(total, l.delta_normalizer, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/darknet-master/src/gaussian_yolo_layer.h b/darknet-master/src/gaussian_yolo_layer.h
new file mode 100644
index 0000000..9080881
--- /dev/null
+++ b/darknet-master/src/gaussian_yolo_layer.h
@@ -0,0 +1,22 @@
+// Gaussian YOLOv3 layer: public interface (GPU entry points only when GPU is defined)
+#ifndef GAUSSIAN_YOLO_LAYER_H
+#define GAUSSIAN_YOLO_LAYER_H
+
+#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+
+layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes);
+void forward_gaussian_yolo_layer(const layer l, network_state state);    // CPU forward pass (fills l.delta and *(l.cost))
+void backward_gaussian_yolo_layer(const layer l, network_state state);   // CPU backward pass (l.delta -> state.delta via axpy_cpu)
+void resize_gaussian_yolo_layer(layer *l, int w, int h);
+int gaussian_yolo_num_detections(layer l, float thresh);   // number of batch-0 predictions with objectness > thresh
+int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter);   // fills dets, returns the count
+void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter);   // network-input coords -> image coords
+
+#ifdef GPU
+void forward_gaussian_yolo_layer_gpu(const layer l, network_state state);   // GPU forward pass; training falls back to the CPU forward pass
+void backward_gaussian_yolo_layer_gpu(layer l, network_state state);        // GPU backward pass (l.delta_gpu -> state.delta via axpy_ongpu)
+#endif
+
+#endif   // GAUSSIAN_YOLO_LAYER_H
diff --git a/darknet-master/src/gemm.c b/darknet-master/src/gemm.c
new file mode 100644
index 0000000..256061b
--- /dev/null
+++ b/darknet-master/src/gemm.c
@@ -0,0 +1,2850 @@
+#include "gemm.h"
+#include "utils.h"
+#include "im2col.h"
+#include "dark_cuda.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include <stdint.h>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+#if defined(_M_ARM) || defined(_M_ARM64)
+// Software population count: number of set bits in a 32-bit word.
+// Sums bits in parallel per 2-bit, 4-bit and 8-bit field, then adds the four
+// byte counts with one multiplication (the total ends up in the top byte).
+static inline uint32_t popcnt(uint32_t v) {
+  const uint32_t pairs = v - ((v >> 1) & 0x55555555u);
+  const uint32_t nibbles = (pairs & 0x33333333u) + ((pairs >> 2) & 0x33333333u);
+  const uint32_t bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0Fu;
+  return (bytes * 0x01010101u) >> 24;
+}
+#define POPCNT(x) popcnt((x))
+#define POPCNT64(x) (popcnt((unsigned)(x)) + popcnt((unsigned)((uint64_t)(x) >> 32)))
+#else
+#include <intrin.h>
+#ifdef _WIN64
+#define POPCNT(x) __popcnt(x)
+#define POPCNT64(x) __popcnt64(x)
+#else
+// 64-bit population count for 32-bit MSVC builds: counts the low and the
+// high 32-bit halves separately with __popcnt and adds the results.
+static inline int popcnt_64(uint64_t val64) {
+  const unsigned int low_half = (unsigned int)val64;
+  const unsigned int high_half = (unsigned int)(val64 >> 32);
+  return (int)(__popcnt(low_half) + __popcnt(high_half));
+}
+#define POPCNT(x) __popcnt(x)
+#define POPCNT64(x) popcnt_64(x)
+#endif
+#endif
+#elif defined(__GNUC__)
+#define POPCNT(x) __builtin_popcount(x)
+#define POPCNT64(x) __builtin_popcountll(x)
+#endif
+
+#define TILE_M 4 // 4 ops
+#define TILE_N 16 // AVX2 = 2 ops * 8 floats
+#define TILE_K 16 // loop
+#ifdef __cplusplus
+#define PUT_IN_REGISTER
+#else
+#define PUT_IN_REGISTER register
+#endif
+
+// Accumulating GEMM with a binary left operand:
+//   C[i][j] += B[k][j] when A[i][k] is non-zero, C[i][j] -= B[k][j] otherwise,
+// for i < M, k < K, j < N. lda/ldb/ldc are the row strides of A/B/C.
+// ALPHA is accepted for signature compatibility but is not used.
+void gemm_bin(int M, int N, int K, float ALPHA,
+        char  *A, int lda,
+        float *B, int ldb,
+        float *C, int ldc)
+{
+    int row, inner, col;
+    for (row = 0; row < M; ++row) {
+        const int c_off = row*ldc;
+        for (inner = 0; inner < K; ++inner) {
+            const int b_off = inner*ldb;
+            if (A[row*lda + inner]) {
+                for (col = 0; col < N; ++col) C[c_off + col] += B[b_off + col];
+            }
+            else {
+                for (col = 0; col < N; ++col) C[c_off + col] -= B[b_off + col];
+            }
+        }
+    }
+}
+
+// Allocates a rows x cols float matrix (via xcalloc) and fills every element
+// with (float)rand()/RAND_MAX. The caller owns the returned buffer.
+float *random_matrix(int rows, int cols)
+{
+    float* values = (float*)xcalloc(rows * cols, sizeof(float));
+    int idx = 0;
+    while (idx < rows*cols) {
+        values[idx] = (float)rand()/RAND_MAX;
+        ++idx;
+    }
+    return values;
+}
+
+// Benchmarks gemm_cpu on random matrices and prints the elapsed CPU time.
+//   TA, TB  : non-zero when A / B are stored transposed
+//   m, k, n : logical sizes, C(m x n) += A(m x k) * B(k x n)
+// The reported time covers all 10 gemm_cpu calls and is given in
+// milliseconds, matching the "ms" label of the message (the clock() tick
+// difference divided by CLOCKS_PER_SEC alone would be seconds).
+void time_random_matrix(int TA, int TB, int m, int k, int n)
+{
+    // Allocation order a, b, c is kept so the rand() sequence is unchanged.
+    float *a = (!TA) ? random_matrix(m,k) : random_matrix(k,m);
+    const int lda = (!TA)?k:m;
+    float *b = (!TB) ? random_matrix(k,n) : random_matrix(n,k);
+    const int ldb = (!TB)?n:k;
+    float *c = random_matrix(m,n);
+
+    int i;
+    clock_t start = clock(), end;
+    for(i = 0; i<10; ++i){
+        gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
+    }
+    end = clock();
+
+    const double elapsed_ms = 1000.0 * (double)(end-start) / CLOCKS_PER_SEC;
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, elapsed_ms);
+
+    free(a);
+    free(b);
+    free(c);
+}
+
+
+// Public GEMM entry point: forwards every argument unchanged to gemm_cpu().
+void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{
+    gemm_cpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc);
+}
+
+
+//--------------------------------------------
+// XNOR bitwise GEMM for binary neural network
+//--------------------------------------------
+
+
+// Logical XNOR of two bytes: 1 when a and b are equal, 0 otherwise.
+static inline unsigned char xnor(unsigned char a, unsigned char b) {
+    return (unsigned char)(a == b);
+}
+
+// INT-32
+// Returns bit number `index` (0 or 1) of a bit array stored as 32-bit words;
+// bit 0 is the least significant bit of src[0].
+// The word is shifted right instead of building the mask with `1 << shift`,
+// because left-shifting a signed 1 by 31 is undefined behaviour.
+static inline uint32_t get_bit_int32(uint32_t const*const src, size_t index) {
+    const size_t word = index / 32;
+    const unsigned shift = (unsigned)(index % 32);
+    return (src[word] >> shift) & (uint32_t)1;
+}
+
+// Bitwise XNOR of two 32-bit words: a result bit is 1 where a and b agree.
+static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
+    const uint32_t differing = a ^ b;
+    return ~differing;
+}
+
+// Bitwise XNOR of two 64-bit words: a result bit is 1 where a and b agree.
+static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
+    const uint64_t differing = a ^ b;
+    return ~differing;
+}
+
+
+// Expands a boolean to a full 32-bit mask: all zeros when src is 0,
+// all ones for any other value.
+static inline uint32_t fill_bit_int32(char src) {
+    return (src != 0) ? 0xFFFFFFFF : 0x00000000;
+}
+
+// Expands a boolean to a full 64-bit mask: all zeros when src is 0,
+// all ones for any other value.
+static inline uint64_t fill_bit_int64(char src) {
+    return (src != 0) ? 0xFFFFFFFFFFFFFFFF : 0x0000000000000000;
+}
+
+// Prints the 32 bits of src to stdout, least significant bit first,
+// followed by a newline.
+void binary_int32_printf(uint32_t src) {
+    int bit;
+    for (bit = 0; bit < 32; ++bit) {
+        if ((src >> bit) & 1) printf("1");
+        else printf("0");
+    }
+    printf("\n");
+}
+
+// Prints the 64 bits of src to stdout, least significant bit first,
+// followed by a newline.
+void binary_int64_printf(uint64_t src) {
+    int bit;
+    for (bit = 0; bit < 64; ++bit) {
+        if ((src >> bit) & 1) printf("1");
+        else printf("0");
+    }
+    printf("\n");
+}
+
+/*
+void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int *count_arr = xcalloc(M*N, sizeof(int));
+
+    int i, j, k;
+    for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        for (k = 0; k < K; ++k) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+            char a_bit = get_bit(A, i*lda + k);
+
+            for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
+                char b_bit = get_bit(B, k*ldb + j);
+                count_arr[i*ldc + j] += xnor(a_bit, b_bit);
+            }
+        }
+    }
+
+    for (i = 0; i < M; ++i) {
+        float mean_val = mean_arr[i];
+        for (j = 0; j < N; ++j) {
+            C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
+        }
+    }
+    free(count_arr);
+}
+*/
+
+/*
+void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int *count_arr = xcalloc(M*N, sizeof(int));
+
+    int i, j, k;
+    for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
+            for (k = 0; k < K; ++k) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                char a_bit = get_bit(A, i*lda + k);
+                char b_bit = get_bit(B, j*ldb + k);
+                count_arr[i*ldc + j] += xnor(a_bit, b_bit);
+            }
+        }
+    }
+
+    for (i = 0; i < M; ++i) {
+        float mean_val = mean_arr[i];
+        for (j = 0; j < N; ++j) {
+            C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
+        }
+    }
+    free(count_arr);
+}
+*/
+
+/*
+void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int *count_arr = xcalloc(M*N, sizeof(int));
+
+    int i;
+
+#pragma omp parallel for
+    for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
+        for (k = 0; k < K; ++k) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+            const char a_bit = get_bit(A, i*lda + k);
+            uint64_t a_bit64 = fill_bit_int64(a_bit);
+            int  k_ldb = k*ldb;
+
+            for (j = 0; j < N; j += 64) { // out_h*out_w - one channel output size [169 - 173056]
+                if ((N - j > 64) && (k_ldb % 8 == 0)) {
+                    uint64_t b_bit64 = *((uint64_t *)(B + (k_ldb + j) / 8));
+                    uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+                    //printf("\n %d \n",__builtin_popcountll(c_bit64)); // gcc
+                    printf("\n %d \n", POPCNT64(c_bit64));    // msvs
+
+                    int h;
+                    for (h = 0; h < 64; ++h)
+                        if ((c_bit64 >> h) & 1) count_arr[i*ldc + j + h] += 1;
+
+                    //binary_int64_printf(a_bit64);
+                    //binary_int64_printf(b_bit64);
+                    //binary_int64_printf(c_bit64);
+                }
+                else {
+                    for (; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
+                        char b_bit = get_bit(B, k_ldb + j);
+                        if (xnor(a_bit, b_bit)) count_arr[i*ldc + j] += 1;
+                    }
+                }
+
+            }
+        }
+    }
+
+    if (mean_arr) {
+        //int K_2 = K / 2;
+        for (i = 0; i < M; ++i) {
+            float mean_val = mean_arr[i];
+            //float mean_val2 = 2 * mean_val;
+            for (j = 0; j < N; ++j) {
+                C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
+                //C[i*ldc + j] = (count_arr[i*ldc + j] - K_2) *mean_val2;
+            }
+        }
+    }
+    else {
+        for (i = 0; i < M; ++i) {
+            for (j = 0; j < N; ++j) {
+                C[i*ldc + j] = count_arr[i*ldc + j] - K / 2;
+            }
+        }
+    }
+
+    free(count_arr);
+}
+*/
+
+
+/*
+void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int i;
+
+#pragma omp parallel for
+    for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
+        float mean_val = mean_arr[i];
+
+        for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
+            int count = 0;
+
+            for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));
+                uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));
+                uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+
+                int tmp_count = POPCNT64(c_bit64);
+
+                if (K - k < 64)  tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
+                count += tmp_count;
+                //binary_int64_printf(c_bit64);
+                //printf(", count = %d \n\n", tmp_count);
+            }
+
+            C[i*ldc + j] = (2 * count - K) * mean_val;
+        }
+    }
+}
+*/
+
+//----------------------------
+
+// is not used
+/*
+void transpose_32x32_bits_my(uint32_t *A, uint32_t *B, int lda, int ldb)
+{
+    unsigned int x, y;
+    for (y = 0; y < 32; ++y) {
+        for (x = 0; x < 32; ++x) {
+            if (A[y * lda] & ((uint32_t)1 << x)) B[x * ldb] |= (uint32_t)1 << y;
+        }
+    }
+}
+*/
+
+#ifndef GPU
+// Reverses the bit order of a byte (bit 0 <-> bit 7, bit 1 <-> bit 6, ...)
+// by swapping progressively smaller groups.
+uint8_t reverse_8_bit(uint8_t a) {
+    a = (uint8_t)((a >> 4) | (a << 4));                     // swap the two nibbles
+    a = (uint8_t)(((a & 0xCC) >> 2) | ((a & 0x33) << 2));   // swap bit pairs inside each nibble
+    a = (uint8_t)(((a & 0xAA) >> 1) | ((a & 0x55) << 1));   // swap neighbouring bits
+    return a;
+}
+
+// Reverses the bit order of a 32-bit word: every byte is bit-reversed with
+// reverse_8_bit() and the byte order is mirrored.
+// Each reversed byte is widened to uint32_t before shifting; a uint8_t is
+// otherwise promoted to (signed) int, and shifting a value >= 0x80 left by 24
+// overflows int, which is undefined behaviour.
+uint32_t reverse_32_bit(uint32_t a)
+{
+    // unsigned int __rbit(unsigned int val) // for ARM    //__asm__("rbit %0, %1\n" : "=r"(output) : "r"(input));
+    const uint32_t from_byte3 = reverse_8_bit((uint8_t)(a >> 24));
+    const uint32_t from_byte2 = reverse_8_bit((uint8_t)(a >> 16));
+    const uint32_t from_byte1 = reverse_8_bit((uint8_t)(a >> 8));
+    const uint32_t from_byte0 = reverse_8_bit((uint8_t)(a >> 0));
+    return (from_byte3 << 0) |
+        (from_byte2 << 8) |
+        (from_byte1 << 16) |
+        (from_byte0 << 24);
+}
+
+#define swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);
+
+// In-place transform of a 32x32 bit matrix held as 32 words.
+// Step 1: five block-swap passes with distances 16, 8, 4, 2 and 1 (swap macro).
+// Step 2: the row order is reversed and every row is bit-reversed.
+// NOTE(review): presumably this yields the transpose about the anti-diagonal
+// ("reversed diagonale" in the caller's name) - confirm before relying on it.
+void transpose32_optimized(uint32_t A[32]) {
+    int j, k;
+    unsigned m, t;
+
+    // The mask starts at 0x0000FFFF and is refined with the already halved j:
+    // 0x00FF00FF, 0x0F0F0F0F, 0x33333333, 0x55555555.
+    m = 0x0000FFFF;
+    for (j = 16; j != 0; j = j >> 1, m = m ^ (m << j)) {
+        for (k = 0; k < 32; k = (k + j + 1) & ~j) {
+            swap(A[k], A[k + j], j, m);
+        }
+    }
+
+    // reverse Y
+    for (k = 0; k < 16; ++k) {
+        const uint32_t from_bottom = reverse_32_bit(A[31 - k]);
+        const uint32_t from_top = reverse_32_bit(A[k]);
+        A[k] = from_bottom;
+        A[31 - k] = from_top;
+    }
+}
+
+// Transforms one 32x32 bit block: gathers 32 words from A (stride m words),
+// runs transpose32_optimized() on the private copy and scatters the result
+// to B (stride n words).
+void transpose_32x32_bits_reversed_diagonale(uint32_t *A, uint32_t *B, int m, int n)
+{
+    uint32_t block[32];
+    int row;
+
+    #pragma unroll
+    for (row = 0; row < 32; ++row) block[row] = A[row * m];
+
+    transpose32_optimized(block);
+
+    #pragma unroll
+    for (row = 0; row < 32; ++row) B[row * n] = block[row];
+}
+
+
+// Straightforward 8x8 bit transpose: bit x of source row y (A[y*lda]) sets
+// bit y of destination row x (B[x*ldb]). Bits are only OR-ed into B, so the
+// destination must be cleared by the caller beforehand.
+void transpose_8x8_bits_my(unsigned char *A, unsigned char *B, int lda, int ldb)
+{
+    unsigned src_row, bit;
+    for (src_row = 0; src_row < 8; ++src_row) {
+        for (bit = 0; bit < 8; ++bit) {
+            if ((A[src_row * lda] >> bit) & 1) {
+                B[bit * ldb] |= 1 << src_row;
+            }
+        }
+    }
+}
+
+// Reverses the bit order of a byte, one bit at a time: source bit `bit`
+// becomes result bit 7 - bit.
+unsigned char reverse_byte_1(char a)
+{
+    unsigned char mirrored = 0;
+    int bit;
+    for (bit = 0; bit < 8; ++bit) {
+        if (a & (1 << bit)) mirrored |= (unsigned char)(0x80 >> bit);
+    }
+    return mirrored;
+}
+
+// Reverses the bit order of a byte with the multiply-and-mask trick: two
+// multiplications spread copies of the byte, the masks keep one bit per
+// position, and the final multiply/shift gathers them in reversed order.
+unsigned char reverse_byte(unsigned char a)
+{
+    const unsigned long picked_a = (a * 0x0802LU) & 0x22110LU;
+    const unsigned long picked_b = (a * 0x8020LU) & 0x88440LU;
+    return (unsigned char)(((picked_a | picked_b) * 0x10101LU) >> 16);
+}
+
+// lookup[v] is the 4-bit value v with its bit order reversed.
+static unsigned char lookup[16] = {
+    0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+    0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, };
+
+// Reverses the bit order of a byte with the nibble table above.
+// The low-nibble mask is written as 0x0F: the former binary literal (0b1111)
+// is a compiler extension before C23 and is not accepted by every C compiler.
+unsigned char reverse_byte_3(unsigned char n) {
+    // Reverse the top and bottom nibble then swap them.
+    return (lookup[n & 0x0F] << 4) | lookup[n >> 4];
+}
+
+
+void transpose8rS32_reversed_diagonale(unsigned char* A, unsigned char* B, int m, int n)   // 8x8 bit-block variant; writes 8 bytes to B with stride n
+{
+    unsigned x, y, t;   // x and y each pack four 8-bit rows; t is scratch
+
+    x = y = 0;   // NOTE(review): with the load below disabled, x and y stay 0, so A and m are never read
+    // Load the array and pack it into x and y (currently disabled).
+    //x = (A[0] << 24) | (A[m] << 16) | (A[2 * m] << 8) | A[3 * m];
+    //y = (A[4 * m] << 24) | (A[5 * m] << 16) | (A[6 * m] << 8) | A[7 * m];
+
+    t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);   // exchange bits 7 positions apart (mask 0x00AA00AA)
+    t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);
+
+    t = (x ^ (x >> 14)) & 0x0000CCCC;  x = x ^ t ^ (t << 14);   // exchange bit pairs 14 positions apart (mask 0x0000CCCC)
+    t = (y ^ (y >> 14)) & 0x0000CCCC;  y = y ^ t ^ (t << 14);
+
+    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);   // combine the high nibbles of x with those of y
+    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);   // combine the low nibbles of x with those of y
+    x = t;
+
+    B[7 * n] = reverse_byte(x >> 24);  B[6 * n] = reverse_byte(x >> 16);  B[5 * n] = reverse_byte(x >> 8);  B[4 * n] = reverse_byte(x);   // rows 7..4: bit-reversed bytes of x
+    B[3 * n] = reverse_byte(y >> 24);  B[2 * n] = reverse_byte(y >> 16);  B[1 * n] = reverse_byte(y >> 8);  B[0 * n] = reverse_byte(y);   // rows 3..0: bit-reversed bytes of y; as written every stored byte is 0
+}
+
+/*
+// transpose by 8-bit
+void transpose_bin(char *A, char *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    //printf("\n n = %d, ldb = %d \t\t m = %d, lda = %d \n", n, ldb, m, lda);
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; i += 8) {
+        int j;
+        for (j = 0; j < m; j += 8) {
+            int a_index = i*lda + j;
+            int b_index = j*ldb + i;
+            //transpose_8x8_bits_my(&A[a_index/8], &B[b_index/8], lda/8, ldb/8);
+            transpose8rS32_reversed_diagonale(&A[a_index / 8], &B[b_index / 8], lda / 8, ldb / 8);
+        }
+        for (; j < m; ++j) {
+            if (get_bit(A, i*lda + j)) set_bit(B, j*ldb + i);
+        }
+    }
+}
+*/
+
+#endif
+
+// Transposes a bit matrix in 32x32 bit blocks.
+//   A, B     : source / destination bit arrays, addressed as 32-bit words
+//   n, m     : extents walked by the outer and inner loop (in bits)
+//   lda, ldb : row strides of A and B in bits
+//   block_size is not used.
+// Every block is handled by transpose_32x32_bits_reversed_diagonale(); the
+// outer loop is parallelised with OpenMP.
+void transpose_bin(uint32_t *A, uint32_t *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    const int a_stride_words = lda / 32;
+    const int b_stride_words = ldb / 32;
+    int row;
+    #pragma omp parallel for
+    for (row = 0; row < n; row += 32) {
+        int col;
+        for (col = 0; col < m; col += 32) {
+            const int a_bit = row*lda + col;
+            const int b_bit = col*ldb + row;
+            transpose_32x32_bits_reversed_diagonale(&A[a_bit / 32], &B[b_bit / 32], a_stride_words, b_stride_words);
+        }
+        // Bit-wise tail; col is already >= m here, so this loop body never runs.
+        for (; col < m; ++col) {
+            if (get_bit((const unsigned char* const)A, row * lda + col)) set_bit((unsigned char* const)B, col * ldb + row);
+        }
+    }
+}
+
+#if (defined(__AVX__) && defined(__x86_64__)) || (defined(_WIN64) && !defined(__MINGW32__) && !defined(_M_ARM64))
+
+#if (defined(_WIN64) && !defined(__MINGW64__))
+#include <intrin.h>
+#include <ammintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+// Fallback for MSVC toolsets up to _MSC_VER 1900: returns 64-bit lane `index`
+// of a.
+// The return type is __int64 (as for the real intrinsic); the previous __int32
+// return type silently dropped the upper half of the lane.
+static inline __int64 _mm256_extract_epi64(__m256i a, const int index) {
+    return a.m256i_i64[index];
+}
+
+// Fallback for MSVC toolsets up to _MSC_VER 1900: returns 32-bit lane `index`
+// of a.
+static inline __int32 _mm256_extract_epi32(__m256i a, const int index) {
+    const __int32 lane = a.m256i_i32[index];
+    return lane;
+}
+#endif
+
+// Reinterprets the 32 bits of `a` as a float (bit cast, no numeric conversion).
+// memcpy is used instead of dereferencing a float* that points at a uint32_t,
+// which violates the strict-aliasing rule; compilers turn this memcpy into a
+// plain register move.
+static inline float _dn_castu32_f32(uint32_t a) {
+    float reinterpreted;
+    memcpy(&reinterpreted, &a, sizeof(reinterpreted));
+    return reinterpreted;
+}
+
+// Returns float lane `index` of a (MSVC build: direct member access).
+static inline float _mm256_extract_float32(__m256 a, const int index) {
+    const float lane = a.m256_f32[index];
+    return lane;
+}
+
+#else    // Linux GCC/Clang
+#include <x86intrin.h>
+#include <ammintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <cpuid.h>
+
+// Reinterprets the 32 bits of `a` as a float (bit cast, no numeric conversion).
+// memcpy is used instead of dereferencing a float* that points at a uint32_t,
+// which violates the strict-aliasing rule; compilers turn this memcpy into a
+// plain register move.
+static inline float _dn_castu32_f32(uint32_t a) {
+    float reinterpreted;
+    memcpy(&reinterpreted, &a, sizeof(reinterpreted));
+    return reinterpreted;
+}
+
+// Returns float lane `index` (0..7) of a; any other index yields lane 0.
+// The vector is spilled to a small array so the lane can be chosen at run
+// time (the extract intrinsics need a compile-time constant lane number).
+static inline float _mm256_extract_float32(__m256 a, const int index) {
+    float lanes[8];
+    _mm256_storeu_ps(lanes, a);
+    return lanes[(index >= 0 && index < 8) ? index : 0];
+}
+
+void asm_cpuid(uint32_t* abcd, uint32_t eax)   // runs CPUID for the leaf given in eax; stores EAX, EBX, ECX, EDX in abcd[0..3]
+{
+    uint32_t ebx = 0, edx = 0, ecx = 0;   // ecx is also an input to the asm ("+c") and enters as 0
+
+    // EBX is saved to EDI and later restored
+    __asm__("movl %%ebx, %%edi;"   // keep the caller's EBX in EDI
+        "cpuid;"
+        "xchgl %%ebx, %%edi;"      // EDI <- CPUID's EBX result, EBX <- saved value
+        : "=D"(ebx),               // the variable ebx is read back from EDI
+        "+a"(eax), "+c"(ecx), "=d"(edx));
+
+    abcd[0] = eax;
+    abcd[1] = ebx;
+    abcd[2] = ecx;
+    abcd[3] = edx;
+}
+#endif
+
+
+
+#ifdef _WIN32
+//  Windows
+#define cpuid(info, x)    __cpuidex(info, x, 0)
+#else
+//  GCC Intrinsics
+// Runs CPUID for leaf InfoType with sub-leaf 0 and stores the resulting
+// EAX, EBX, ECX, EDX values in info[0..3].
+void cpuid(int info[4], int InfoType) {
+    const int subleaf = 0;
+    __cpuid_count(InfoType, subleaf, info[0], info[1], info[2], info[3]);
+}
+#endif
+
+
+//  Misc.
+static int HW_MMX, HW_x64, HW_RDRAND, HW_BMI1, HW_BMI2, HW_ADX, HW_PREFETCHWT1;
+static int HW_ABM;      // Advanced Bit Manipulation
+
+//  SIMD: 128-bit
+static int HW_SSE, HW_SSE2, HW_SSE3, HW_SSSE3, HW_SSE41, HW_SSE42, HW_SSE4a, HW_AES, HW_SHA;
+
+//  SIMD: 256-bit
+static int HW_AVX, HW_XOP, HW_FMA3, HW_FMA4, HW_AVX2;
+
+//  SIMD: 512-bit
+static int HW_AVX512F;    //  AVX512 Foundation
+static int HW_AVX512CD;   //  AVX512 Conflict Detection
+static int HW_AVX512PF;   //  AVX512 Prefetch
+static int HW_AVX512ER;   //  AVX512 Exponential + Reciprocal
+static int HW_AVX512VL;   //  AVX512 Vector Length Extensions
+static int HW_AVX512BW;   //  AVX512 Byte + Word
+static int HW_AVX512DQ;   //  AVX512 Doubleword + Quadword
+static int HW_AVX512IFMA; //  AVX512 Integer 52-bit Fused Multiply-Add
+static int HW_AVX512VBMI; //  AVX512 Vector Byte Manipulation Instructions
+
+// https://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set
+
+// Returns 1 when bit number `bit` of the CPUID output word `reg` is set, else 0.
+static inline int cpuid_bit_set(int reg, int bit) {
+    return (reg & ((uint32_t)1 << bit)) != 0;
+}
+
+// Probes the processor through cpuid() and stores one 0/1 value per
+// instruction-set extension in the corresponding HW_* variable.
+// Leaves 1 and 7 are only queried when leaf 0 reports them as available, and
+// extended leaf 0x80000001 only when leaf 0x80000000 reports it.
+// NOTE(review): the register names below assume cpuid() fills info[] in
+// EAX, EBX, ECX, EDX order - confirm against the cpuid() helper.
+void check_cpu_features(void) {
+    int info[4];
+
+    cpuid(info, 0);
+    const int max_basic_leaf = info[0];
+
+    cpuid(info, 0x80000000);
+    const unsigned max_ext_leaf = info[0];
+
+    //  Basic feature leaf
+    if (max_basic_leaf >= 0x00000001) {
+        cpuid(info, 0x00000001);
+        const int ecx = info[2];
+        const int edx = info[3];
+
+        HW_MMX = cpuid_bit_set(edx, 23);
+        HW_SSE = cpuid_bit_set(edx, 25);
+        HW_SSE2 = cpuid_bit_set(edx, 26);
+        HW_SSE3 = cpuid_bit_set(ecx, 0);
+
+        HW_SSSE3 = cpuid_bit_set(ecx, 9);
+        HW_SSE41 = cpuid_bit_set(ecx, 19);
+        HW_SSE42 = cpuid_bit_set(ecx, 20);
+        HW_AES = cpuid_bit_set(ecx, 25);
+
+        HW_AVX = cpuid_bit_set(ecx, 28);
+        HW_FMA3 = cpuid_bit_set(ecx, 12);
+
+        HW_RDRAND = cpuid_bit_set(ecx, 30);
+    }
+
+    //  Structured extended feature leaf
+    if (max_basic_leaf >= 0x00000007) {
+        cpuid(info, 0x00000007);
+        const int ebx = info[1];
+        const int ecx = info[2];
+
+        HW_AVX2 = cpuid_bit_set(ebx, 5);
+
+        HW_BMI1 = cpuid_bit_set(ebx, 3);
+        HW_BMI2 = cpuid_bit_set(ebx, 8);
+        HW_ADX = cpuid_bit_set(ebx, 19);
+        HW_SHA = cpuid_bit_set(ebx, 29);
+        HW_PREFETCHWT1 = cpuid_bit_set(ecx, 0);
+
+        HW_AVX512F = cpuid_bit_set(ebx, 16);
+        HW_AVX512CD = cpuid_bit_set(ebx, 28);
+        HW_AVX512PF = cpuid_bit_set(ebx, 26);
+        HW_AVX512ER = cpuid_bit_set(ebx, 27);
+        HW_AVX512VL = cpuid_bit_set(ebx, 31);
+        HW_AVX512BW = cpuid_bit_set(ebx, 30);
+        HW_AVX512DQ = cpuid_bit_set(ebx, 17);
+        HW_AVX512IFMA = cpuid_bit_set(ebx, 21);
+        HW_AVX512VBMI = cpuid_bit_set(ecx, 1);
+    }
+
+    //  Extended (0x8000_0001) feature leaf
+    if (max_ext_leaf >= 0x80000001) {
+        cpuid(info, 0x80000001);
+        const int ecx = info[2];
+        const int edx = info[3];
+
+        HW_x64 = cpuid_bit_set(edx, 29);
+        HW_ABM = cpuid_bit_set(ecx, 5);
+        HW_SSE4a = cpuid_bit_set(ecx, 6);
+        HW_FMA4 = cpuid_bit_set(ecx, 16);
+        HW_XOP = cpuid_bit_set(ecx, 11);
+    }
+}
+
+// Reports whether AVX was detected. The CPU is probed on the first call only;
+// the HW_AVX value is cached in a function-local static and the outcome is
+// printed once, at the time of that first probe.
+int is_avx() {
+    static int cached = -1;    // -1 means "not probed yet"
+
+    if (cached != -1) return cached;
+
+    check_cpu_features();
+    cached = HW_AVX;
+    if (cached != 1) printf(" Not used AVX \n");
+    else printf(" Used AVX \n");
+    return cached;
+}
+
+// Reports whether both FMA3 and AVX2 were detected. The CPU is probed on the
+// first call only; the result is cached in a function-local static and the
+// outcome is printed once, at the time of that first probe.
+int is_fma_avx2() {
+    static int cached = -1;    // -1 means "not probed yet"
+
+    if (cached != -1) return cached;
+
+    check_cpu_features();
+    cached = HW_FMA3 && HW_AVX2;
+    if (cached != 1) printf(" Not used FMA & AVX2 \n");
+    else printf(" Used FMA & AVX2 \n");
+    return cached;
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide
+//
+// Accumulates C += ALPHA * A * B for row-major, non-transposed operands.
+//   A is read as M rows of K floats with row stride lda,
+//   B is read as K rows of N floats with row stride ldb,
+//   C is updated as M rows of N floats with row stride ldc.
+// When is_avx() reports AVX, each row update is done 8 floats at a time with
+// unaligned loads/stores; otherwise a plain scalar loop is used.
+void gemm_nn(int M, int N, int K, float ALPHA,
+    float *A, int lda,
+    float *B, int ldb,
+    float *C, int ldc)
+{
+    int i, j, k;
+    if (is_avx() == 1) {    // AVX
+        for (i = 0; i < M; ++i) {
+            for (k = 0; k < K; ++k) {
+                float A_PART = ALPHA*A[i*lda + k];
+                __m256 a256, b256, c256, result256;    // AVX
+                a256 = _mm256_set1_ps(A_PART);
+
+                // Every complete group of 8 columns goes through the vector path.
+                // `j <= N - 8` never runs for N < 8 (including N <= 0).
+                for (j = 0; j <= N - 8; j += 8) {
+                    b256 = _mm256_loadu_ps(&B[k*ldb + j]);
+                    c256 = _mm256_loadu_ps(&C[i*ldc + j]);
+                    // FMA - Intel Haswell (2013), AMD Piledriver (2012)
+                    //result256 = _mm256_fmadd_ps(a256, b256, c256);
+                    result256 = _mm256_mul_ps(a256, b256);
+                    result256 = _mm256_add_ps(result256, c256);
+                    _mm256_storeu_ps(&C[i*ldc + j], result256);
+                }
+
+                // Scalar tail: continue from where the vector loop stopped.
+                // (The previous form recomputed the start as N - 8 when N was a
+                // multiple of 8, which is -8 for N == 0 and indexed before the
+                // start of the B and C rows.)
+                for (; j < N; ++j)
+                    C[i*ldc + j] += A_PART*B[k*ldb + j];
+            }
+        }
+    }
+    else {
+        for (i = 0; i < M; ++i) {
+            for (k = 0; k < K; ++k) {
+                PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
+                for (j = 0; j < N; ++j) {
+                    C[i*ldc + j] += A_PART*B[k*ldb + j];
+                }
+            }
+        }
+    }
+}
+
+
+
+// Tiled variant of C += ALPHA * A * B (row-major, non-transposed operands,
+// row strides lda / ldb / ldc).
+// Rows are processed in groups of TILE_M (parallelised with OpenMP), the K
+// dimension in groups of TILE_K and columns in groups of TILE_N. The inner
+// kernel keeps a 4-row x 16-column patch of C in eight AVX accumulators.
+// NOTE(review): the kernel hard-codes 4 rows and 2 x 8 columns, so it
+// presumably relies on TILE_M == 4 and TILE_N == 16 - confirm against the
+// TILE_* definitions.
+// Columns, K-steps and rows that do not fill a whole tile are handled by the
+// scalar loops that follow each tiled loop.
+void gemm_nn_fast(int M, int N, int K, float ALPHA,
+    float *A, int lda,
+    float *B, int ldb,
+    float *C, int ldc)
+{
+    int i;
+
+    #pragma omp parallel for
+    for (i = 0; i < (M / TILE_M)*TILE_M; i += TILE_M)
+    {
+        int j, k;
+        int row, step;
+
+        for (k = 0; k < (K / TILE_K)*TILE_K; k += TILE_K)
+        {
+            for (j = 0; j < (N / TILE_N)*TILE_N; j += TILE_N)
+            {
+                // L1 - 6 bits tag [11:6] - cache size 32 KB, conflict for each 4 KB
+                // L2 - 9 bits tag [14:6] - cache size 256 KB, conflict for each 32 KB
+                // L3 - 13 bits tag [18:6] - cache size 8 MB, conflict for each 512 KB
+
+                // acc[r][0] holds columns j..j+7 of row i+r, acc[r][1] columns j+8..j+15.
+                __m256 acc[4][2];
+
+                for (row = 0; row < 4; ++row) {
+                    acc[row][0] = _mm256_loadu_ps(&C[(row + i)*ldc + (0 + j)]);
+                    acc[row][1] = _mm256_loadu_ps(&C[(row + i)*ldc + (8 + j)]);
+                }
+
+                for (step = 0; step < (TILE_K); ++step)
+                {
+                    const __m256 b_lo = _mm256_loadu_ps(&B[(step + k)*ldb + (0 + j)]);
+                    const __m256 b_hi = _mm256_loadu_ps(&B[(step + k)*ldb + (8 + j)]);
+
+                    for (row = 0; row < 4; ++row) {
+                        const __m256 a_bcast = _mm256_set1_ps(ALPHA*A[(row + i)*lda + (step + k)]);
+
+                        // Separate multiply and add (no FMA), as in the scalar paths.
+                        acc[row][0] = _mm256_add_ps(_mm256_mul_ps(a_bcast, b_lo), acc[row][0]);
+                        acc[row][1] = _mm256_add_ps(_mm256_mul_ps(a_bcast, b_hi), acc[row][1]);
+                    }
+                }
+
+                for (row = 0; row < 4; ++row) {
+                    _mm256_storeu_ps(&C[(row + i)*ldc + (0 + j)], acc[row][0]);
+                    _mm256_storeu_ps(&C[(row + i)*ldc + (8 + j)], acc[row][1]);
+                }
+            }
+
+            // Columns beyond the last complete TILE_N group, for this K tile.
+            for (j = (N / TILE_N)*TILE_N; j < N; ++j) {
+                for (row = i; row < (i + TILE_M); ++row)
+                {
+                    for (step = k; step < (k + TILE_K); ++step)
+                    {
+                        PUT_IN_REGISTER float A_PART = ALPHA*A[row*lda + step];
+                        C[row*ldc + j] += A_PART*B[step*ldb + j];
+                    }
+                }
+            }
+        }
+
+        // K-steps beyond the last complete TILE_K group, for this row tile.
+        for (k = (K / TILE_K)*TILE_K; k < K; ++k)
+        {
+            for (row = i; row < (i + TILE_M); ++row)
+            {
+                PUT_IN_REGISTER float A_PART = ALPHA*A[row*lda + k];
+                for (j = 0; j < N; ++j) {
+                    C[row*ldc + j] += A_PART*B[k*ldb + j];
+                }
+            }
+        }
+    }
+
+    // Rows beyond the last complete TILE_M group.
+    for (i = (M / TILE_M)*TILE_M; i < M; ++i) {
+        int j, k;
+        for (k = 0; k < K; ++k) {
+            PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k];
+            for (j = 0; j < N; ++j) {
+                C[i*ldc + j] += A_PART*B[k*ldb + j];
+            }
+        }
+    }
+}
+
+
+
+// Binary (XNOR + popcount) GEMM on operands packed 32 bits per uint32_t word.
+// For every output element C[i*ldc + j] and every packed word index s < K it
+// adds (2 * popcount(~(A[i*lda + s] ^ B[s*ldb + j])) - 32) * mean_arr[i].
+// Rows of C are distributed over OpenMP threads. ALPHA is not used.
+// B and C are accessed with unaligned loads/stores, so no particular
+// alignment of the buffers or of ldb / ldc is required.
+void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
+    uint32_t *A, int lda,
+    uint32_t *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < M; ++i) {   // l.n
+        int j, s;
+        float mean_val = mean_arr[i];
+
+        // Loop-invariant vector constants for this output row.
+        const __m256i all_1 = _mm256_set1_epi8((char)255);
+        const __m256 val2 = _mm256_set1_ps(2);
+        const __m256 val32 = _mm256_set1_ps(32);
+        const __m256 mean256 = _mm256_set1_ps(mean_val);
+
+        for (s = 0; s < K; ++s) // l.size*l.size*l.c/32  or (l.size*l.size*l.c)
+        {
+            PUT_IN_REGISTER uint32_t A_PART = A[i*lda + s];
+            __m256i a256 = _mm256_set1_epi32(A_PART);
+
+            for (j = 0; j < N - 8; j += 8)
+            {
+                // &B[s*ldb + j] and &C[i*ldc + j] are not guaranteed to be
+                // 32-byte aligned, so use the unaligned load/store intrinsics
+                // instead of dereferencing a casted vector pointer.
+                __m256i b256 = _mm256_loadu_si256((const __m256i *)&B[s*ldb + j]);
+                __m256i xor256 = _mm256_xor_si256(a256, b256);
+                __m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b))
+
+                // waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a)
+                __m256 count = _mm256_setr_ps(
+                    POPCNT(_mm256_extract_epi32(xnor256, 0)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 1)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 2)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 3)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 4)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 5)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 6)),
+                    POPCNT(_mm256_extract_epi32(xnor256, 7)));
+
+                count = _mm256_mul_ps(count, val2);     // count * 2
+                count = _mm256_sub_ps(count, val32);    // count - 32
+                count = _mm256_mul_ps(count, mean256);  // count * mean_val
+
+                __m256 c256 = _mm256_loadu_ps(&C[i*ldc + j]);
+                count = _mm256_add_ps(count, c256);     // c = c + count
+                _mm256_storeu_ps(&C[i*ldc + j], count);
+            }
+
+            // Scalar tail for the columns the vector loop did not cover.
+            for (; j < N; ++j) // out_h*out_w;
+            {
+                PUT_IN_REGISTER uint32_t B_PART = B[s*ldb + j];
+                uint32_t xnor_result = ~(A_PART ^ B_PART);
+                int32_t count = POPCNT(xnor_result);  // must be Signed int
+
+                C[i*ldc + j] += (2 * count - 32) * mean_val;
+            }
+        }
+    }
+}
+
+// Reference (scalar) 2-D convolution that accumulates into `output`.
+//   input   : c planes of h x w floats
+//   weights : n filters, each c planes of ksize x ksize floats
+//   output  : n planes of h x w floats; each element is incremented once per
+//             input channel by that channel's partial sum
+// Taps falling outside the input plane (after shifting by `pad`) are skipped.
+// The output is indexed with the input size w x h and `stride` is not used.
+// Filters are distributed over OpenMP threads.
+void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
+    float *weights, float *input, float *output)
+{
+    int fil;
+    #pragma omp parallel for
+    for (fil = 0; fil < n; ++fil) {
+        int chan, y, x, ky, kx;
+
+        for (chan = 0; chan < c; ++chan) {
+            // Start of this filter's kernel for the current channel, and of
+            // the current input plane.
+            const float *kernel = weights + (fil*c*ksize*ksize + chan*ksize*ksize);
+            const float *plane = input + (chan*w*h);
+
+            for (y = 0; y < h; ++y) {
+                for (x = 0; x < w; ++x) {
+                    float acc = 0;
+
+                    for (ky = 0; ky < ksize; ++ky) {
+                        const int src_y = y + ky - pad;
+                        if (src_y < 0 || src_y >= h) continue;    // whole kernel row is outside
+
+                        for (kx = 0; kx < ksize; ++kx) {
+                            const int src_x = x + kx - pad;
+                            if (src_x < 0 || src_x >= w) continue;
+
+                            acc += plane[src_y*w + src_x] * kernel[ky*ksize + kx];
+                        }
+                    }
+
+                    output[fil*w*h + y*w + x] += acc;
+                }
+            }
+        }
+    }
+}
+
+// Experimental AVX convolution on sign bits.
+// What the visible code does:
+//   1. Overwrites `weights` in place, keeping only the sign bit of each float.
+//   2. For each filter, each row y and each group of 8 columns x, XORs the
+//      broadcast (sign-only) weight with 8 consecutive input floats for every
+//      channel and kernel tap, sums the XOR results as floats, multiplies the
+//      sum by fabs(mean[fil]) and stores it to `output` (overwriting, not
+//      accumulating).
+// `stride` is not used. Filters are distributed over OpenMP threads.
+// NOTE(review): all vector accesses below dereference casted __m256 pointers,
+// which presumably requires 32-byte aligned addresses - confirm the callers'
+// buffers and indices satisfy that.
+void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
+    float *weights, float *input, float *output, float *mean)
+{
+    //const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    //const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    int i;
+
+#if defined(_OPENMP)
+    // Queried once and cached; the value is not used further in this function.
+    static int max_num_threads = 0;
+    if (max_num_threads == 0) {
+        max_num_threads = omp_get_max_threads();
+        //omp_set_num_threads( max_num_threads / 2);
+    }
+#endif
+
+    //convolution_2d_old(w, h, ksize, n, c, pad, stride, weights, input, output);
+
+    // Mask with only the IEEE-754 sign bit set in each 32-bit lane.
+    __m256i all256_sing1 = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
+    // Reduce every weight to its sign bit, in place, 8 floats per step.
+    // NOTE(review): the loop steps by 8 without checking that ksize*ksize*n*c
+    // is a multiple of 8, so the last step may touch floats past the end.
+    for (i = 0; i < ksize*ksize*n*c; i+=8) {
+        *((__m256*)&weights[i]) = _mm256_and_ps(*((__m256*)&weights[i]), _mm256_castsi256_ps(all256_sing1));
+    }
+
+    //for (i = 0; i < w*h*c; i += 8) {
+        //(*(__m256*)&input[i]) = _mm256_and_ps(*((__m256*)&input[i]), _mm256_castsi256_ps(all256_sing1));
+    //}
+
+
+    // The four constants below are only referenced from commented-out code.
+    //__m256i all256_last_zero = _mm256_set1_epi32(0xFFFFFFFF);
+    //all256_last_zero.m256i_i32[7] = 0;
+    __m256i all256_last_zero =
+        _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0);
+
+    __m256i idx256 = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+    //__m256 all256_sing1 = _mm256_set1_ps(0x80000000);
+    __m256 all256_one = _mm256_set1_ps(1);
+    __m256i all256i_one = _mm256_set1_epi32(1);
+
+    ///__m256i src256 = _mm256_loadu_si256((__m256i *)(&src[i]));
+    ///__m256i result256 = _mm256_and_si256(src256, all256_sing1); // check sign in 8 x 32-bit floats
+
+    int fil;
+    // filter index
+    #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    for (fil = 0; fil < n; ++fil) {
+        int chan, y, x, f_y, f_x;
+        // Per-filter scale applied to the accumulated sum before the store.
+        float cur_mean = fabs(mean[fil]);
+        __m256 mean256 = _mm256_set1_ps(cur_mean);
+        // channel index
+        //for (chan = 0; chan < c; ++chan)
+            // input - y
+            for (y = 0; y < h; ++y)
+                // input - x, 8 output columns per iteration.
+                // The bound `x < w-8` means columns from the last group
+                // (at least the final 8 when w is a multiple of 8) are never
+                // written by this function.
+                for (x = 0; x < w-8; x+=8)
+                {
+                    int const output_index = fil*w*h + y*w + x;
+                    float sum = 0;    // unused; the vector accumulator below is used instead
+                    __m256 sum256 = _mm256_set1_ps(0);
+
+                    for (chan = 0; chan < c; ++chan) {
+                        int const weights_pre_index = fil*c*ksize*ksize + chan*ksize*ksize;
+                        int const input_pre_index = chan*w*h;
+
+
+                        // filter - y
+                        for (f_y = 0; f_y < ksize; ++f_y)
+                        {
+                            int input_y = y + f_y - pad;
+                            //__m256 in = *((__m256*)&input[input_pre_index + input_y*w]);
+                            // Rows outside the input plane are skipped.
+                            if (input_y < 0 || input_y >= h) continue;
+                            //__m256 in = _mm256_loadu_ps(&input[input_pre_index + input_y*w + x - pad]);
+
+                            // filter - x
+                            for (f_x = 0; f_x < ksize; ++f_x)
+                            {
+                                // NOTE(review): there is no horizontal bounds
+                                // check (it is commented out below), so
+                                // input_x can be negative, e.g. x == 0,
+                                // f_x == 0, pad > 0.
+                                int input_x = x + f_x - pad;
+                                //if (input_y < 0 || input_x < 0 || input_y >= h || input_x >= w) continue;
+
+                                int input_index = input_pre_index + input_y*w + input_x;
+                                int weights_index = weights_pre_index + f_y*ksize + f_x;
+                                //if (input_y < 0 || input_y >= h) continue;
+
+                                //sum += input[input_index] * weights[weights_index];
+
+                                // 8 consecutive input floats starting at input_index.
+                                __m256 in = *((__m256*)&input[input_index]);
+                                // Broadcast weight; this local `w` shadows the
+                                // width parameter `w` until the end of this block.
+                                __m256 w = _mm256_set1_ps(weights[weights_index]);
+                                //__m256 w_sign = _mm256_and_ps(w, _mm256_castsi256_ps(all256_sing1)); // check sign in 8 x 32-bit floats
+                                // Bitwise XOR of the weight bits with the input bits.
+                                __m256 xor256 = _mm256_xor_ps(w, in);
+                                //printf("\n xor256_1 = %f, xor256_2 = %f \n", xor256.m256_f32[0], xor256.m256_f32[1]);
+                                //printf("\n in = %f, w = %f, xor256 = %f \n", in.m256_f32[0], w_sign.m256_f32[0], xor256.m256_f32[0]);
+
+                                //__m256 pn1 = _mm256_and_ps(_mm256_castsi256_ps(all256i_one), xor256);
+
+
+                                //sum256 = xor256;
+                                sum256 = _mm256_add_ps(xor256, sum256);
+                                //printf("\n --- \n");
+                                //printf("\n 0 = %f, 1 = %f, 2 = %f, 3 = %f, 4 = %f, 5 = %f, 6 = %f, 7 = %f \n", in.m256_f32[0], in.m256_f32[1], in.m256_f32[2], in.m256_f32[3], in.m256_f32[4], in.m256_f32[5], in.m256_f32[6], in.m256_f32[7]);
+
+                                // Empty branch: both statements inside are commented out.
+                                if (f_x < ksize-1) {
+                                    //in = _mm256_permutevar8x32_ps(in, idx256);
+                                    //in = _mm256_and_ps(in, _mm256_castsi256_ps(all256_last_zero));
+                                }
+                            }
+                        }
+                    }
+                    // l.output[filters][width][height] +=
+                    //        state.input[channels][width][height] *
+                    //        l.weights[filters][channels][filter_width][filter_height];
+                    //output[output_index] += sum;
+
+                    sum256 = _mm256_mul_ps(sum256, mean256);
+                    //printf("\n cur_mean = %f, sum256 = %f, sum256 = %f, in = %f \n",
+                    //    cur_mean, sum256.m256_f32[0], sum256.m256_f32[1], input[input_pre_index]);
+
+                    //__m256 out = *((__m256*)&output[output_index]);
+                    //out = _mm256_add_ps(out, sum256);
+                    //(*(__m256*)&output[output_index]) = out;
+                    // Overwrites (does not accumulate into) 8 output floats.
+                    *((__m256*)&output[output_index]) = sum256;
+
+                    //_mm256_storeu_ps(&C[i*ldc + j], result256);
+                }
+    }
+}
+
+
+
+// http://graphics.stanford.edu/~seander/bithacks.html
+// https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
+// https://arxiv.org/pdf/1611.07612.pdf
+
+// Number of set bits in a 128-bit integer vector: POPCNT64 of the low 64-bit
+// half plus POPCNT64 of the high 64-bit half.
+static inline int popcnt128(__m128i n) {
+    const __m128i upper_in_low = _mm_unpackhi_epi64(n, n);    // high half copied to the low lane
+    const long long lo64 = _mm_cvtsi128_si64(n);
+    const long long hi64 = _mm_cvtsi128_si64(upper_in_low);
+    return POPCNT64(lo64) + POPCNT64(hi64);
+}
+
+// Number of set bits in a 256-bit integer vector, as the sum of popcnt128()
+// over its two 128-bit halves.
+static inline int popcnt256(__m256i n) {
+    const __m128i lower_half = _mm256_extractf128_si256(n, 0);
+    const __m128i upper_half = _mm256_extractf128_si256(n, 1);
+    return popcnt128(lower_half) + popcnt128(upper_half);
+}
+
+// Per-64-bit-lane population count of `v`.
+// Each byte is split into two nibbles, each nibble is mapped to its bit count
+// through a 16-entry byte shuffle table, and the per-byte counts are summed
+// within each 64-bit lane with a sum-of-absolute-differences against zero.
+// Returns a vector whose four 64-bit lanes hold the counts.
+static inline __m256i count256(__m256i v) {
+    // Bit count of every 4-bit value 0..15, repeated for both 128-bit halves.
+    const __m256i nibble_bits = _mm256_setr_epi8(
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    const __m256i nibble_mask = _mm256_set1_epi8(0x0f);
+
+    const __m256i low_nibbles = _mm256_and_si256(v, nibble_mask);
+    const __m256i high_nibbles = _mm256_and_si256(_mm256_srli_epi32(v, 4), nibble_mask);
+
+    const __m256i bits_per_byte = _mm256_add_epi8(
+        _mm256_shuffle_epi8(nibble_bits, low_nibbles),
+        _mm256_shuffle_epi8(nibble_bits, high_nibbles));
+
+    return _mm256_sad_epu8(bits_per_byte, _mm256_setzero_si256());
+}
+
+// Number of set bits in a 256-bit integer vector: runs count256() and adds up
+// its four 64-bit lane counts.
+static inline int popcnt256_custom(__m256i n) {
+    const __m256i lane_counts = count256(n);
+
+    long long total = _mm256_extract_epi64(lane_counts, 0);
+    total += _mm256_extract_epi64(lane_counts, 1);
+    total += _mm256_extract_epi64(lane_counts, 2);
+    total += _mm256_extract_epi64(lane_counts, 3);
+    return (int)total;
+}
+
+// Adds the per-64-bit-lane popcount of XNOR(a_bit256, b_bit256) to *count_sum.
+// The lanes are left unreduced; get_count_mula() folds them into one integer.
+static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
+    const __m256i all_ones = _mm256_set1_epi8((char)255);
+    const __m256i differing = _mm256_xor_si256(a_bit256, b_bit256);
+    // andnot(x, ones) == ~x & ones == ~x, i.e. xnor = not(xor(a,b)).
+    // can be optimized - the NOT could be applied to the weights once up front
+    const __m256i matching = _mm256_andnot_si256(differing, all_ones);
+
+    *count_sum = _mm256_add_epi64(count256(matching), *count_sum);    //  1st part - popcnt Mula's algorithm
+}
+
+// 2nd part - popcnt Mula's algorithm:
+// folds the four 64-bit lane counters of `count_sum` into a single int.
+static inline int get_count_mula(__m256i count_sum) {
+    long long total = _mm256_extract_epi64(count_sum, 0);
+    total += _mm256_extract_epi64(count_sum, 1);
+    total += _mm256_extract_epi64(count_sum, 2);
+    total += _mm256_extract_epi64(count_sum, 3);
+    return (int)total;
+}
+
+// Counts matching bits (XNOR popcount) between bit-row `a_row` of A and
+// bit-row `b_row` of B over K bits, reading 256 bits per step, and subtracts
+// the bits of the final step that lie beyond K (alignment padding).
+static inline int bin_mean_xnor_row_count(unsigned char *A, int lda, int a_row,
+    unsigned char *B, int ldb, int b_row, int K)
+{
+    const int bit_step = 256;
+    __m256i lane_sums = _mm256_set1_epi8(0);
+    int k;
+
+    for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+        const __m256i a_bits = _mm256_loadu_si256((__m256i *)(A + (a_row*lda + k) / 8));
+        const __m256i b_bits = _mm256_loadu_si256((__m256i *)(B + (b_row*ldb + k) / 8));
+        xnor_avx2_popcnt(a_bits, b_bits, &lane_sums);
+    }
+
+    // remove extra bits (from empty space for align only)
+    return get_count_mula(lane_sums) - ((K % bit_step == 0) ? 0 : (bit_step - (K % bit_step)));
+}
+
+// 5x times faster than gemm()-float32
+// further optimizations: do mean-mult only for the last layer
+//
+// Binary GEMM on bit-packed operands where B is stored transposed:
+//   C[i*ldc + j] = (2 * matches(i, j) - K) * mean_arr[i]
+// with matches(i, j) the number of equal bits between the K-bit row i of A
+// (bit offset i*lda) and the K-bit row j of B (bit offset j*ldb).
+// C is overwritten, not accumulated. ALPHA_UNUSED is ignored.
+// The main loop handles a 2 x 2 block of outputs at a time so that each loaded
+// 256-bit chunk of A and B is used twice; odd columns and an odd last row are
+// handled one output at a time. Row pairs are distributed over OpenMP threads.
+void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    const int bit_step = 256;
+    // Bits read in the last 256-bit step that lie beyond K.
+    const int pad_bits = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+    int i;
+
+#if defined(_OPENMP)
+    static int max_num_threads = 0;
+    if (max_num_threads == 0) {
+        max_num_threads = omp_get_max_threads();
+        //omp_set_num_threads(max_num_threads / 2);
+    }
+#endif
+
+    #pragma omp parallel for
+    for (i = 0; i < (M/2)*2; i += 2)
+    {   // l.n - filters [16 - 55 - 1024]
+        const float mean_top = mean_arr[i + 0];
+        const float mean_bottom = mean_arr[i + 1];
+        int j, k, row_off;
+
+        for (j = 0; j < (N/2)*2; j += 2)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            // One lane-wise counter per output of the 2 x 2 block.
+            __m256i sum_top_left = _mm256_set1_epi8(0);
+            __m256i sum_top_right = _mm256_set1_epi8(0);
+            __m256i sum_bottom_left = _mm256_set1_epi8(0);
+            __m256i sum_bottom_right = _mm256_set1_epi8(0);
+
+            for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                const __m256i a_top = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
+                const __m256i b_left = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+
+                const __m256i a_bottom = _mm256_loadu_si256((__m256i *)(A + ((i + 1)*lda + k) / 8));
+                const __m256i b_right = _mm256_loadu_si256((__m256i *)(B + ((j + 1)*ldb + k) / 8));
+
+                xnor_avx2_popcnt(a_top, b_left, &sum_top_left);
+                xnor_avx2_popcnt(a_top, b_right, &sum_top_right);
+
+                xnor_avx2_popcnt(a_bottom, b_left, &sum_bottom_left);
+                xnor_avx2_popcnt(a_bottom, b_right, &sum_bottom_right);
+            }
+
+            // remove extra bits (from empty space for align only)
+            const int cnt_top_left = get_count_mula(sum_top_left) - pad_bits;
+            const int cnt_top_right = get_count_mula(sum_top_right) - pad_bits;
+            const int cnt_bottom_left = get_count_mula(sum_bottom_left) - pad_bits;
+            const int cnt_bottom_right = get_count_mula(sum_bottom_right) - pad_bits;
+
+            C[i*ldc + (j + 0)] = (2 * cnt_top_left - K) * mean_top;
+            C[i*ldc + (j + 1)] = (2 * cnt_top_right - K) * mean_top;
+            C[(i + 1)*ldc + (j + 0)] = (2 * cnt_bottom_left - K) * mean_bottom;
+            C[(i + 1)*ldc + (j + 1)] = (2 * cnt_bottom_right - K) * mean_bottom;
+        }
+
+        // Leftover column (odd N) for both rows of this pair.
+        for (row_off = 0; row_off < 2; ++row_off)
+        {
+            const float mean_val = mean_arr[i + row_off];
+            for (j = (N / 2) * 2; j < N; j += 1)
+            { // out_h*out_w - one channel output size [169 - 173056]
+                const int count = bin_mean_xnor_row_count(A, lda, i + row_off + 0, B, ldb, j + 0, K);
+                C[(i + row_off)*ldc + j] = (2 * count - K) * mean_val;
+            }
+        }
+    }
+
+    // Leftover row (odd M), one output at a time.
+    for (i = (M / 2) * 2; i < M; i += 1)
+    {
+        const float mean_val = mean_arr[i];
+        int j;
+        for (j = 0; j < N; j += 1)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            const int count = bin_mean_xnor_row_count(A, lda, i + 0, B, ldb, j + 0, K);
+            C[i*ldc + j] = (2 * count - K) * mean_val;
+        }
+    }
+}
+
+
+
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+//
+// im2col that writes the column matrix transposed: the value for kernel
+// element c at output pixel (h, w) is stored at
+//   data_col[(h * width_col + w) * ldb_align + c]
+// so each output pixel owns a run of ldb_align floats.
+//   data_im  : input image, channels planes of height x width floats
+//   data_col : destination buffer
+// When the output has the input's size with stride == 1 and pad == 1, the
+// interior pixels are copied 8 at a time with AVX and the four border
+// rows/columns are filled through im2col_get_pixel(); any other configuration
+// uses im2col_get_pixel() for every element.
+// Kernel elements are distributed over OpenMP threads.
+void im2col_cpu_custom_transpose(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int ldb_align)
+{
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+    int c;
+
+    // optimized version
+    if (height_col == height && width_col == width && stride == 1 && pad == 1)
+    {
+        #pragma omp parallel for
+        for (c = 0; c < channels_col; ++c) {
+            int h, w;
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+
+            // Interior pixels: no padding test is needed here.
+            for (h = pad; h < height_col - pad; ++h) {
+                // Each step reads 8 source floats and writes 8 output pixels,
+                // so it may only start where w + 7 is still an interior
+                // column. The bound matches im2col_cpu_custom(); a looser
+                // bound reads past the end of the image row and writes into
+                // the next output row.
+                for (w = pad; w < width_col - pad - 8; w+=8) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+
+                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
+                    // Scatter: consecutive pixels are ldb_align floats apart.
+                    data_col[col_index + ldb_align * 0] = _mm256_extract_float32(src256, 0);// src256.m256_f32[0];
+                    data_col[col_index + ldb_align * 1] = _mm256_extract_float32(src256, 1);// src256.m256_f32[1];
+                    data_col[col_index + ldb_align * 2] = _mm256_extract_float32(src256, 2);// src256.m256_f32[2];
+                    data_col[col_index + ldb_align * 3] = _mm256_extract_float32(src256, 3);// src256.m256_f32[3];
+                    data_col[col_index + ldb_align * 4] = _mm256_extract_float32(src256, 4);// src256.m256_f32[4];
+                    data_col[col_index + ldb_align * 5] = _mm256_extract_float32(src256, 5);// src256.m256_f32[5];
+                    data_col[col_index + ldb_align * 6] = _mm256_extract_float32(src256, 6);// src256.m256_f32[6];
+                    data_col[col_index + ldb_align * 7] = _mm256_extract_float32(src256, 7);// src256.m256_f32[7];
+                }
+
+                // Remaining interior pixels of this row, one at a time.
+                for (; w < width_col - pad; ++w) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                }
+            }
+
+            // Left border column.
+            {
+                w = 0;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+
+            // Right border column.
+            {
+                w = width_col - 1;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+
+            // Top border row.
+            {
+                h = 0;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+
+            // Bottom border row.
+            {
+                h = height_col - 1;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+        }
+
+    }
+    else {
+        // Generic path: every element goes through im2col_get_pixel().
+        #pragma omp parallel for
+        for (c = 0; c < channels_col; ++c) {
+            int h, w;
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+            for (h = 0; h < height_col; ++h) {
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h * stride;
+                    int im_col = w_offset + w * stride;
+
+                    int col_index = (h * width_col + w)*ldb_align + c;   // transposed & aligned
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+        }
+    }
+}
+
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+// Rearranges image patches into columns (im2col), with an AVX fast path.
+//
+//   data_im  - input, read as data_im[col + width*(row + height*channel)]
+//   channels, height, width - input dimensions
+//   ksize, stride, pad      - kernel size, stride and padding
+//   data_col - output, written as data_col[(c*height_col + h)*width_col + w]
+//              for c in [0, channels*ksize*ksize)
+//
+// The fast path runs only when the output has the same spatial size as the
+// input, stride == 1, pad == 1 and is_fma_avx2() returns non-zero. Any other
+// configuration is forwarded unchanged to im2col_cpu().
+void im2col_cpu_custom(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col)
+{
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+
+    // optimized version
+    if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
+    {
+        // Each output row c corresponds to one (input channel, kernel row, kernel column) triple.
+        #pragma omp parallel for
+        for (c = 0; c < channels_col; ++c) {
+            int h, w;
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+            // Interior rows/columns: data_im is indexed directly, without going
+            // through im2col_get_pixel().
+            for (h = pad; h < height_col-pad; ++h) {
+                // 8 floats per iteration; the "- 8" bound leaves the last columns to the scalar loop below.
+                for (w = pad; w < width_col-pad-8; w += 8) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    int col_index = (c * height_col + h) * width_col + w;
+
+                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
+                    _mm256_storeu_ps(&data_col[col_index], src256);
+                }
+
+                // Scalar remainder of the interior row (w continues from the vector loop).
+                for (; w < width_col - pad; ++w) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    int col_index = (c * height_col + h) * width_col + w;
+
+                    data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                }
+            }
+
+            // The four blocks below fill the border of the output (first/last column,
+            // first/last row) through im2col_get_pixel(). Here im_row/im_col are NOT
+            // reduced by pad; pad is passed to the helper instead.
+            // NOTE(review): presumably the helper applies the padding offset and
+            // returns 0 outside the image - confirm against its definition.
+            {
+                w = 0;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (c * height_col + h) * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                w = width_col-1;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (c * height_col + h) * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                h = 0;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (c * height_col + h) * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                            im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                h = height_col-1;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    int col_index = (c * height_col + h) * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
+                        im_row, im_col, c_im, pad);
+                }
+            }
+        }
+
+    }
+    else {
+        // Generic fallback for every configuration the fast path does not cover.
+        //printf("\n Error: is no non-optimized version \n");
+        im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
+    }
+}
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+// im2col variant whose output rows are spaced bit_align elements apart.
+//
+//   data_im   - input, read as data_im[col + width*(row + height*channel)]
+//   data_col  - output, written as data_col[c*bit_align + h*width_col + w]
+//   bit_align - distance (in floats) between consecutive output rows c
+//
+// Only the case "output size == input size, stride == 1, pad == 1 and
+// is_fma_avx2() non-zero" is implemented. For any other configuration the
+// function prints an error message and leaves data_col untouched.
+void im2col_cpu_custom_align(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int bit_align)
+{
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+
+    // optimized version
+    if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
+    {
+        // Row stride of the output; replaces height_col*width_col used by the unaligned variant.
+        int new_ldb = bit_align;
+
+        #pragma omp parallel for
+        for (c = 0; c < channels_col; ++c) {
+            int h, w;
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+            // Interior region: direct reads from data_im, 8 floats at a time.
+            for (h = pad; h < height_col - pad; ++h) {
+                for (w = pad; w < width_col - pad - 8; w += 8) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
+                    _mm256_storeu_ps(&data_col[col_index], src256);
+                }
+
+                // Scalar remainder of the interior row (w continues from the vector loop).
+                for (; w < width_col - pad; ++w) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+                    data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                }
+            }
+
+            // Border passes (first/last column, first/last row) go through
+            // im2col_get_pixel(); the un-shifted coordinates and pad are handed to it.
+            // NOTE(review): presumably the helper handles out-of-image reads - confirm.
+            {
+                w = 0;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                w = width_col - 1;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                h = 0;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                }
+            }
+
+            {
+                h = height_col - 1;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                }
+            }
+        }
+
+    }
+    else {
+        // No fallback exists: the call only reports the problem, no output is produced.
+        printf("\n Error: is no non-optimized version \n");
+        //im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col); // must be aligned for transpose after float_to_bin
+        // float_to_bit(b, t_input, src_size);
+        // transpose_bin(t_input, *t_bit_input, k, n, bit_align, new_ldb, 8);
+    }
+}
+
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+// im2col variant that emits one BIT per element instead of one float:
+// bit number (c*bit_align + h*width_col + w) of the buffer behind data_col is
+// set when the corresponding input value is > 0.
+//
+//   data_im   - input, read as data_im[col + width*(row + height*channel)]
+//   data_col  - output buffer, accessed as bytes; bits are only ever OR-ed in,
+//               never cleared
+//               NOTE(review): presumably the caller zeroes it first - confirm.
+//   bit_align - distance (in bits) between consecutive output rows c
+//
+// Only the case "output size == input size, stride == 1, pad == 1 and
+// is_fma_avx2() non-zero" is implemented. Otherwise an error is printed and
+// data_col is left untouched.
+void im2col_cpu_custom_bin(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int bit_align)
+{
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+
+    // optimized version
+    if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
+    {
+        // all256_sing1 is referenced only by the commented-out sign-bit variant below.
+        __m256i all256_sing1 = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
+        __m256 float_zero256 = _mm256_set1_ps(0.00);
+
+        int new_ldb = bit_align;
+
+        #pragma omp parallel for
+        for (c = 0; c < channels_col; ++c) {
+            int h, w;
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+            // Interior region: 8 inputs are compared against zero at once.
+            for (h = pad; h < height_col - pad; ++h) {
+                for (w = pad; w < width_col - pad - 8; w += 8) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //__m256i src256 = _mm256_loadu_si256((__m256i *)(&data_im[im_col + width*(im_row + height*c_im)]));
+                    //__m256i result256 = _mm256_and_si256(src256, all256_sing1); // check sign in 8 x 32-bit floats
+                    //uint16_t mask = _mm256_movemask_ps(_mm256_castsi256_ps(result256)); // (val >= 0) ? 0 : 1
+                    //mask = ~mask;   // inverse mask,  (val >= 0) ? 1 : 0
+
+                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
+                    __m256 result256 = _mm256_cmp_ps(src256, float_zero256, _CMP_GT_OS);
+                    uint16_t mask = _mm256_movemask_ps(result256); // bit k = (lane k > 0) ? 1 : 0
+
+                    // The 8 mask bits start at bit (col_index % 8) of byte (col_index / 8),
+                    // so they can straddle two bytes; hence the 16-bit read-modify-write.
+                    // NOTE(review): this is an unaligned uint16_t access and may touch a
+                    // byte owned by another OpenMP iteration - confirm this is safe.
+                    uint16_t* dst_ptr = (uint16_t*)&((uint8_t*)data_col)[col_index / 8];
+                    *dst_ptr |= (mask << (col_index % 8));
+                }
+
+                // Scalar remainder of the interior row (w continues from the vector loop).
+                for (; w < width_col - pad; ++w) {
+                    int im_row = h_offset + h - pad;
+                    int im_col = w_offset + w - pad;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
+                    float val = data_im[im_col + width*(im_row + height*c_im)];
+                    if (val > 0) set_bit((unsigned char* const)data_col, col_index);
+                }
+            }
+
+            // Border passes (first/last column, first/last row): values come from
+            // im2col_get_pixel(), which receives the un-shifted coordinates and pad.
+            {
+                w = 0;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    if (val > 0) set_bit((unsigned char* const)data_col, col_index);
+                }
+            }
+
+            {
+                w = width_col - 1;
+                for (h = 0; h < height_col; ++h) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    if (val > 0) set_bit((unsigned char* const)data_col, col_index);
+                }
+            }
+
+            {
+                h = 0;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    if (val > 0) set_bit((unsigned char* const)data_col, col_index);
+                }
+            }
+
+            {
+                h = height_col - 1;
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h;
+                    int im_col = w_offset + w;
+                    //int col_index = (c * height_col + h) * width_col + w;
+                    int col_index = c * new_ldb + h * width_col + w;
+
+                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
+                    if (val > 0) set_bit((unsigned char* const)data_col, col_index);
+                }
+            }
+        }
+
+    }
+    else {
+        // No fallback exists: the call only reports the problem, no output is produced.
+        printf("\n Error: is no non-optimized version \n");
+        //im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col); // must be aligned for transpose after float_to_bin
+        // float_to_bit(b, t_input, src_size);
+        // transpose_bin(t_input, *t_bit_input, k, n, bit_align, new_ldb, 8);
+    }
+}
+
+
+// Applies activation `a` in place to x[0 .. n-1].
+//   LINEAR - data is left unchanged
+//   LEAKY  - x = (x > 0) ? x : 0.1*x, vectorised 8 floats at a time when
+//            is_fma_avx2() reports support, scalar for the remainder
+//   other  - every element goes through activate()
+void activate_array_cpu_custom(float *x, const int n, const ACTIVATION a)
+{
+    int idx = 0;
+
+    // Identity: nothing to do.
+    if (a == LINEAR) return;
+
+    // Anything that is not LEAKY is delegated element by element.
+    if (a != LEAKY) {
+        while (idx < n) {
+            x[idx] = activate(x[idx], a);
+            ++idx;
+        }
+        return;
+    }
+
+    if (is_fma_avx2()) {
+        __m256i sign_bits = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
+        __m256 slope = _mm256_set1_ps(0.1F);
+
+        // The "- 8" bound always leaves at least the final elements to the scalar loop.
+        while (idx < n - 8) {
+            __m256 values = _mm256_loadu_ps(&x[idx]);
+            __m256 scaled = _mm256_mul_ps((values), slope);
+
+            // Isolate the sign bit of each lane; blendv selects `scaled` where it is set.
+            __m256i negative = _mm256_and_si256(_mm256_castps_si256(values), sign_bits);
+            __m256 blended = _mm256_blendv_ps(values, scaled, _mm256_castsi256_ps(negative));
+
+            _mm256_storeu_ps(&x[idx], blended);
+            idx += 8;
+        }
+    }
+
+    // Scalar path: the whole array without AVX2, otherwise just the tail.
+    while (idx < n) {
+        if (!(x[idx] > 0)) {
+            x[idx] = .1*x[idx];
+        }
+        ++idx;
+    }
+}
+
+// Packs the sign test of `size` floats into a bit array:
+// bit (i % 8) of dst[i / 8] is 1 when src[i] > 0 and 0 otherwise.
+//
+//   src  - input values, exactly `size` floats are read
+//   dst  - output, must hold at least size / 8 + 1 bytes; all of them are written
+//   size - number of input floats
+//
+// Full groups of 8 inputs are handled with AVX; a trailing partial group is
+// handled with scalar code so that nothing is read beyond src[size - 1].
+void float_to_bit(float *src, unsigned char *dst, size_t size)
+{
+    size_t dst_size = size / 8 + 1;
+    memset(dst, 0, dst_size);
+
+    size_t i;
+    __m256 float_zero256 = _mm256_set1_ps(0.0);
+
+    // Vector part: one compare + movemask produces one complete output byte.
+    for (i = 0; i + 8 <= size; i += 8)
+    {
+        __m256 src256 = _mm256_loadu_ps((float *)(&src[i]));
+        __m256 result256 = _mm256_cmp_ps(src256, float_zero256, _CMP_GT_OS);
+        uint32_t mask = _mm256_movemask_ps(result256); // bit k = (src[i + k] > 0) ? 1 : 0
+
+        dst[i / 8] = mask;
+    }
+
+    // Scalar tail (size not a multiple of 8): the byte was zeroed by memset
+    // above, so only the bits of positive values need to be set.
+    for (; i < size; ++i)
+    {
+        if (src[i] > 0) dst[i / 8] |= (unsigned char)(1u << (i % 8));
+    }
+}
+
+// Transposes one 4x4 float tile.
+// Four rows of 4 floats are read from A (consecutive rows lda floats apart),
+// transposed in registers, and written to B (rows ldb floats apart).
+// All loads complete before the first store; unaligned accesses are used.
+static inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb)
+{
+    __m128 r0 = _mm_loadu_ps(A + 0 * lda);
+    __m128 r1 = _mm_loadu_ps(A + 1 * lda);
+    __m128 r2 = _mm_loadu_ps(A + 2 * lda);
+    __m128 r3 = _mm_loadu_ps(A + 3 * lda);
+
+    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+
+    _mm_storeu_ps(B + 0 * ldb, r0);
+    _mm_storeu_ps(B + 1 * ldb, r1);
+    _mm_storeu_ps(B + 2 * ldb, r2);
+    _mm_storeu_ps(B + 3 * ldb, r3);
+}
+
+// Blocked transpose: B[j*ldb + i] = A[i*lda + j] for i in [0, n), j in [0, m).
+//
+// The matrix is walked in block_size x block_size blocks. Blocks that lie
+// strictly inside both dimensions are transposed in 4x4 tiles via
+// transpose4x4_SSE(); blocks that reach the last rows or columns are copied
+// element by element. Bands of rows are distributed over OpenMP threads.
+void transpose_block_SSE4x4(float *A, float *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; i += block_size) {
+        int col0, row, col;
+        const int row_end = i + block_size;
+
+        // Last band of rows (including one that ends exactly at n): plain copy.
+        if (!(row_end < n)) {
+            for (row = i; row < n; ++row) {
+                for (col = 0; col < m; ++col) {
+                    B[col*ldb + row] = A[row*lda + col];
+                }
+            }
+            continue;
+        }
+
+        for (col0 = 0; col0 < m; col0 += block_size) {
+            const int col_end = col0 + block_size;
+
+            if (col_end < m) {
+                // Full interior block: 4x4 register tiles.
+                for (row = i; row < row_end; row += 4) {
+                    for (col = col0; col < col_end; col += 4) {
+                        transpose4x4_SSE(&A[row*lda + col], &B[col*ldb + row], lda, ldb);
+                    }
+                }
+            }
+            else {
+                // Right-most block of this band: plain copy up to column m.
+                for (row = i; row < row_end; ++row) {
+                    for (col = col0; col < m; ++col) {
+                        B[col*ldb + row] = A[row*lda + col];
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// Max-pooling forward pass with AVX fast paths.
+//
+//   src      - input, read as src[x + w*(y + h*(k + b*c))]
+//   dst      - output, written as dst[j + out_w*(i + out_h*(k + c*b))]
+//   indexes  - optional (may be NULL); receives the src index of each maximum,
+//              but ONLY for outputs produced by the scalar loop at the end of
+//              each row - the SIMD paths do not write it
+//   size     - pooling window edge length
+//   w, h     - input width/height;  out_w, out_h - output width/height
+//   c, batch - channels and batch size
+//   pad, stride - padding (window origin is shifted by -pad/2) and stride
+void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c,
+    int pad, int stride, int batch)
+{
+
+    const int w_offset = -pad / 2;
+    const int h_offset = -pad / 2;
+    int b, k;
+
+    for (b = 0; b < batch; ++b) {
+        // Channels are independent, so they are distributed over OpenMP threads.
+        #pragma omp parallel for
+        for (k = 0; k < c; ++k) {
+            int i, j, m, n;
+            for (i = 0; i < out_h; ++i) {
+                //for (j = 0; j < out_w; ++j) {
+                j = 0;
+
+                // Fast path 1: stride 1 - 8 horizontally adjacent outputs per iteration.
+                if(stride == 1 && is_avx() == 1) {
+                    for (j = 0; j < out_w - 8 - (size - 1); j += 8) {
+                        int out_index = j + out_w*(i + out_h*(k + c*b));
+                        __m256 max256 = _mm256_set1_ps(-FLT_MAX);
+                        for (n = 0; n < size; ++n) {
+                            for (m = 0; m < size; ++m) {
+                                int cur_h = h_offset + i*stride + n;
+                                int cur_w = w_offset + j*stride + m;
+                                int index = cur_w + w*(cur_h + h*(k + b*c));
+                                // Validity is tested for the first of the 8 lanes only.
+                                // NOTE(review): when cur_w < 0 the whole 8-wide load is
+                                // skipped, including lanes whose column would be valid -
+                                // confirm this is acceptable for pad values giving w_offset < 0.
+                                int valid = (cur_h >= 0 && cur_h < h &&
+                                    cur_w >= 0 && cur_w < w);
+                                if (!valid) continue;
+
+                                __m256 src256 = _mm256_loadu_ps(&src[index]);
+                                max256 = _mm256_max_ps(src256, max256);
+                            }
+                        }
+                        _mm256_storeu_ps(&dst[out_index], max256);
+
+                    }
+                }
+                // Fast path 2: 2x2 window with stride 2 - 4 outputs per iteration.
+                else if (size == 2 && stride == 2 && is_avx() == 1) {
+                    for (j = 0; j < out_w - 4; j += 4) {
+                        int out_index = j + out_w*(i + out_h*(k + c*b));
+                        //float max = -FLT_MAX;
+                        //int max_i = -1;
+                        __m128 max128 = _mm_set1_ps(-FLT_MAX);
+
+                        for (n = 0; n < size; ++n) {
+                            // The horizontal pair is reduced inside the registers below,
+                            // so the loop over m is replaced by the single value m = 0.
+                            //for (m = 0; m < size; ++m)
+                            m = 0;
+                            {
+                                int cur_h = h_offset + i*stride + n;
+                                int cur_w = w_offset + j*stride + m;
+                                int index = cur_w + w*(cur_h + h*(k + b*c));
+                                int valid = (cur_h >= 0 && cur_h < h &&
+                                    cur_w >= 0 && cur_w < w);
+                                // `continue` applies to the loop over n (there is no m loop here).
+                                if (!valid) continue;
+
+                                // Load 8 consecutive inputs (4 horizontal pairs). After the
+                                // permute + max, elements 0 and 2 of each 128-bit half hold
+                                // the maximum of their pair.
+                                __m256 src256 = _mm256_loadu_ps(&src[index]);
+                                __m256 src256_2 = _mm256_permute_ps(src256, (1 << 0) | (3 << 4));
+                                __m256 max256 = _mm256_max_ps(src256, src256_2);
+
+                                // Gather those four pair maxima into one 128-bit vector.
+                                __m128 src128_0 = _mm256_extractf128_ps(max256, 0);
+                                __m128 src128_1 = _mm256_extractf128_ps(max256, 1);
+                                __m128 src128 = _mm_shuffle_ps(src128_0, src128_1, (2 << 2) | (2 << 6));
+
+                                // Combine with the result of the other window row.
+                                max128 = _mm_max_ps(src128, max128);
+                            }
+                        }
+                        _mm_storeu_ps(&dst[out_index], max128);
+                    }
+                }
+
+                // Scalar path: continues from wherever the SIMD loops stopped (j = 0 if
+                // none ran). Out-of-image positions count as -FLT_MAX.
+                for (; j < out_w; ++j) {
+                    int out_index = j + out_w*(i + out_h*(k + c*b));
+                    float max = -FLT_MAX;
+                    int max_i = -1;
+                    for (n = 0; n < size; ++n) {
+                        for (m = 0; m < size; ++m) {
+                            int cur_h = h_offset + i*stride + n;
+                            int cur_w = w_offset + j*stride + m;
+                            int index = cur_w + w*(cur_h + h*(k + b*c));
+                            int valid = (cur_h >= 0 && cur_h < h &&
+                                cur_w >= 0 && cur_w < w);
+                            float val = (valid != 0) ? src[index] : -FLT_MAX;
+                            max_i = (val > max) ? index : max_i;
+                            max = (val > max) ? val : max;
+                        }
+                    }
+                    dst[out_index] = max;
+                    if (indexes) indexes[out_index] = max_i;
+                }
+            }
+        }
+    }
+}
+
+#else   // AVX
+
+// Non-AVX build: AVX is reported as unavailable.
+int is_avx() {
+    const int avx_available = 0;
+    return avx_available;
+}
+
+// Non-AVX build: FMA/AVX2 is reported as unavailable.
+int is_fma_avx2() {
+    const int fma_avx2_available = 0;
+    return fma_avx2_available;
+}
+
+// Single-threaded reference GEMM: C += ALPHA * A * B.
+//   A is read as A[i*lda + k] (M x K), B as B[k*ldb + j] (K x N),
+//   C is updated as C[i*ldc + j] (M x N).
+// C is accumulated into, not overwritten.
+void gemm_nn(int M, int N, int K, float ALPHA,
+    float *A, int lda,
+    float *B, int ldb,
+    float *C, int ldc)
+{
+    int row, inner, col;
+    for (row = 0; row < M; ++row) {
+        const int a_base = row * lda;
+        const int c_base = row * ldc;
+        for (inner = 0; inner < K; ++inner) {
+            // One scaled element of A is reused across a whole row of B.
+            PUT_IN_REGISTER float scaled_a = ALPHA * A[a_base + inner];
+            const int b_base = inner * ldb;
+            for (col = 0; col < N; ++col) {
+                C[c_base + col] += scaled_a*B[b_base + col];
+            }
+        }
+    }
+}
+
+// OpenMP-parallel GEMM: C += ALPHA * A * B.
+//   A is read as A[i*lda + k] (M x K), B as B[k*ldb + j] (K x N),
+//   C is updated as C[i*ldc + j] (M x N).
+// Rows of C are distributed over threads; each thread touches only its own rows.
+void gemm_nn_fast(int M, int N, int K, float ALPHA,
+    float *A, int lda,
+    float *B, int ldb,
+    float *C, int ldc)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < M; ++i) {
+        // The inner counters are declared inside the parallel loop so that every
+        // thread gets its own copy. Declared at function scope they would be
+        // shared between threads in C, which is a data race.
+        int j, k;
+        for (k = 0; k < K; ++k) {
+            PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k];
+            for (j = 0; j < N; ++j) {
+                C[i*ldc + j] += A_PART*B[k*ldb + j];
+            }
+        }
+    }
+}
+
+// Binary (XNOR + popcount) GEMM on operands packed 32 bits per word.
+//   A is read as A[i*lda + s], B as B[s*ldb + j], both as 32-bit words.
+//   For every word pair the number of equal bits `count` is computed and
+//   C[i*ldc + j] += (2*count - 32) * mean_arr[i].
+// ALPHA is not used. Rows of C are distributed over OpenMP threads.
+void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
+    uint32_t *A, int lda,
+    uint32_t *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < M; ++i) {
+        int col, word;
+        const float row_scale = mean_arr[i];
+        const int c_base = i * ldc;
+
+        for (word = 0; word < K; ++word)
+        {
+            PUT_IN_REGISTER uint32_t a_bits = A[i * lda + word];
+            const int b_base = word * ldb;
+
+            for (col = 0; col < N; ++col)
+            {
+                PUT_IN_REGISTER uint32_t b_bits = B[b_base + col];
+                uint32_t equal_bits = ~(a_bits ^ b_bits);
+                // Signed on purpose: 2*matches - 32 may be negative.
+                int32_t matches = POPCNT(equal_bits);
+
+                C[c_base + col] += (2 * matches - 32) * row_scale;
+            }
+        }
+    }
+}
+
+
+// Direct (non-GEMM) 2D convolution, accumulated into `output`.
+//
+//   w, h     - input width and height; the output is iterated over the same w x h grid
+//   ksize    - kernel edge length;  n - number of filters;  c - input channels
+//   pad      - padding; taps that fall outside the input are skipped
+//   stride   - only used to compute out_h/out_w, which are not used afterwards
+//   weights  - read as weights[((fil*c + chan)*ksize + f_y)*ksize + f_x]
+//   input    - read as input[chan*w*h + y*w + x]
+//   output   - updated as output[fil*w*h + y*w + x] += per-channel sum
+//   mean     - not used
+//
+// Filters are distributed over OpenMP threads.
+void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
+    float *weights, float *input, float *output, float *mean)
+{
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    (void)out_h;
+    (void)out_w;
+
+    int fil;
+    #pragma omp parallel for
+    for (fil = 0; fil < n; ++fil) {
+        int chan, y, x, f_y, f_x;
+
+        for (chan = 0; chan < c; ++chan) {
+            int const weights_base = fil*c*ksize*ksize + chan*ksize*ksize;
+            int const input_base = chan*w*h;
+
+            for (y = 0; y < h; ++y) {
+                for (x = 0; x < w; ++x) {
+                    float acc = 0;
+
+                    // Walk the kernel window; only taps that land inside the image contribute.
+                    for (f_y = 0; f_y < ksize; ++f_y) {
+                        int const src_y = y + f_y - pad;
+
+                        for (f_x = 0; f_x < ksize; ++f_x) {
+                            int const src_x = x + f_x - pad;
+                            int const inside = (src_y >= 0 && src_x >= 0 && src_y < h && src_x < w);
+
+                            if (inside) {
+                                acc += input[input_base + src_y*w + src_x] * weights[weights_base + f_y*ksize + f_x];
+                            }
+                        }
+                    }
+
+                    // Contributions of successive channels add up in the same output cell.
+                    output[fil*w*h + y*w + x] += acc;
+                }
+            }
+        }
+    }
+}
+
+// Binary GEMM on bit-packed operands, B given transposed.
+//   Row i of A starts at bit i*lda, row j of B at bit j*ldb; K bits per row
+//   are compared 64 at a time with XNOR + popcount.
+//   C[i*ldc + j] = (2*count - K) * mean_arr[i], where count is the number of
+//   equal bits. C is overwritten, not accumulated. ALPHA_UNUSED is ignored.
+// Rows of C are distributed over OpenMP threads.
+void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int i;
+
+    #pragma omp parallel for
+    for (i = 0; i < M; ++i) {
+        int j, k;
+        const float row_scale = mean_arr[i];
+
+        for (j = 0; j < N; ++j) {
+            int equal_total = 0;
+
+            k = 0;
+            while (k < K) {
+                uint64_t a_word = *((uint64_t *)(A + (i*lda + k) / 8));
+                uint64_t b_word = *((uint64_t *)(B + (j*ldb + k) / 8));
+                uint64_t same = xnor_int64(a_word, b_word);
+
+                int equal_here = POPCNT64(same);
+                const int bits_left = K - k;
+
+                // Final partial word: subtract the bits that lie beyond K.
+                if (bits_left < 64) equal_here = equal_here - (64 - bits_left);
+
+                equal_total += equal_here;
+                k += 64;
+            }
+
+            C[i*ldc + j] = (2 * equal_total - K) * row_scale;
+        }
+    }
+}
+
+// Non-AVX build: this variant has no implementation. The call only prints a
+// notice to stdout; no argument is read and data_col is not written.
+void im2col_cpu_custom_transpose(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int ldb_align)
+{
+    fputs("\n im2col_cpu_custom_transpose() isn't implemented without AVX \n", stdout);
+}
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+// Non-AVX build: there is no specialised path, so the request is forwarded
+// unchanged to the generic im2col_cpu().
+// (The code that used to follow the unconditional return was unreachable and
+// has been removed.)
+void im2col_cpu_custom(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col)
+{
+    im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
+}
+
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+void im2col_cpu_custom_bin(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int bit_align)
+{
+    // Binarizing im2col: for every row `c` of the column matrix (one kernel
+    // offset of one input channel) this sets bit
+    //     c * bit_align + h * width_col + w
+    // of data_col, viewed as a byte array, whenever the matching input value
+    // is > 0.  Bits are only ever set, never cleared.
+    // NOTE(review): that implies the caller passes a zeroed buffer -- confirm.
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+    unsigned char *const bits = (unsigned char*)data_col;
+    int c;
+
+    // Only the same-size case (stride 1, pad 1) is implemented.
+    if (!(height_col == height && width_col == width && stride == 1 && pad == 1)) {
+        printf("\n Error: is no non-optimized version \n");
+        return;
+    }
+
+    #pragma omp parallel for
+    for (c = 0; c < channels_col; ++c) {
+        const int w_offset = c % ksize;
+        const int h_offset = (c / ksize) % ksize;
+        const int c_im = c / ksize / ksize;
+        int edge_w[2];
+        int edge_h[2];
+        int h, w, side;
+
+        edge_w[0] = 0;
+        edge_w[1] = width_col - 1;
+        edge_h[0] = 0;
+        edge_h[1] = height_col - 1;
+
+        // Interior: every tap lies inside the image, so read the input directly.
+        for (h = pad; h < height_col - pad; ++h) {
+            const int im_row = h_offset + h - pad;
+            for (w = pad; w < width_col - pad; ++w) {
+                const int im_col = w_offset + w - pad;
+                if (data_im[im_col + width*(im_row + height*c_im)] > 0) {
+                    set_bit(bits, c * bit_align + h * width_col + w);
+                }
+            }
+        }
+
+        // Left and right border columns go through im2col_get_pixel, which
+        // receives `pad` and the un-shifted coordinates.
+        for (side = 0; side < 2; ++side) {
+            w = edge_w[side];
+            for (h = 0; h < height_col; ++h) {
+                const float val = im2col_get_pixel(data_im, height, width, channels,
+                    h_offset + h, w_offset + w, c_im, pad);
+                if (val > 0) set_bit(bits, c * bit_align + h * width_col + w);
+            }
+        }
+
+        // Top and bottom border rows, same treatment.
+        for (side = 0; side < 2; ++side) {
+            h = edge_h[side];
+            for (w = 0; w < width_col; ++w) {
+                const float val = im2col_get_pixel(data_im, height, width, channels,
+                    h_offset + h, w_offset + w, c_im, pad);
+                if (val > 0) set_bit(bits, c * bit_align + h * width_col + w);
+            }
+        }
+    }
+}
+
+
+void activate_array_cpu_custom(float *x, const int n, const ACTIVATION a)
+{
+    // Applies activation `a` in place to the first n elements of x.
+    // LINEAR is the identity and LEAKY is handled inline; every other
+    // activation is delegated to activate() one element at a time.
+    int idx;
+
+    switch (a) {
+    case LINEAR:
+        return;
+    case LEAKY:
+        for (idx = 0; idx < n; ++idx) {
+            // Non-positive (and NaN) inputs are scaled by 0.1; the product is
+            // formed in double precision before being stored back as float.
+            if (!(x[idx] > 0)) x[idx] = .1*x[idx];
+        }
+        return;
+    default:
+        for (idx = 0; idx < n; ++idx) {
+            x[idx] = activate(x[idx], a);
+        }
+        return;
+    }
+}
+
+// Packs the signs of `size` floats into a bit array, least-significant bit first:
+// bit (i % 8) of dst[i / 8] is 1 when src[i] > 0 and 0 otherwise.
+//   src  - input values, `size` elements
+//   dst  - output buffer; size / 8 + 1 bytes are zeroed and (partly) filled
+//   size - number of input floats; does not have to be a multiple of 8
+// The earlier version staged the bits in a `size`-byte temporary and then read
+// it eight entries at a time, which ran past the end of that temporary whenever
+// `size` was not a multiple of 8.  Packing straight from src with a bounded
+// tail avoids both the over-read and the allocation.
+void float_to_bit(float *src, unsigned char *dst, size_t size)
+{
+    const size_t dst_size = size / 8 + 1;
+    size_t i;
+
+    memset(dst, 0, dst_size);
+
+    for (i = 0; i < size; i += 8) {
+        const size_t remaining = size - i;
+        const size_t nbits = remaining < 8 ? remaining : 8;   // short final group
+        unsigned char packed = 0;
+        size_t bit;
+        for (bit = 0; bit < nbits; ++bit) {
+            if (src[i + bit] > 0) packed |= (unsigned char)(1u << bit);
+        }
+        dst[i / 8] = packed;
+    }
+}
+
+// Transposes one block_size x block_size tile: element (row, col) of A, whose
+// rows are lda floats apart, is written to (col, row) of B, whose rows are
+// ldb floats apart.
+static inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size)
+{
+    int row, col;
+    for (row = 0; row < block_size; ++row) {
+        const float *src_row = A + row*lda;
+        for (col = 0; col < block_size; ++col) {
+            B[col*ldb + row] = src_row[col];
+        }
+    }
+}
+
+// Out-of-place transpose of the n x m matrix A (row stride lda) into B (row
+// stride ldb), walked in block_size x block_size tiles.  Tiles on the right
+// and bottom edge are clipped to the matrix bounds, so n and m do not have to
+// be multiples of block_size.  Row bands are distributed over OpenMP threads.
+void transpose_block_SSE4x4(float *A, float *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    int tile_row;
+    #pragma omp parallel for
+    for (tile_row = 0; tile_row < n; tile_row += block_size) {
+        const int row_end = (tile_row + block_size < n) ? tile_row + block_size : n;
+        int tile_col;
+        for (tile_col = 0; tile_col < m; tile_col += block_size) {
+            const int col_end = (tile_col + block_size < m) ? tile_col + block_size : m;
+            int r, q;
+            for (r = tile_row; r < row_end; ++r) {
+                for (q = tile_col; q < col_end; ++q) {
+                    B[q*ldb + r] = A[r*lda + q];
+                }
+            }
+        }
+    }
+}
+
+// Scalar max-pooling forward pass over `batch` images of c channels, each h x w.
+// For every output cell the size x size window starting at
+// (i*stride - pad/2, j*stride - pad/2) is scanned; taps outside the image count
+// as -FLT_MAX.  The maximum goes to dst and, when `indexes` is non-NULL, the
+// flat src index of that maximum goes to indexes (-1 if no tap exceeded
+// -FLT_MAX).  Channels are distributed over OpenMP threads.
+void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c,
+    int pad, int stride, int batch)
+{
+    const int w_offset = -pad / 2;
+    const int h_offset = -pad / 2;
+    int b, k;
+
+    for (b = 0; b < batch; ++b) {
+        #pragma omp parallel for
+        for (k = 0; k < c; ++k) {
+            const int plane = k + b*c;          // same value for source and output planes
+            int out_y, out_x, win_y, win_x;
+            for (out_y = 0; out_y < out_h; ++out_y) {
+                for (out_x = 0; out_x < out_w; ++out_x) {
+                    const int out_index = out_x + out_w*(out_y + out_h*plane);
+                    float best = -FLT_MAX;
+                    int best_index = -1;
+                    for (win_y = 0; win_y < size; ++win_y) {
+                        const int cur_h = h_offset + out_y*stride + win_y;
+                        for (win_x = 0; win_x < size; ++win_x) {
+                            const int cur_w = w_offset + out_x*stride + win_x;
+                            const int index = cur_w + w*(cur_h + h*plane);
+                            float val = -FLT_MAX;
+                            if (cur_h >= 0 && cur_h < h && cur_w >= 0 && cur_w < w) {
+                                val = src[index];
+                            }
+                            if (val > best) {
+                                best_index = index;
+                                best = val;
+                            }
+                        }
+                    }
+                    dst[out_index] = best;
+                    if (indexes) indexes[out_index] = best_index;
+                }
+            }
+        }
+    }
+}
+
+#endif    // AVX
+
+
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+// Interleaves channels in groups of 32: within each group the 32 values that
+// belong to one pixel become adjacent in re_packed_input.
+//   source:      input[(group + lane) * w*h + pixel]
+//   destination: re_packed_input[group * w*h + pixel * 32 + lane]
+// Every group reads a full 32 channels, so c is assumed to be a multiple of 32
+// -- NOTE(review): confirm callers guarantee this.
+void repack_input(float *input, float *re_packed_input, int w, int h, int c)
+{
+    const int plane = w * h;
+    int group;
+    for (group = 0; group < c; group += 32) {
+        const float *group_src = input + group*plane;
+        float *group_dst = re_packed_input + group*plane;
+        int pixel, lane;
+        for (pixel = 0; pixel < plane; ++pixel) {
+            for (lane = 0; lane < 32; ++lane) {
+                group_dst[pixel * 32 + lane] = group_src[lane*plane + pixel];
+            }
+        }
+    }
+}
+
+// Transposes a src_h x src_w matrix of 32-bit words.
+//   src_align - row stride of src, in words
+//   dst_align - row stride of dst, in bits (divided by 32 to index words)
+// Word (i, j) of src is stored at dst[j*dst_align/32 + i].
+void transpose_uint32(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
+{
+    int row;
+    for (row = 0; row < src_h; ++row) {       // l.size*l.size*l.c
+        const uint32_t *src_row = src + row*src_align;
+        int col;
+        for (col = 0; col < src_w; ++col) {   // out_h*out_w
+            dst[col*dst_align / 32 + row] = src_row[col];
+        }
+    }
+}
+
+// XNOR/popcount GEMM on 32-bit packed words.
+// For every output C[i][j], each of the K word pairs (A row i, B row j) adds
+//     (2 * popcount(~(a ^ b)) - 32) * mean_arr[i]
+// and the accumulated total is added to C[i*ldc + j].  ALPHA is not used.
+// Rows of C are distributed over OpenMP threads.
+void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA,
+    uint32_t *A, int lda,
+    uint32_t *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int row;
+    #pragma omp parallel for
+    for (row = 0; row < M; ++row) {   // l.n
+        const float scale = mean_arr[row];
+        const uint32_t *a_row = A + row*lda;
+        float *c_row = C + row*ldc;
+        int col, word;
+        for (col = 0; col < N; ++col) {   // out_h*out_w
+            const uint32_t *b_row = B + col*ldb;
+            float acc = 0;
+            for (word = 0; word < K; ++word) {   // l.size*l.size*l.c/32  or (l.size*l.size*l.c)
+                PUT_IN_REGISTER uint32_t a_word = a_row[word];
+                PUT_IN_REGISTER uint32_t b_word = b_row[word];
+                uint32_t agree_bits = ~(a_word ^ b_word);
+                int32_t agree = POPCNT(agree_bits);  // signed, so 2*agree - 32 can go negative
+
+                acc += (2 * agree - 32) * scale;
+            }
+            c_row[col] += acc;
+        }
+    }
+}
+
+// Direct binary convolution on inputs and weights packed 32 channels per word.
+// For each filter, each group of 32 channels and each output pixel (x, y), the
+// size x size window is scanned with top-left corner (x - pad, y - pad); taps
+// outside the image are skipped.  Every in-bounds tap adds
+//     (2 * popcount(~(input_word ^ weight_word)) - 32) * mean_arr[filter]
+// and the per-group sum is added to output[filter*w*h + y*w + x].
+// Filters are distributed over OpenMP threads.
+void convolution_repacked(uint32_t *packed_input, uint32_t *packed_weights, float *output,
+    int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr)
+{
+    int fil;
+    #pragma omp parallel for
+    for (fil = 0; fil < n; ++fil) {
+        const float mean_val = mean_arr[fil];
+        const int channel_groups = c / 32;
+        int chan, y, x, f_y, f_x;
+
+        for (chan = 0; chan < channel_groups; ++chan) {
+            for (y = 0; y < h; ++y) {
+                for (x = 0; x < w; ++x) {
+                    int const output_index = fil*w*h + y*w + x;
+                    float sum = 0;
+
+                    for (f_y = 0; f_y < size; ++f_y) {
+                        const int input_y = y + f_y - pad;
+                        // A row outside the image contributes nothing for any f_x.
+                        if (input_y < 0 || input_y >= h) continue;
+
+                        for (f_x = 0; f_x < size; ++f_x) {
+                            const int input_x = x + f_x - pad;
+                            uint32_t input_word, weight_word, agree_bits;
+                            int32_t agree;   // signed, so 2*agree - 32 can go negative
+
+                            if (input_x < 0 || input_x >= w) continue;
+
+                            input_word = packed_input[chan*w*h + input_y*w + input_x];
+                            weight_word = packed_weights[fil*new_lda / 32 + chan*size*size + f_y*size + f_x];
+
+                            agree_bits = ~(input_word ^ weight_word);
+                            agree = POPCNT(agree_bits);
+                            sum += (2 * agree - 32) * mean_val;
+                        }
+                    }
+                    output[output_index] += sum;
+                }
+            }
+        }
+    }
+}
+
+// C += ALPHA * A * B^T for row-major operands:
+// A is M x K (row stride lda), B is N x K (row stride ldb), C is M x N (row stride ldc).
+void gemm_nt(int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float *C, int ldc)
+{
+    int row, col, inner;
+    for (row = 0; row < M; ++row) {
+        const float *a_row = A + row*lda;
+        for (col = 0; col < N; ++col) {
+            const float *b_row = B + col*ldb;
+            PUT_IN_REGISTER float dot = 0;
+            for (inner = 0; inner < K; ++inner) {
+                dot += ALPHA*a_row[inner]*b_row[inner];
+            }
+            C[row*ldc + col] += dot;
+        }
+    }
+}
+
+// C += ALPHA * A^T * B for row-major operands:
+// A is K x M (row stride lda), B is K x N (row stride ldb), C is M x N (row stride ldc).
+// The scaled A element is computed once and then applied across a whole row of B.
+void gemm_tn(int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float *C, int ldc)
+{
+    int row, inner, col;
+    for (row = 0; row < M; ++row) {
+        float *c_row = C + row*ldc;
+        for (inner = 0; inner < K; ++inner) {
+            PUT_IN_REGISTER float a_scaled = ALPHA * A[inner * lda + row];
+            const float *b_row = B + inner*ldb;
+            for (col = 0; col < N; ++col) {
+                c_row[col] += a_scaled*b_row[col];
+            }
+        }
+    }
+}
+
+// C += ALPHA * A^T * B^T for row-major operands:
+// A is K x M (row stride lda), B is N x K (row stride ldb), C is M x N (row stride ldc).
+void gemm_tt(int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float *C, int ldc)
+{
+    int row, col, inner;
+    for (row = 0; row < M; ++row) {
+        for (col = 0; col < N; ++col) {
+            const float *b_row = B + col*ldb;
+            PUT_IN_REGISTER float dot = 0;
+            for (inner = 0; inner < K; ++inner) {
+                dot += ALPHA*A[row + inner*lda]*b_row[inner];
+            }
+            C[row*ldc + col] += dot;
+        }
+    }
+}
+
+
+// CPU GEMM: C = BETA * C + ALPHA * op(A) * op(B), where op(X) is X^T when the
+// matching TA/TB flag is non-zero and X otherwise.  C is M x N with row stride ldc.
+// C is scaled by BETA first (skipped when BETA == 1).  The non-transposed case
+// goes to gemm_nn_fast() when is_fma_avx2() reports support; every other case is
+// computed one output row at a time, with rows distributed over OpenMP threads.
+void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{
+    int t;
+
+    if (BETA != 1) {
+        int r, q;
+        for (r = 0; r < M; ++r) {
+            float *c_row = C + r*ldc;
+            for (q = 0; q < N; ++q) {
+                c_row[q] *= BETA;
+            }
+        }
+    }
+
+    is_avx();   // initialize static variable
+    if (is_fma_avx2() && !TA && !TB) {
+        gemm_nn_fast(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
+        return;
+    }
+
+    #pragma omp parallel for
+    for (t = 0; t < M; ++t) {
+        float *c_row = C + t*ldc;
+        // Output row t needs row t of A, or column t when A is transposed.
+        if (TA) {
+            if (TB) gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, c_row, ldc);
+            else    gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, c_row, ldc);
+        }
+        else {
+            if (TB) gemm_nt(1, N, K, ALPHA, A + t*lda, lda, B, ldb, c_row, ldc);
+            else    gemm_nn(1, N, K, ALPHA, A + t*lda, lda, B, ldb, c_row, ldc);
+        }
+    }
+}
+
+#ifdef GPU
+
+#include <math.h>
+
+// Single-precision GEMM on device buffers through cuBLAS.
+// The call is bound to the stream returned by get_cuda_stream(), and both cuBLAS
+// status codes are passed to CHECK_CUDA.
+// The operands reach cublasSgemm in swapped order (B before A, N before M, TB
+// before TA).  NOTE(review): this looks like the usual way of driving
+// column-major cuBLAS with row-major matrices (C^T = op(B)^T * op(A)^T) -- confirm.
+void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A_gpu, int lda,
+        float *B_gpu, int ldb,
+        float BETA,
+        float *C_gpu, int ldc)
+{
+    cublasHandle_t handle = blas_handle();
+    cudaError_t stream_status = (cudaError_t)cublasSetStream(handle, get_cuda_stream());
+    CHECK_CUDA(stream_status);
+    cudaError_t status = (cudaError_t)cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N),
+            (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
+    CHECK_CUDA(status);
+}
+
+// Host-side convenience wrapper around gemm_ongpu(): uploads A, B and C into
+// temporary device arrays, runs the GEMM, copies C back to the host and frees
+// the device arrays.  Element counts come from the leading dimension times the
+// number of stored rows (K or M for A, N or K for B, M for C).
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc)
+{
+    const int a_count = TA ? lda*K : lda*M;
+    const int b_count = TB ? ldb*N : ldb*K;
+    const int c_count = ldc*M;
+
+    float *dev_a = cuda_make_array(A, a_count);
+    float *dev_b = cuda_make_array(B, b_count);
+    float *dev_c = cuda_make_array(C, c_count);
+
+    gemm_ongpu(TA, TB, M, N, K, ALPHA, dev_a, lda, dev_b, ldb, BETA, dev_c, ldc);
+
+    cuda_pull_array(dev_c, C, c_count);
+    cuda_free(dev_a);
+    cuda_free(dev_b);
+    cuda_free(dev_c);
+}
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+// Times 32 calls of gemm_gpu() (host-to-device copies included) on random
+// operands shaped for the requested transposition flags, then prints the
+// elapsed clock() time in seconds.
+void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
+{
+    float *a = TA ? random_matrix(k,m) : random_matrix(m,k);
+    const int lda = TA ? m : k;
+    float *b = TB ? random_matrix(n,k) : random_matrix(k,n);
+    const int ldb = TB ? k : n;
+    float *c = random_matrix(m,n);
+    clock_t start, end;
+    int rep;
+
+    start = clock();
+    for (rep = 0; rep < 32; ++rep) {
+        gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
+    }
+    end = clock();
+
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
+    free(a);
+    free(b);
+    free(c);
+}
+
+// Benchmarks gemm_ongpu() on operands that already live on the device:
+// runs 10 synchronized iterations and prints the elapsed time together with
+// the achieved GFLOPS, counting m*n*(2k + 2) floating-point operations per call.
+void time_ongpu(int TA, int TB, int m, int k, int n)
+{
+    const int iter = 10;
+    const int lda = TA ? m : k;
+    const int ldb = TB ? k : n;
+
+    float *a = random_matrix(m,k);
+    float *b = random_matrix(k,n);
+    float *c = random_matrix(m,n);
+
+    float *a_cl = cuda_make_array(a, m*k);
+    float *b_cl = cuda_make_array(b, k*n);
+    float *c_cl = cuda_make_array(c, m*n);
+
+    double flop, gflop, seconds;
+    clock_t start, end;
+    int rep;
+
+    start = clock();
+    for (rep = 0; rep < iter; ++rep) {
+        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
+        cudaDeviceSynchronize();   // wait for the GPU to finish this iteration before the next starts
+    }
+    flop = ((double)m)*n*(2.*k + 2.)*iter;
+    gflop = flop/pow(10., 9);
+    end = clock();
+    seconds = sec(end-start);
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
+
+    cuda_free(a_cl);
+    cuda_free(b_cl);
+    cuda_free(c_cl);
+    free(a);
+    free(b);
+    free(c);
+}
+
+
+// Runs the same GEMM through gemm_gpu() and gemm_cpu() on identical random
+// inputs (seeded with srand(0)), starting from zeroed outputs, and prints the
+// mean squared difference between the two results.
+void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
+{
+    float *a, *b, *c, *c_gpu;
+    int lda, ldb, idx;
+    double sse = 0;
+
+    srand(0);
+    a = TA ? random_matrix(k,m) : random_matrix(m,k);
+    lda = TA ? m : k;
+    b = TB ? random_matrix(n,k) : random_matrix(k,n);
+    ldb = TB ? k : n;
+
+    // The outputs are allocated through random_matrix() and then cleared.
+    c = random_matrix(m,n);
+    c_gpu = random_matrix(m,n);
+    memset(c, 0, m*n*sizeof(float));
+    memset(c_gpu, 0, m*n*sizeof(float));
+
+    gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
+    gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
+
+    for (idx = 0; idx < m*n; ++idx) {
+        sse += pow(c[idx]-c_gpu[idx], 2);
+    }
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
+
+    free(a);
+    free(b);
+    free(c);
+    free(c_gpu);
+}
+
+// Runs the fixed set of time_ongpu() benchmarks, in order, and returns 0.
+// Each table row is {TA, TB, m, k, n}.  (The accuracy checks and extra
+// benchmark shapes that used to sit here were already commented out.)
+int test_gpu_blas()
+{
+    static const int runs[][5] = {
+        {0, 0, 64, 75, 12544},
+        {0, 0, 64, 75, 12544},
+        {0, 0, 64, 75, 12544},
+        {0, 0, 64, 576, 12544},
+        {0, 0, 256, 2304, 784},
+        {1, 1, 2304, 256, 784},
+        {0, 0, 512, 4608, 196},
+        {1, 1, 4608, 512, 196},
+    };
+    const int run_count = (int)(sizeof(runs) / sizeof(runs[0]));
+    int r;
+
+    for (r = 0; r < run_count; ++r) {
+        time_ongpu(runs[r][0], runs[r][1], runs[r][2], runs[r][3], runs[r][4]);
+    }
+
+    return 0;
+}
+#endif
+
+
+
+// Calls both CPU feature probes once and discards their results.
+// NOTE(review): presumably this primes cached detection state inside
+// is_avx() / is_fma_avx2() so later calls are cheap -- confirm in their definitions.
+void init_cpu() {
+    is_avx();
+    is_fma_avx2();
+}
diff --git a/darknet-master/src/gemm.h b/darknet-master/src/gemm.h
new file mode 100644
index 0000000..de8a5e1
--- /dev/null
+++ b/darknet-master/src/gemm.h
@@ -0,0 +1,119 @@
+#ifndef GEMM_H
+#define GEMM_H
+#include "activations.h"
+#include <stdint.h>
+#include <stddef.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
+    float *weights, float *input, float *output, float *mean);
+
+// Sets bit `index` of the bit array dst; bit 0 is the least-significant bit of dst[0].
+static inline void set_bit(unsigned char *const dst, size_t index) {
+    const size_t byte_pos = index >> 3;          // index / 8
+    const unsigned int bit_pos = index & 7u;     // index % 8
+    dst[byte_pos] |= (unsigned char)(1u << bit_pos);
+}
+
+// Returns 1 when bit `index` of the bit array src is set and 0 otherwise;
+// bit 0 is the least-significant bit of src[0].
+static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
+    const unsigned int byte_val = src[index >> 3];   // index / 8
+    return (unsigned char)((byte_val >> (index & 7u)) & 1u);
+}
+
+// CPU feature probes.  gemm_cpu() only takes its gemm_nn_fast() path when
+// is_fma_avx2() returns non-zero.
+int is_avx();
+int is_fma_avx2();
+
+// Packs the signs of `size` floats into bits: a bit is 1 when src[i] > 0.
+void float_to_bit(float *src, unsigned char *dst, size_t size);
+
+// Out-of-place transpose of the n x m float matrix A (row stride lda) into B
+// (row stride ldb), processed in block_size x block_size tiles.
+void transpose_block_SSE4x4(float *A, float *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size);
+
+void transpose_bin(uint32_t *A, uint32_t *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size);
+
+void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr);
+
+void im2col_cpu_custom(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col);
+
+void im2col_cpu_custom_align(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int bit_align);
+
+// Binarizing im2col: sets one bit of data_col (viewed as bytes) for every
+// column-matrix entry whose input value is > 0; bit_align is the per-row bit stride.
+void im2col_cpu_custom_bin(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int bit_align);
+
+void im2col_cpu_custom_transpose(float* data_im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float* data_col, int ldb_align);
+
+// Applies activation `a` in place to the first n elements of x.
+void activate_array_cpu_custom(float *x, const int n, const ACTIVATION a);
+
+void transpose_32x32_bits_reversed_diagonale(uint32_t *A, uint32_t *B, int m, int n);
+
+void gemm_bin(int M, int N, int K, float ALPHA,
+        char  *A, int lda,
+        float *B, int ldb,
+        float *C, int ldc);
+
+// Interleaves channels in groups of 32 so the 32 values of one pixel are adjacent.
+void repack_input(float *input, float *re_packed_input, int w, int h, int c);
+
+// Direct XNOR/popcount convolution on 32-channel packed words; accumulates into output.
+void convolution_repacked(uint32_t *packed_input, uint32_t *packed_weights, float *output,
+    int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr);
+
+void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
+    uint32_t *A, int lda,
+    uint32_t *B, int ldb,
+    float *C, int ldc, float *mean_arr);
+
+// Transposes a src_h x src_w matrix of 32-bit words; src_align is in words,
+// dst_align in bits.
+void transpose_uint32(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align);
+
+// XNOR/popcount GEMM on 32-bit packed words, scaled per output row by mean_arr;
+// accumulates into C.
+void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA,
+    uint32_t *A, int lda,
+    uint32_t *B, int ldb,
+    float *C, int ldc, float *mean_arr);
+
+
+// Max-pooling forward pass; writes maxima to dst and, when indexes is non-NULL,
+// the flat source index of each maximum.
+void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c,
+    int pad, int stride, int batch);
+
+
+void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
+                    float *A, int lda,
+                    float *B, int ldb,
+                    float BETA,
+                    float *C, int ldc);
+
+// CPU GEMM: C = BETA * C + ALPHA * op(A) * op(B); TA / TB select transposition.
+void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc);
+
+#ifdef GPU
+// GEMM on buffers that already reside on the device (cuBLAS).
+void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A_gpu, int lda,
+        float *B_gpu, int ldb,
+        float BETA,
+        float *C_gpu, int ldc);
+
+// GEMM on host buffers: copies the operands to the device, runs gemm_ongpu(),
+// then copies C back.
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
+        float *A, int lda,
+        float *B, int ldb,
+        float BETA,
+        float *C, int ldc);
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/gettimeofday.c b/darknet-master/src/gettimeofday.c
new file mode 100644
index 0000000..74f6789
--- /dev/null
+++ b/darknet-master/src/gettimeofday.c
@@ -0,0 +1,43 @@
+#ifdef _MSC_VER
+#include "gettimeofday.h"
+
+// gettimeofday() replacement for MSVC builds.
+// Fills *tp from the current system time: tv_sec holds seconds since the Unix
+// epoch and tv_usec holds the millisecond field converted to microseconds.
+// tzp is ignored.  Always returns 0.
+int gettimeofday(struct timeval* tp, struct timezone* tzp)
+{
+  // FILETIME value (100 ns ticks) subtracted to rebase onto the Unix epoch.
+  static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL);
+  SYSTEMTIME system_time;
+  FILETIME file_time;
+  uint64_t ticks;
+
+  GetSystemTime(&system_time);
+  SystemTimeToFileTime(&system_time, &file_time);
+
+  // Join the two 32-bit halves into one 64-bit tick count.
+  ticks = (((uint64_t)file_time.dwHighDateTime) << 32) + ((uint64_t)file_time.dwLowDateTime);
+
+  /* converting file time to unix epoch: 10,000,000 ticks per second */
+  tp->tv_sec = (long)((ticks - EPOCH) / 10000000L);
+  tp->tv_usec = (long)(system_time.wMilliseconds * 1000);
+  return 0;
+}
+
+// clock_gettime() replacement for MSVC builds, based on the performance counter.
+// The clock id (`dummy`) is not read.  The counter frequency is queried on the
+// first call and kept in g_counts_per_sec.  Returns 0 on success and -1 when ct
+// is NULL, the frequency is unavailable, or the counter cannot be read.
+int clock_gettime(int dummy, struct timespec* ct)
+{
+  LARGE_INTEGER count;
+
+  if (g_first_time) {
+    g_first_time = 0;
+    if (!QueryPerformanceFrequency(&g_counts_per_sec)) {
+      g_counts_per_sec.QuadPart = 0;   // marks the frequency as unusable
+    }
+  }
+
+  if (ct == NULL) return -1;
+  if (g_counts_per_sec.QuadPart <= 0) return -1;
+  if (!QueryPerformanceCounter(&count)) return -1;
+
+  // Whole seconds, then the remaining ticks scaled to nanoseconds.
+  ct->tv_sec = count.QuadPart / g_counts_per_sec.QuadPart;
+  ct->tv_nsec = ((count.QuadPart % g_counts_per_sec.QuadPart) * BILLION) / g_counts_per_sec.QuadPart;
+
+  return 0;
+}
+#endif
diff --git a/darknet-master/src/gettimeofday.h b/darknet-master/src/gettimeofday.h
new file mode 100644
index 0000000..86fef10
--- /dev/null
+++ b/darknet-master/src/gettimeofday.h
@@ -0,0 +1,38 @@
+#ifdef _MSC_VER
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winsock2.h>
+#include <stdint.h>
+#include <time.h>
+#include "darknet.h"
+
+// Clock id provided for source compatibility; the clock_gettime() shim in
+// gettimeofday.c does not read its clock-id argument.
+#define CLOCK_REALTIME (1)
+// Nanoseconds per second.  Written as a floating-point literal, so expressions
+// that use it are evaluated in double precision.
+#define BILLION (1E9)
+
+#ifndef timersub
+// result = a - b for struct timeval pointers.  When the microsecond difference
+// is negative, one second is borrowed so tv_usec ends up non-negative.
+#define timersub(a, b, result)                       \
+  do {                                               \
+    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;    \
+    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+    if ((result)->tv_usec < 0) {                     \
+      --(result)->tv_sec;                            \
+      (result)->tv_usec += 1000000;                  \
+    }                                                \
+  } while (0)
+#endif // timersub
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// State used by clock_gettime(): g_first_time is cleared on the first call, when
+// g_counts_per_sec is filled in from QueryPerformanceFrequency().
+// NOTE(review): being `static` in a header, every translation unit that includes
+// this file gets its own private copy of both variables.
+static unsigned char g_first_time = 1;
+static LARGE_INTEGER g_counts_per_sec;
+
+// POSIX-style time functions implemented in gettimeofday.c for MSVC builds.
+int gettimeofday(struct timeval*, struct timezone*);
+int clock_gettime(int, struct timespec*);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/go.c b/darknet-master/src/go.c
new file mode 100644
index 0000000..5107125
--- /dev/null
+++ b/darknet-master/src/go.c
@@ -0,0 +1,849 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+#include "option_list.h"
+#include "blas.h"
+
+
+int inverted = 1; /* nonzero: rows are numbered 19..1 from the top (see print_board/test_go), otherwise 1..19 */
+int noi = 1; /* nonzero: column letters skip 'I', so the ninth column is shown as 'J' */
+// n_ind: how many top-ranked candidate moves are tracked (was: static const unsigned int n_ind = 5;)
+#define n_ind 5
+
+typedef struct {
+    char **data; /* array of record pointers, one heap block per record */
+    int n; /* number of records in data (load_go_moves also uses it as the capacity while filling) */
+} moves;
+
+/* Read one fixed-size 94-byte record from fp. Returns a heap buffer the caller
+ * must free, or 0 at end of file or when a full record cannot be read. */
+char *fgetgo(FILE *fp)
+{
+    const size_t record_len = 94;
+    char *record;
+    if(feof(fp)) return 0;
+    record = (char*)xmalloc(record_len * sizeof(char));
+    if(fread(record, sizeof(char), record_len, fp) == record_len) return record;
+    free(record);
+    return 0;
+}
+
+/* Load every 94-byte record of `filename` (read via fgetgo) into a moves array and
+ * print the record count to stdout. Exits with an error if the file cannot be opened. */
+moves load_go_moves(char *filename)
+{
+    moves m;
+    int count = 0;
+    char *line = 0;
+    FILE *fp = fopen(filename, "rb");
+    /* fgetgo() calls feof() on the stream, so a failed open must stop here */
+    if(!fp){ perror(filename); exit(EXIT_FAILURE); }
+    m.n = 128;
+    m.data = (char**)xcalloc(128, sizeof(char*));
+    while((line = fgetgo(fp))){
+        if(count >= m.n){ m.n *= 2; m.data = (char**)xrealloc(m.data, m.n * sizeof(char*)); }
+        m.data[count++] = line;
+    }
+    printf("%d\n", count);
+    m.n = count;
+    if(count > 0) m.data = (char**)xrealloc(m.data, count * sizeof(char*)); /* shrink to fit; never request zero bytes */
+    fclose(fp);
+    return m;
+}
+
+/* Unpack a packed position into 19*19 floats.
+ * s holds four cells per byte, two bits per cell, lowest bits first:
+ * low bit set -> 1, otherwise high bit set -> -1, neither -> 0.
+ * Reads s[0..90] and writes board[0..360]. */
+void string_to_board(char *s, float *board)
+{
+    int cell;
+
+    /* every cell is assigned below, so board needs no clearing first */
+    for(cell = 0; cell < 19*19; ++cell){
+        /* shift this cell's two bits down to positions 0 and 1 */
+        int pair = s[cell/4] >> (2*(cell%4));
+        float value = 0;
+        if(pair & 1) value = 1;
+        else if(pair & 2) value = -1;
+        board[cell] = value;
+    }
+}
+
+/* Pack 19*19 floats into 91 bytes, four cells per byte, two bits per cell:
+ * a cell equal to 1 sets the low bit of its pair, a cell equal to -1 sets
+ * the high bit; any other value leaves both bits clear. */
+void board_to_string(char *s, float *board)
+{
+    int cell;
+
+    memset(s, 0, (19*19/4+1)*sizeof(char));
+    for(cell = 0; cell < 19*19; ++cell){
+        /* position of this cell's low bit inside its byte */
+        int low_bit = 2*(cell%4);
+        char *byte = s + cell/4;
+        if(board[cell] == 1) *byte = *byte | (1 << low_bit);
+        if(board[cell] == -1) *byte = *byte | (1 << (low_bit + 1));
+    }
+}
+
+/* Build n training examples from randomly chosen records of m. For example i,
+ * labels gets a single 1 at the record's (row, col) and boards gets the unpacked
+ * position with that cell zeroed; both planes then get the same random flip/rotation. */
+void random_go_moves(moves m, float *boards, float *labels, int n)
+{
+    int i;
+    memset(labels, 0, 19*19*n*sizeof(float));
+    for(i = 0; i < n; ++i){
+        float *plane_in = boards + i*19*19;
+        char *record = m.data[rand()%m.n];
+        int row = record[0], col = record[1];
+        int target = col + 19*(row + i*19);
+        labels[target] = 1;
+        string_to_board(record+2, plane_in);
+        boards[target] = 0;
+        int flip = rand()%2;
+        int rotate = rand()%4;
+        image in = float_to_image(19, 19, 1, plane_in);
+        image out = float_to_image(19, 19, 1, labels + i*19*19);
+        if(flip){ flip_image(in); flip_image(out); }
+        rotate_image_cw(in, rotate);
+        rotate_image_cw(out, rotate);
+    }
+}
+
+
+/* Train the network described by cfgfile (optionally resuming from weightfile)
+ * on the records of backup/go.train. Progress is printed once per batch; weights
+ * are saved under backup/ each epoch, every 100 and 10000 batches, and at the end. */
+void train_go(char *cfgfile, char *weightfile)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile) load_weights(&net, weightfile);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+
+    char* backup_directory = "backup/";
+    char buff[256];
+    float* board = (float*)xcalloc(19 * 19 * net.batch, sizeof(float));
+    float* move = (float*)xcalloc(19 * 19 * net.batch, sizeof(float));
+    moves m = load_go_moves("backup/go.train");
+    int i, N = m.n;
+    if(N <= 0){
+        /* N is a divisor below and random_go_moves() computes rand()%m.n */
+        fprintf(stderr, "train_go: no positions loaded from backup/go.train\n");
+        exit(EXIT_FAILURE);
+    }
+    int epoch = (*net.seen)/N;
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        clock_t time=clock();
+        random_go_moves(m, board, move, net.batch);
+        float loss = train_network_datum(net, board, move) / net.batch;
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.95 + loss*.05;
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            sprintf(buff, "%s/%s_%d.weights", backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){
+            sprintf(buff, "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%10000 == 0){
+            sprintf(buff, "%s/%s_%d.backup",backup_directory,base,get_current_batch(net));
+            save_weights(net, buff);
+        }
+    }
+
+    sprintf(buff, "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+    free_network(net);
+    free(base);
+    free(board);
+    free(move);
+    for(i = 0; i < N; ++i) free(m.data[i]);
+    free(m.data);
+}
+
+/* Flood fill from (row,col) over connected cells equal to `side`: every cell
+ * reached that is not yet marked in visited[] is marked and has lib[] raised by 1. */
+void propagate_liberty(float *board, int *lib, int *visited, int row, int col, int side)
+{
+    static const int drow[4] = {1, -1, 0, 0};
+    static const int dcol[4] = {0, 0, 1, -1};
+    int index = row*19 + col, d;
+    if (row < 0 || row >= 19 || col < 0 || col >= 19) return;
+    if (visited[index] || board[index] != side) return;
+    visited[index] = 1;
+    ++lib[index];
+    for (d = 0; d < 4; ++d) propagate_liberty(board, lib, visited, row + drow[d], col + dcol[d], side);
+}
+
+
+/* Return a new 19*19 int array (caller frees). For each empty point, every stone of
+ * each directly adjacent group has its entry incremented once. */
+int *calculate_liberties(float *board)
+{
+    int* lib = (int*)xcalloc(19 * 19, sizeof(int));
+    int visited[361];
+    int index;
+    for(index = 0; index < 19*19; ++index){
+        int j = index / 19, i = index % 19;
+        if(board[index] != 0) continue;
+        /* marks are only needed for empty points; clear them per point so a group touching it twice counts once */
+        memset(visited, 0, sizeof(visited));
+        if ((i > 0)  && board[index - 1]) propagate_liberty(board, lib, visited, j, i-1, board[index-1]);
+        if ((i < 18) && board[index + 1]) propagate_liberty(board, lib, visited, j, i+1, board[index+1]);
+        if ((j > 0)  && board[index - 19]) propagate_liberty(board, lib, visited, j-1, i, board[index-19]);
+        if ((j < 18) && board[index + 19]) propagate_liberty(board, lib, visited, j+1, i, board[index+19]);
+    }
+    return lib;
+}
+
+/* Draw the 19x19 position on stderr, with a column-letter header and a row
+ * number in front of every row.
+ *   board   - 361 cell values; the sign of board[index]*-swap selects the glyph
+ *             (positive " O", negative " X", zero two spaces)
+ *   swap    - multiplier applied (negated) to each cell before the sign test
+ *   indexes - optional array of n_ind cell indexes; a cell listed at position n
+ *             is drawn as the number n+1 instead of a stone. May be null. */
+void print_board(float *board, int swap, int *indexes)
+{
+    static const char *rank_label[5] = {" 1", " 2", " 3", " 4", " 5"};
+    FILE *stream = stderr;
+    int i, j, n;
+
+    /* header: column letters, skipping 'I' when noi is set */
+    fprintf(stream, "\n\n");
+    fprintf(stream, "   ");
+    for(i = 0; i < 19; ++i){
+        fprintf(stream, "%c ", 'A' + i + 1*(i > 7 && noi));
+    }
+    fprintf(stream, "\n");
+
+    /* one text row per board row, prefixed with its number */
+    for(j = 0; j < 19; ++j){
+        fprintf(stream, "%2d", (inverted) ? 19-j : j+1);
+        for(i = 0; i < 19; ++i){
+            int index = j*19 + i;
+            float shown = board[index]*-swap;
+            int ranked = 0;
+
+            /* a cell listed more than once gets one label per listing */
+            for (n = 0; indexes && n < n_ind; ++n) {
+                if(index != indexes[n]) continue;
+                ranked = 1;
+                if(n < 5) fputs(rank_label[n], stream);
+            }
+            if(ranked) continue;
+
+            /* ordinary cell: glyph chosen by sign */
+            if(shown > 0) fprintf(stream, " O");
+            else if(shown < 0) fprintf(stream, " X");
+            else fprintf(stream, "  ");
+        }
+        fprintf(stream, "\n");
+    }
+}
+
+/* Negate every one of the 19*19 cells in place. */
+void flip_board(float *board)
+{
+    float *cell = board;
+    float *end = board + 19*19;
+    for(; cell != end; ++cell) *cell = -*cell;
+}
+
+/* Run the net on board and store its 19*19 output in move. With multi set, the
+ * board is also shown in seven rotated/flipped views, each output gets the inverse
+ * flip/rotation, and the eight results are averaged. Occupied points end up 0. */
+void predict_move(network net, float *board, float *move, int multi)
+{
+    int i;
+    float *output = network_predict(net, board);
+    copy_cpu(19*19, output, 1, move, 1);
+    if(multi){
+        image board_view = float_to_image(19, 19, 1, board);
+        for(i = 1; i < 8; ++i){
+            int mirrored = (i >= 4);
+            /* transform the board in place, predict, then undo the transform on the output */
+            rotate_image_cw(board_view, i);
+            if(mirrored) flip_image(board_view);
+            float *view_out = network_predict(net, board);
+            image out_view = float_to_image(19, 19, 1, view_out);
+            if(mirrored) flip_image(out_view);
+            rotate_image_cw(out_view, -i);
+            axpy_cpu(19*19, 1, view_out, 1, move, 1);
+            /* put the caller's board back the way it was */
+            if(mirrored) flip_image(board_view);
+            rotate_image_cw(board_view, -i);
+        }
+        scal_cpu(19*19, 1./8., move, 1);
+    }
+    for(i = 0; i < 19*19; ++i) if(board[i]) move[i] = 0;
+}
+
+/* Clear the stone at (r,c) if it equals p and its lib entry is exactly 1, then do the same for its four neighbours. */
+void remove_connected(float *b, int *lib, int p, int r, int c)
+{
+    int at = r*19 + c;
+    if (r < 0 || r > 18 || c < 0 || c > 18 || b[at] != p || lib[at] != 1) return;
+    b[at] = 0;
+    remove_connected(b, lib, p, r+1, c);
+    remove_connected(b, lib, p, r-1, c);
+    remove_connected(b, lib, p, r, c+1);
+    remove_connected(b, lib, p, r, c-1);
+}
+
+
+/* Put a stone of player p at (r,c), then remove adjacent -p stones whose pre-move lib entry is 1. Off-board (r,c) is ignored. */
+void move_go(float *b, int p, int r, int c)
+{
+    if (r < 0 || r >= 19 || c < 0 || c >= 19) return; /* callers pass parsed input; never write outside the board */
+    int k, *l = calculate_liberties(b);
+    b[r*19 + c] = p;
+    for (k = 1; k >= -1; k -= 2) remove_connected(b, l, -p, r+k, c); /* below, then above */
+    for (k = 1; k >= -1; k -= 2) remove_connected(b, l, -p, r, c+k); /* right, then left */
+    free(l);
+}
+
+/* Neighbour test for player p at (r,c): off-board -> 0; a stone equal to -p -> 1
+ * only when its lib entry is at most 1; an empty point -> 1; any other stone -> 1
+ * only when its lib entry is above 1. */
+int makes_safe_go(float *b, int *lib, int p, int r, int c){
+    int at = r*19 + c;
+    if (r < 0 || r > 18 || c < 0 || c > 18) return 0;
+    if (b[at] == -p) return !(lib[at] > 1);
+    if (b[at] == 0) return 1;
+    return lib[at] > 1;
+}
+
+/* Return 1 when none of the four neighbours of (r,c) passes makes_safe_go() for player p, else 0. */
+int suicide_go(float *b, int p, int r, int c)
+{
+    int *l = calculate_liberties(b);
+    int safe = makes_safe_go(b, l, p, r+1, c)
+            || makes_safe_go(b, l, p, r-1, c)
+            || makes_safe_go(b, l, p, r, c+1)
+            || makes_safe_go(b, l, p, r, c-1);
+    free(l);
+    return !safe;
+}
+
+/* Return 0 if (r,c) is occupied or if playing there would reproduce the packed position ko, else 1; b is restored before returning. */
+int legal_go(float *b, char *ko, int p, int r, int c)
+{
+    char before[91];
+    char after[91];
+    if (b[r*19 + c]) return 0;
+    board_to_string(before, b);
+    move_go(b, p, r, c);
+    board_to_string(after, b);
+    string_to_board(before, b);
+    return memcmp(after, ko, 91) != 0;
+}
+
+/* Choose a move for `player` on board; returns the cell index (row*19 + col) or -1.
+ * The net's scores are zeroed at points rejected by legal_go() and below thresh, then
+ * one move is drawn with sample_array(). -1 is returned when the top-scoring move
+ * fails suicide_go(); a drawn move that fails it is replaced by the top-scoring one. */
+int generate_move(network net, int player, float *board, int multi, float thresh, float temp, char *ko, int print)
+{
+    int i, j;
+    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
+
+    float move[361];
+    /* predict on a colour-flipped board for player -1, then flip it back */
+    if (player < 0) flip_board(board);
+    predict_move(net, board, move, multi);
+    if (player < 0) flip_board(board);
+
+    for(i = 0; i < 19; ++i){
+        for(j = 0; j < 19; ++j){
+            if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0;
+        }
+    }
+
+    int indexes[n_ind];
+    top_k(move, 19*19, n_ind, indexes);
+    /* when thresh exceeds move[indexes[0]], use move[indexes[n_ind-1]] as the cutoff instead */
+    if(thresh > move[indexes[0]]) thresh = move[indexes[n_ind-1]];
+    for(i = 0; i < 19; ++i){
+        for(j = 0; j < 19; ++j){
+            if (move[i*19 + j] < thresh) move[i*19 + j] = 0;
+        }
+    }
+
+    int max = max_index(move, 19*19);
+    int row = max / 19;
+    int col = max % 19;
+    int index = sample_array(move, 19*19);
+    if(print){
+        top_k(move, 19*19, n_ind, indexes);
+        for(i = 0; i < n_ind; ++i){
+            if (!move[indexes[i]]) indexes[i] = -1;
+        }
+        print_board(board, player, indexes);
+        for(i = 0; i < n_ind; ++i){
+            /* entries blanked to -1 above had a zero score; never index move[] with -1 */
+            fprintf(stderr, "%d: %f\n", i+1, (indexes[i] >= 0) ? move[indexes[i]] : 0.f);
+        }
+    }
+    if(suicide_go(board, player, row, col)) return -1;
+    if(suicide_go(board, player, index/19, index%19)) index = max;
+    return index;
+}
+
+/* Measure top-1 accuracy of the net on the records of backup/go.test,
+ * printing the running accuracy after every record. */
+void valid_go(char *cfgfile, char *weightfile, int multi)
+{
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile) load_weights(&net, weightfile);
+    set_batch_network(&net, 1);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    float* board = (float*)xcalloc(19 * 19, sizeof(float));
+    float* move = (float*)xcalloc(19 * 19, sizeof(float));
+    moves m = load_go_moves("backup/go.test");
+    int N = m.n, i, correct = 0;
+    for(i = 0; i <N; ++i){
+        char *b = m.data[i];
+        int row = b[0], col = b[1];
+        int truth = col + 19*row;
+        string_to_board(b+2, board);
+        predict_move(net, board, move, multi);
+        int index = max_index(move, 19*19);
+        if(index == truth) ++correct;
+        printf("%d Accuracy %f\n", i, (float) correct/(i+1));
+    }
+    /* release everything acquired above: the records, base and the net were never freed */
+    for(i = 0; i < N; ++i) free(m.data[i]);
+    free(m.data);
+    free(board);
+    free(move);
+    free(base);
+    free_network(net);
+}
+
+/* Minimal GTP (Go Text Protocol) command loop: reads commands from stdin and
+ * replies on stdout until "quit" or end of input. Only 19x19 is accepted.
+ * `one`/`two` hold the packed positions after the latest and the previous
+ * move; `two` is handed to generate_move() as the ko position. */
+void engine_go(char *filename, char *weightfile, int multi)
+{
+    network net = parse_network_cfg(filename);
+    if(weightfile) load_weights(&net, weightfile);
+    srand(time(0));
+    set_batch_network(&net, 1);
+    float* board = (float*)xcalloc(19 * 19, sizeof(float));
+    char* one = (char*)xcalloc(91, sizeof(char));
+    char* two = (char*)xcalloc(91, sizeof(char));
+    int passed = 0;
+    while(1){
+        char buff[256];
+        int id = 0;
+        int has_id = (scanf("%d", &id) == 1);
+        scanf("%255s", buff); /* width-limited: every %s in this loop reads into a 256-byte buffer */
+        if (feof(stdin)) break;
+        char ids[256];
+        sprintf(ids, "%d", id);
+        if (!has_id) ids[0] = 0;
+        if (!strcmp(buff, "protocol_version")){
+            printf("=%s 2\n\n", ids);
+        } else if (!strcmp(buff, "name")){
+            printf("=%s DarkGo\n\n", ids);
+        } else if (!strcmp(buff, "version")){
+            printf("=%s 1.0\n\n", ids);
+        } else if (!strcmp(buff, "known_command")){
+            char comm[256] = {0};
+            scanf("%255s", comm);
+            int known = (!strcmp(comm, "protocol_version") || !strcmp(comm, "name") ||
+                    !strcmp(comm, "version") || !strcmp(comm, "known_command") ||
+                    !strcmp(comm, "list_commands") || !strcmp(comm, "quit") ||
+                    !strcmp(comm, "boardsize") || !strcmp(comm, "clear_board") ||
+                    !strcmp(comm, "komi") || !strcmp(comm, "final_status_list") ||
+                    !strcmp(comm, "play") || !strcmp(comm, "genmove"));
+            printf("=%s %s\n\n", ids, known ? "true" : "false");
+        } else if (!strcmp(buff, "list_commands")){
+            printf("=%s protocol_version\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove\nfinal_status_list\n\n", ids);
+        } else if (!strcmp(buff, "quit")){
+            break;
+        } else if (!strcmp(buff, "boardsize")){
+            int boardsize = 0;
+            scanf("%d", &boardsize);
+            if(boardsize != 19){
+                printf("?%s unacceptable size\n\n", ids);
+            } else {
+                printf("=%s \n\n", ids);
+            }
+        } else if (!strcmp(buff, "clear_board")){
+            passed = 0;
+            memset(board, 0, 19*19*sizeof(float));
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "komi")){
+            float komi = 0;
+            scanf("%f", &komi);
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "play")){
+            char color[256] = {0};
+            char c = 0;
+            int r = 0;
+            scanf("%255s ", color);
+            int count = scanf("%c%d", &c, &r);
+            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
+            if((c == 'p' || c == 'P') && count < 2) {
+                passed = 1;
+                printf("=%s \n\n", ids);
+                free(fgetl(stdin));
+                fflush(stdout);
+                fflush(stderr);
+                continue;
+            }
+            passed = 0;
+            if(c >= 'A' && c <= 'Z') c = c - 'A';
+            if(c >= 'a' && c <= 'z') c = c - 'a';
+            if(c >= 8) --c;
+            r = 19 - r;
+            fprintf(stderr, "move: %d %d\n", r, c);
+            if(count < 2 || r < 0 || r > 18 || c < 0 || c > 18){
+                /* a malformed or off-board vertex must never index outside the 19x19 board */
+                printf("?%s illegal move\n\n", ids);
+            } else {
+                char *swap = two;
+                two = one;
+                one = swap;
+                move_go(board, player, r, c);
+                board_to_string(one, board);
+                printf("=%s \n\n", ids);
+                print_board(board, 1, 0);
+            }
+        } else if (!strcmp(buff, "genmove")){
+            char color[256] = {0};
+            scanf("%255s", color);
+            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
+            int index = generate_move(net, player, board, multi, .1, .7, two, 1);
+            if(passed || index < 0){
+                printf("=%s pass\n\n", ids);
+                passed = 0;
+            } else {
+                int row = index / 19;
+                int col = index % 19;
+                char *swap = two;
+                two = one;
+                one = swap;
+                move_go(board, player, row, col);
+                board_to_string(one, board);
+                row = 19 - row;
+                if (col >= 8) ++col;
+                printf("=%s %c%d\n\n", ids, 'A' + col, row);
+                print_board(board, 1, 0);
+            }
+        } else if (!strcmp(buff, "p")){
+            //print_board(board, 1, 0);
+        } else if (!strcmp(buff, "final_status_list")){
+            char type[256] = {0};
+            scanf("%255s", type);
+            fprintf(stderr, "final_status\n");
+            free(fgetl(stdin));
+            if(type[0] == 'd' || type[0] == 'D'){
+                FILE *f = fopen("game.txt", "w");
+                FILE *p = 0;
+                int i, j, count = 2;
+                if(f){
+                    fprintf(f, "boardsize 19\nclear_board\n");
+                    for(j = 0; j < 19; ++j){
+                        for(i = 0; i < 19; ++i){
+                            if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
+                            if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
+                            if(board[j*19 + i]) ++count;
+                        }
+                    }
+                    fprintf(f, "final_status_list dead\n");
+                    fclose(f);
+#ifdef _WIN32
+                    p = _popen("./gnugo --mode gtp < game.txt", "r");
+#else
+                    p = popen("./gnugo --mode gtp < game.txt", "r");
+#endif
+                }
+                if(!p) printf("?%s cannot run gnugo on game.txt\n\n", ids);
+                for(i = 0; p && i < count; ++i){
+                    free(fgetl(p));
+                    free(fgetl(p));
+                }
+                char *l = 0;
+                while(p && (l = fgetl(p))){
+                    printf("%s\n", l);
+                    free(l);
+                }
+#ifdef _WIN32
+                if(p) _pclose(p);
+#else
+                if(p) pclose(p);
+#endif
+            } else {
+                printf("?%s unknown command\n\n", ids);
+            }
+        } else {
+            free(fgetl(stdin));
+            printf("?%s unknown command\n\n", ids);
+        }
+        fflush(stdout);
+        fflush(stderr);
+    }
+    /* reached on "quit" or end of input */
+    free(board);
+    free(one);
+    free(two);
+    free_network(net);
+}
+
+/* Interactive console loop: print the board with the net's top n_ind suggestions,
+ * read one line from stdin, apply it, then flip the board and switch colour.
+ * Accepted lines:
+ *   <empty> or N    put a stone (value 1) on suggestion N (default 1)
+ *   L N             put a stone (value 1) at column letter L ('A'..'T'), row N
+ *   p               pass
+ *   b L N / w L N   store `color` (b) or `-color` (w) at L N
+ *   c L N           clear L N
+ * Input that names a point outside the board is ignored. Returns at end of input. */
+void test_go(char *cfg, char *weights, int multi)
+{
+    network net = parse_network_cfg(cfg);
+    if(weights){
+        load_weights(&net, weights);
+    }
+    srand(time(0));
+    set_batch_network(&net, 1);
+    float* board = (float*)xcalloc(19 * 19, sizeof(float));
+    float* move = (float*)xcalloc(19 * 19, sizeof(float));
+    int color = 1;
+    while(1){
+        int i;
+        int indexes[n_ind];
+        int row = 0, col = 0;
+
+        /* predict_move() performs the network pass (plus the optional averaging over
+         * rotated/flipped views) that used to be duplicated inline here */
+        predict_move(net, board, move, multi);
+
+        top_k(move, 19 * 19, n_ind, indexes);
+        print_board(board, color, indexes);
+        for (i = 0; i < n_ind; ++i) {
+            int index = indexes[i];
+            row = index / 19;
+            col = index % 19;
+            printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
+        }
+        if(color == 1) printf("X Enter move: ");
+        else printf("O Enter move: ");
+
+        char c = 0;
+        char *line = fgetl(stdin);
+        /* stop cleanly at end of input instead of passing a null line to sscanf() */
+        if(!line) break;
+        int picked = 1;
+        int dnum = sscanf(line, "%d", &picked);
+        int cnum = sscanf(line, "%c", &c);
+        /* an empty line or a number picks one of the suggestions */
+        if (strlen(line) == 0 || dnum) {
+            --picked;
+            /* "0" or a negative number must not index before indexes[0] */
+            if (picked >= 0 && picked < n_ind){
+                int index = indexes[picked];
+                row = index / 19;
+                col = index % 19;
+                board[row*19 + col] = 1;
+            }
+        } else if (cnum){
+            int on_board;
+            if (c <= 'T' && c >= 'A'){
+                int num = sscanf(line, "%c %d", &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                on_board = (row >= 0 && row < 19 && col >= 0 && col < 19);
+                if (num == 2 && on_board) board[row*19 + col] = 1;
+            } else if (c == 'p') {
+                // Pass
+            } else if(c=='b' || c == 'w'){
+                char g;
+                int num = sscanf(line, "%c %c %d", &g, &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                on_board = (row >= 0 && row < 19 && col >= 0 && col < 19);
+                if (num == 3 && on_board) board[row*19 + col] = (g == 'b') ? color : -color;
+            } else if(c == 'c'){
+                char g;
+                int num = sscanf(line, "%c %c %d", &g, &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                on_board = (row >= 0 && row < 19 && col >= 0 && col < 19);
+                if (num == 3 && on_board) board[row*19 + col] = 0;
+            }
+        }
+        free(line);
+        flip_board(board);
+        color = -color;
+    }
+
+    /* reached only at end of input */
+    free(board);
+    free(move);
+    free_network(net);
+}
+
+/* Score the position with an external gnugo process: the stones are written to
+ * game.txt as GTP commands, "./gnugo --mode gtp" is run on it and its final_score
+ * reply is parsed. Returns the margin (negated for a 'W' result), or 0 on any failure. */
+float score_game(float *board)
+{
+    FILE *f = fopen("game.txt", "w");
+    int i, j, count = 3;
+    if(!f){ perror("game.txt"); return 0; }
+    fprintf(f, "komi 6.5\nboardsize 19\nclear_board\n");
+    for(j = 0; j < 19; ++j){
+        for(i = 0; i < 19; ++i){
+            if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
+            if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
+            if(board[j*19 + i]) ++count;
+        }
+    }
+    fprintf(f, "final_score\n");
+    fclose(f);
+#ifdef _WIN32
+    FILE *p = _popen("./gnugo --mode gtp < game.txt", "r");
+#else
+    FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
+#endif
+    if(!p){ perror("gnugo"); return 0; }
+    /* skip two lines of output per command issued before final_score (presumably reply + blank line) */
+    for(i = 0; i < 2*count; ++i) free(fgetl(p));
+    char *l = 0;
+    float score = 0;
+    char player = 0;
+    while((l = fgetl(p))){
+        fprintf(stderr, "%s  \t", l);
+        int n = sscanf(l, "= %c+%f", &player, &score);
+        free(l);
+        if (n == 2) break;
+    }
+    if(player == 'W') score = -score;
+#ifdef _WIN32
+    _pclose(p);
+#else
+    pclose(p);
+#endif
+    return score;
+}
+
+void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
+{ /* Endless self-play: two nets alternate moves; each finished game is scored and every second logged position is printed to stdout. */
+    network net = parse_network_cfg(filename);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    /* the second player is a copy of the first net unless a second cfg (and optionally weights) is given */
+    network net2 = net;
+    if(f2){
+        net2 = parse_network_cfg(f2);
+        if(w2){
+            load_weights(&net2, w2);
+        }
+    }
+    srand(time(0));
+    char boards[300][93]; /* per move: row, col, then the 91-byte packed position before the move */
+    int count = 0; /* moves logged in the current game */
+    set_batch_network(&net, 1);
+    set_batch_network(&net2, 1);
+    float* board = (float*)xcalloc(19 * 19, sizeof(float));
+    char* one = (char*)xcalloc(91, sizeof(char));
+    char* two = (char*)xcalloc(91, sizeof(char));
+    int done = 0;
+    int player = 1;
+    int p1 = 0; /* games won by net */
+    int p2 = 0; /* games won by net2 */
+    int total = 0; /* games finished so far */
+    while(1){
+        if (done || count >= 300){ /* game over, or the boards[] log is full */
+            float score = score_game(board);
+            int i = (score > 0)? 0 : 1; /* first logged move to print: 0 when the score is positive, else 1 */
+            if((score > 0) == (total%2==0)) ++p1; /* net moves first in even-numbered games, so this is a win for net */
+            else ++p2;
+            ++total;
+            fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total);
+            int j;
+            for(; i < count; i += 2){ /* every second logged move, i.e. one side's moves (presumably the winner's) */
+                for(j = 0; j < 93; ++j){
+                    printf("%c", boards[i][j]); /* raw record bytes, not text */
+                }
+                printf("\n");
+            }
+            memset(board, 0, 19*19*sizeof(float)); /* start the next game; NOTE(review): one/two are not reset here */
+            player = 1;
+            done = 0;
+            count = 0;
+            fflush(stdout);
+            fflush(stderr);
+        }
+        //print_board(board, 1, 0);
+        //sleep(1);
+        network use = ((total%2==0) == (player==1)) ? net : net2; /* net plays player 1 in even games and player -1 in odd ones */
+        int index = generate_move(use, player, board, multi, .1, .7, two, 0);
+        if(index < 0){ /* generate_move() returned no move: end the game */
+            done = 1;
+            continue;
+        }
+        int row = index / 19;
+        int col = index % 19;
+        /* rotate the ko history: `two` takes the latest position, `one` is overwritten after the move below */
+        char *swap = two;
+        two = one;
+        one = swap;
+        /* log the move and the position with colours flipped for player -1, so the mover's stones are the positive ones */
+        if(player < 0) flip_board(board);
+        boards[count][0] = row;
+        boards[count][1] = col;
+        board_to_string(boards[count] + 2, board);
+        if(player < 0) flip_board(board);
+        ++count;
+
+        move_go(board, player, row, col);
+        board_to_string(one, board);
+
+        player = -player;
+    }
+    free(board); /* NOTE(review): unreachable - the while(1) above has no break */
+    free(one);
+    free(two);
+}
+
+/* Entry point for the go subcommands: argv[2] selects the mode, argv[3] is the cfg, argv[4..6] are optional weights, second cfg and second weights. */
+void run_go(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid/self/engine] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *c2 = (argc > 5) ? argv[5] : 0;
+    char *w2 = (argc > 6) ? argv[6] : 0;
+    int multi = find_arg(argc, argv, "-multi");
+    if(0==strcmp(argv[2], "train")) train_go(cfg, weights);
+    else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi);
+    else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi);
+    else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi);
+    else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, multi);
+    else fprintf(stderr, "Not an option: %s\n", argv[2]); /* unknown modes used to be ignored silently */
+}
diff --git a/darknet-master/src/gru_layer.c b/darknet-master/src/gru_layer.c
new file mode 100644
index 0000000..de301df
--- /dev/null
+++ b/darknet-master/src/gru_layer.c
@@ -0,0 +1,398 @@
+#include "gru_layer.h"
+#include "connected_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void increment_layer(layer *l, int steps)
+{
+    // Moves the layer's per-step buffer pointers by `steps` time steps (callers pass negative values to step back).
+    const int shift = l->outputs * l->batch * steps;
+    l->x_norm += shift;
+    l->x += shift;
+    l->delta += shift;
+    l->output += shift;
+#ifdef GPU
+    l->x_norm_gpu += shift;
+    l->x_gpu += shift;
+    l->delta_gpu += shift;
+    l->output_gpu += shift;
+#endif
+}
+
+// Allocates one LINEAR connected sub-layer for the GRU. The "\t\t" indent is
+// printed to stderr before the sub-layer is constructed.
+static layer *make_gru_sublayer(int batch, int steps, int inputs, int outputs, int batch_normalize)
+{
+    layer *sub = (layer*)xcalloc(1, sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *sub = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize);
+    sub->batch = batch;
+    return sub;
+}
+
+// Builds a GRU layer made of six connected sub-layers: an input projection and a
+// state projection for each of the z, r and h paths.
+//   batch            total batch; divided by steps, so l.batch is the per-step batch
+//   inputs / outputs floats per sample entering / leaving the layer at one step
+//   steps            number of time steps held in the output and delta buffers
+//   batch_normalize  forwarded to every sub-layer and stored in l.batch_normalize
+// Returns the initialised layer by value.
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+{
+    fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
+    batch = batch / steps;
+    const int step_size = outputs * batch;                // floats for one time step
+    const int sequence_size = outputs * batch * steps;    // floats for all time steps
+
+    layer l = { (LAYER_TYPE)0 };
+    l.type = GRU;
+    l.batch = batch;
+    l.steps = steps;
+    l.inputs = inputs;
+    l.outputs = outputs;
+    l.batch_normalize = batch_normalize;
+
+    // z path
+    l.input_z_layer = make_gru_sublayer(batch, steps, inputs, outputs, batch_normalize);
+    l.state_z_layer = make_gru_sublayer(batch, steps, outputs, outputs, batch_normalize);
+
+    // r path
+    l.input_r_layer = make_gru_sublayer(batch, steps, inputs, outputs, batch_normalize);
+    l.state_r_layer = make_gru_sublayer(batch, steps, outputs, outputs, batch_normalize);
+
+    // h path
+    l.input_h_layer = make_gru_sublayer(batch, steps, inputs, outputs, batch_normalize);
+    l.state_h_layer = make_gru_sublayer(batch, steps, outputs, outputs, batch_normalize);
+
+    // Host buffers: full-sequence output/delta, single-step state and gate scratch.
+    l.output = (float*)xcalloc(sequence_size, sizeof(float));
+    l.delta = (float*)xcalloc(sequence_size, sizeof(float));
+    l.state = (float*)xcalloc(step_size, sizeof(float));
+    l.prev_state = (float*)xcalloc(step_size, sizeof(float));
+    l.forgot_state = (float*)xcalloc(step_size, sizeof(float));
+    l.forgot_delta = (float*)xcalloc(step_size, sizeof(float));
+
+    l.r_cpu = (float*)xcalloc(step_size, sizeof(float));
+    l.z_cpu = (float*)xcalloc(step_size, sizeof(float));
+    l.h_cpu = (float*)xcalloc(step_size, sizeof(float));
+
+    // CPU entry points.
+    l.forward = forward_gru_layer;
+    l.backward = backward_gru_layer;
+    l.update = update_gru_layer;
+
+#ifdef GPU
+    l.forward_gpu = forward_gru_layer_gpu;
+    l.backward_gpu = backward_gru_layer_gpu;
+    l.update_gpu = update_gru_layer_gpu;
+
+    // Device buffers, sized like their host counterparts.
+    l.forgot_state_gpu = cuda_make_array(l.output, step_size);
+    l.forgot_delta_gpu = cuda_make_array(l.output, step_size);
+    l.prev_state_gpu = cuda_make_array(l.output, step_size);
+    l.state_gpu = cuda_make_array(l.output, step_size);
+    l.output_gpu = cuda_make_array(l.output, sequence_size);
+    l.delta_gpu = cuda_make_array(l.delta, sequence_size);
+    l.r_gpu = cuda_make_array(l.output_gpu, step_size);
+    l.z_gpu = cuda_make_array(l.output_gpu, step_size);
+    l.h_gpu = cuda_make_array(l.output_gpu, step_size);
+#endif
+
+    return l;
+}
+
+void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{   // CPU weight update: applies update_connected_layer to each of the six GRU sub-layers (same set as update_gru_layer_gpu).
+    layer *subs[] = { l.input_r_layer, l.input_z_layer, l.input_h_layer, l.state_r_layer, l.state_z_layer, l.state_h_layer };
+    int i;  // the old code dereferenced input_layer/self_layer/output_layer, which make_gru_layer leaves unset (NULL)
+    for (i = 0; i < 6; ++i) update_connected_layer(*subs[i], batch, learning_rate, momentum, decay);
+}
+
+void forward_gru_layer(layer l, network_state state)
+{   // CPU forward pass: runs l.steps GRU time steps; each step's result is written to l.output and copied into l.state.
+    network_state s = {0};
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_z_layer = *(l.input_z_layer);
+    layer input_r_layer = *(l.input_r_layer);
+    layer input_h_layer = *(l.input_h_layer);
+    // Sub-layers are copied by value, so the pointer advances at the end of each step only affect these local copies.
+    layer state_z_layer = *(l.state_z_layer);
+    layer state_r_layer = *(l.state_r_layer);
+    layer state_h_layer = *(l.state_h_layer);
+    // Zero the delta buffers of all six sub-layers for the whole sequence.
+    fill_cpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta, 1);
+    if(state.train) {
+        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
+        copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
+    }
+    // When training, l.delta is cleared and the incoming state is saved in l.prev_state (above).
+    for (i = 0; i < l.steps; ++i) {
+        s.input = l.state;
+        forward_connected_layer(state_z_layer, s);
+        forward_connected_layer(state_r_layer, s);
+        // The state projections above read l.state; the input projections below read the current step's input.
+        s.input = state.input;
+        forward_connected_layer(input_z_layer, s);
+        forward_connected_layer(input_r_layer, s);
+        forward_connected_layer(input_h_layer, s);
+
+        // z = LOGISTIC(input_z.output + state_z.output)
+        copy_cpu(l.outputs*l.batch, input_z_layer.output, 1, l.z_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, state_z_layer.output, 1, l.z_cpu, 1);
+        // r = LOGISTIC(input_r.output + state_r.output)
+        copy_cpu(l.outputs*l.batch, input_r_layer.output, 1, l.r_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, state_r_layer.output, 1, l.r_cpu, 1);
+
+        activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
+        // forgot_state = state multiplied element-wise by r
+        copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
+        mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
+        // The h state projection is fed the r-gated state rather than the raw state.
+        s.input = l.forgot_state;
+        forward_connected_layer(state_h_layer, s);
+        // h = activation(input_h.output + state_h.output)
+        copy_cpu(l.outputs*l.batch, input_h_layer.output, 1, l.h_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, state_h_layer.output, 1, l.h_cpu, 1);
+        // The h activation is TANH when USET is defined at compile time, LOGISTIC otherwise.
+        #ifdef USET
+        activate_array(l.h_cpu, l.outputs*l.batch, TANH);
+        #else
+        activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
+        #endif
+        // Blend state and h using z into l.output; presumably z*state + (1-z)*h -- confirm against weighted_sum_cpu.
+        weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
+        // The step output becomes the state for the next step.
+        copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
+        // Advance the input, output and sub-layer buffers to the next time step.
+        state.input += l.inputs*l.batch;
+        l.output += l.outputs*l.batch;
+        increment_layer(&input_z_layer, 1);
+        increment_layer(&input_r_layer, 1);
+        increment_layer(&input_h_layer, 1);
+
+        increment_layer(&state_z_layer, 1);
+        increment_layer(&state_r_layer, 1);
+        increment_layer(&state_h_layer, 1);
+    }
+}
+
+void backward_gru_layer(layer l, network_state state)
+{   // No CPU backward pass is implemented: the body is empty, so no gradients are computed here.
+}
+
+#ifdef GPU
+
+void pull_gru_layer(layer l)
+{   // Empty stub: nothing is copied for the GRU layer in this implementation.
+}
+
+void push_gru_layer(layer l)
+{   // Empty stub: nothing is copied for the GRU layer in this implementation.
+}
+
+void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
+{
+    // Applies the connected-layer GPU update to every GRU sub-layer:
+    // input projections in the order r, z, h, then the state projections.
+    layer *subs[] = { l.input_r_layer, l.input_z_layer, l.input_h_layer,
+                      l.state_r_layer, l.state_z_layer, l.state_h_layer };
+    int k;
+    for (k = 0; k < 6; ++k) update_connected_layer_gpu(*subs[k], batch, learning_rate, momentum, decay, loss_scale);
+}
+
+void forward_gru_layer_gpu(layer l, network_state state)
+{   // GPU forward pass: same sequence of operations as forward_gru_layer, using the *_gpu buffers and *_ongpu/_gpu routines.
+    network_state s = {0};
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_z_layer = *(l.input_z_layer);
+    layer input_r_layer = *(l.input_r_layer);
+    layer input_h_layer = *(l.input_h_layer);
+    // Sub-layers are copied by value, so the pointer advances at the end of each step only affect these local copies.
+    layer state_z_layer = *(l.state_z_layer);
+    layer state_r_layer = *(l.state_r_layer);
+    layer state_h_layer = *(l.state_h_layer);
+    // Zero the delta buffers of all six sub-layers for the whole sequence.
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta_gpu, 1);
+
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta_gpu, 1);
+    if(state.train) {
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
+    }
+    // When training, l.delta_gpu is cleared and the incoming state is saved in l.prev_state_gpu (above).
+    for (i = 0; i < l.steps; ++i) {
+        s.input = l.state_gpu;
+        forward_connected_layer_gpu(state_z_layer, s);
+        forward_connected_layer_gpu(state_r_layer, s);
+        // The state projections above read l.state_gpu; the input projections below read the current step's input.
+        s.input = state.input;
+        forward_connected_layer_gpu(input_z_layer, s);
+        forward_connected_layer_gpu(input_r_layer, s);
+        forward_connected_layer_gpu(input_h_layer, s);
+
+        // z = LOGISTIC(input_z.output + state_z.output)
+        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
+        // r = LOGISTIC(input_r.output + state_r.output)
+        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
+
+        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+        // forgot_state = state multiplied element-wise by r
+        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        // The h state projection is fed the r-gated state rather than the raw state.
+        s.input = l.forgot_state_gpu;
+        forward_connected_layer_gpu(state_h_layer, s);
+        // h = activation(input_h.output + state_h.output)
+        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
+        // The h activation is TANH when USET is defined at compile time, LOGISTIC otherwise.
+        #ifdef USET
+        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
+        #else
+        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        #endif
+        // Blend state and h using z into l.output_gpu; presumably z*state + (1-z)*h -- confirm against weighted_sum_gpu.
+        weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
+        // The step output becomes the state for the next step.
+        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
+        // Advance the input, output and sub-layer buffers to the next time step.
+        state.input += l.inputs*l.batch;
+        l.output_gpu += l.outputs*l.batch;
+        increment_layer(&input_z_layer, 1);
+        increment_layer(&input_r_layer, 1);
+        increment_layer(&input_h_layer, 1);
+
+        increment_layer(&state_z_layer, 1);
+        increment_layer(&state_r_layer, 1);
+        increment_layer(&state_h_layer, 1);
+    }
+}
+
+void backward_gru_layer_gpu(layer l, network_state state)
+{   // GPU backward pass: walks the time steps from last to first, rebuilding the gates and back-propagating through all six sub-layers.
+    network_state s = {0};
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_z_layer = *(l.input_z_layer);
+    layer input_r_layer = *(l.input_r_layer);
+    layer input_h_layer = *(l.input_h_layer);
+    // Sub-layers are copied by value, so the pointer moves below only affect these local copies.
+    layer state_z_layer = *(l.state_z_layer);
+    layer state_r_layer = *(l.state_r_layer);
+    layer state_h_layer = *(l.state_h_layer);
+    // Position every sub-layer copy on the last time step.
+    increment_layer(&input_z_layer, l.steps - 1);
+    increment_layer(&input_r_layer, l.steps - 1);
+    increment_layer(&input_h_layer, l.steps - 1);
+
+    increment_layer(&state_z_layer, l.steps - 1);
+    increment_layer(&state_r_layer, l.steps - 1);
+    increment_layer(&state_h_layer, l.steps - 1);
+    // Likewise move the input, input-delta (when present), output and delta pointers to the last step.
+    state.input += l.inputs*l.batch*(l.steps-1);
+    if(state.delta) state.delta += l.inputs*l.batch*(l.steps-1);
+    l.output_gpu += l.outputs*l.batch*(l.steps-1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps-1);
+    for (i = l.steps-1; i >= 0; --i) {
+        if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
+        float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
+        // NOTE(review): at i == 0 nothing is copied, so prev_state_gpu still holds the step-0 output written at i == 1 when steps > 1 -- confirm intended.
+        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
+        // z and r are rebuilt from this step's stored sub-layer outputs, as in the forward pass.
+        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
+
+        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+        // h is rebuilt the same way.
+        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
+
+        #ifdef USET
+        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
+        #else
+        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        #endif
+        // Presumably distributes l.delta_gpu onto the previous-step delta, the h delta and the z delta -- confirm against weighted_delta_gpu.
+        weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, input_h_layer.delta_gpu, input_z_layer.delta_gpu, l.outputs*l.batch, l.delta_gpu);
+        // Back through the h activation (same USET choice as the forward pass).
+        #ifdef USET
+        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, input_h_layer.delta_gpu);
+        #else
+        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, input_h_layer.delta_gpu);
+        #endif
+        // The input and state h projections receive the same delta.
+        copy_ongpu(l.outputs*l.batch, input_h_layer.delta_gpu, 1, state_h_layer.delta_gpu, 1);
+        // Rebuild forgot_state = prev_state * r and clear its delta before back-propagating state_h.
+        copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
+
+        s.input = l.forgot_state_gpu;
+        s.delta = l.forgot_delta_gpu;
+        // Presumably mult_add_into_gpu(n, a, b, c) accumulates a*b into c -- confirm; used here to route forgot_delta to prev_delta and to the r delta.
+        backward_connected_layer_gpu(state_h_layer, s);
+        if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
+        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, input_r_layer.delta_gpu);
+        // Back through the r activation; the input and state r projections share the delta.
+        gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, input_r_layer.delta_gpu);
+        copy_ongpu(l.outputs*l.batch, input_r_layer.delta_gpu, 1, state_r_layer.delta_gpu, 1);
+        // Back through the z activation; the input and state z projections share the delta.
+        gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu);
+        copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1);
+        // State projections back-propagate against the previous state and its delta (null at step 0).
+        s.input = l.prev_state_gpu;
+        s.delta = prev_delta_gpu;
+
+        backward_connected_layer_gpu(state_r_layer, s);
+        backward_connected_layer_gpu(state_z_layer, s);
+        // Input projections back-propagate against this step's network input and input delta.
+        s.input = state.input;
+        s.delta = state.delta;
+
+        backward_connected_layer_gpu(input_h_layer, s);
+        backward_connected_layer_gpu(input_r_layer, s);
+        backward_connected_layer_gpu(input_z_layer, s);
+
+        // Step every pointer back by one time step.
+        state.input -= l.inputs*l.batch;
+        if(state.delta) state.delta -= l.inputs*l.batch;
+        l.output_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+        increment_layer(&input_z_layer, -1);
+        increment_layer(&input_r_layer, -1);
+        increment_layer(&input_h_layer, -1);
+
+        increment_layer(&state_z_layer, -1);
+        increment_layer(&state_r_layer, -1);
+        increment_layer(&state_h_layer, -1);
+    }
+}
+#endif
diff --git a/darknet-master/src/gru_layer.h b/darknet-master/src/gru_layer.h
new file mode 100644
index 0000000..c13e46e
--- /dev/null
+++ b/darknet-master/src/gru_layer.h
@@ -0,0 +1,30 @@
+
+#ifndef GRU_LAYER_H
+#define GRU_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+
+void forward_gru_layer(layer l, network_state state);
+void backward_gru_layer(layer l, network_state state);
+void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+
+#ifdef GPU
+void forward_gru_layer_gpu(layer l, network_state state);
+void backward_gru_layer_gpu(layer l, network_state state);
+void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+void push_gru_layer(layer l);
+void pull_gru_layer(layer l);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/http_stream.cpp b/darknet-master/src/http_stream.cpp
new file mode 100644
index 0000000..cac00a0
--- /dev/null
+++ b/darknet-master/src/http_stream.cpp
@@ -0,0 +1,939 @@
+#define _XOPEN_SOURCE
+#include "image.h"
+#include "http_stream.h"
+
+//
+// a single-threaded, multi client(using select), debug webserver - streaming out mjpg.
+//  on win, _WIN32 has to be defined, must link against ws2_32.lib (socks on linux are for free)
+//
+
+#include <cstdio>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <atomic>
+#include <ctime>
+using std::cerr;
+using std::endl;
+
+//
+// socket related abstractions:
+//
+#ifdef _WIN32
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "ws2_32.lib")
+#endif
+#define WIN32_LEAN_AND_MEAN
+#define _WINSOCK_DEPRECATED_NO_WARNINGS
+#include <windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#include "gettimeofday.h"
+#define PORT        unsigned long
+#define ADDRPOINTER   int*
+struct _INIT_W32DATA   // Windows only: constructing the global `_init_once` object below starts Winsock once per process
+{
+    WSADATA w;   // filled in by WSAStartup
+    _INIT_W32DATA() { WSAStartup(MAKEWORD(2, 1), &w); }   // the return code of WSAStartup is not checked
+} _init_once;
+
+// Graceful closes will first close their output channels and then wait for the peer
+// on the other side of the connection to close its output channels. When both sides are done telling
+// each other they won't be sending any more data (i.e., closing output channels),
+// the connection can be closed fully, with no risk of reset.
+static int close_socket(SOCKET s) {
+    // Graceful close: stop sending, read once from the peer, stop receiving, then release the handle.
+    int close_output = ::shutdown(s, 1); // 0 close input, 1 close output, 2 close both
+    char buf[1024]; // scratch only; on the stack, so no unchecked calloc() result is handed to recv()
+    ::recv(s, buf, (int)sizeof(buf), 0);
+    int close_input = ::shutdown(s, 0);
+    int result = ::closesocket(s);
+    cerr << "Close socket: out = " << close_output << ", in = " << close_input << " \n";
+    return result; // result of closesocket()
+}
+#else   // _WIN32 - else: nix
+#include "darkunistd.h"
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <signal.h>
+#define PORT        unsigned short
+#define SOCKET    int
+#define HOSTENT  struct hostent
+#define SOCKADDR    struct sockaddr
+#define SOCKADDR_IN  struct sockaddr_in
+#define ADDRPOINTER  unsigned int*
+#ifndef INVALID_SOCKET
+#define INVALID_SOCKET -1
+#endif
+#ifndef SOCKET_ERROR
+#define SOCKET_ERROR   -1
+#endif
+struct _IGNORE_PIPE_SIGNAL   // non-Windows only: constructing the global `_init_once` object below makes the process ignore SIGPIPE
+{
+    struct sigaction new_actn, old_actn;   // old_actn receives the previous SIGPIPE disposition
+    _IGNORE_PIPE_SIGNAL() {
+        new_actn.sa_handler = SIG_IGN;  // ignore the broken pipe signal
+        sigemptyset(&new_actn.sa_mask);
+        new_actn.sa_flags = 0;
+        sigaction(SIGPIPE, &new_actn, &old_actn);
+        // sigaction (SIGPIPE, &old_actn, NULL); // - to restore the previous signal handling
+    }
+} _init_once;
+
+static int close_socket(SOCKET s) {
+    // Graceful close: stop sending, read once from the peer, stop receiving, then release the descriptor.
+    int close_output = ::shutdown(s, 1); // 0 close input, 1 close output, 2 close both
+    char buf[1024]; // scratch only; on the stack, so no unchecked calloc() result is handed to recv()
+    ::recv(s, buf, sizeof(buf), 0);
+    int close_input = ::shutdown(s, 0);
+    int result = close(s);
+    std::cerr << "Close socket: out = " << close_output << ", in = " << close_input << " \n";
+    return result; // result of close()
+}
+#endif // _WIN32
+
+
+class JSON_sender
+{
+    // Minimal HTTP server that streams one JSON array to every connected client:
+    // each write() accepts pending connections and appends one array element.
+    SOCKET sock;            // listening socket; INVALID_SOCKET while closed
+    SOCKET maxfd;           // highest descriptor seen, used as the select() bound
+    fd_set master;          // listening socket plus all connected clients
+    int timeout;            // send/receive timeout in microseconds set on each accepted client
+    int close_all_sockets;  // set by close_all(): the next write() closes what it services
+
+    // Sends len bytes of s (strlen(s) when len < 1) and returns the result of send().
+    int _write(SOCKET sock, char const*const s, int len)
+    {
+        if (len < 1) { len = static_cast<int>(strlen(s)); }
+        return static_cast<int>(::send(sock, s, len, 0));
+    }
+
+public:
+
+    // Opens the listening socket when port != 0; _timeout is in microseconds.
+    JSON_sender(int port = 0, int _timeout = 400000)
+        : sock(INVALID_SOCKET)
+        , maxfd(0)
+        , timeout(_timeout)
+        , close_all_sockets(0)
+    {
+        FD_ZERO(&master);
+        if (port)
+            open(port);
+    }
+
+    ~JSON_sender()
+    {
+        close_all();
+        release();
+    }
+
+    // Closes the listening socket if it is still open. Always returns false so
+    // that open() can report a failure with `return release();`.
+    bool release()
+    {
+        if (sock != INVALID_SOCKET)
+            close_socket(sock);   // was shutdown() only, which leaked the descriptor
+        sock = (INVALID_SOCKET);
+        return false;
+    }
+
+    // Switches to closing mode and sends the closing bracket of the JSON array.
+    void close_all()
+    {
+        close_all_sockets = 1;
+        write("\n]");   // close JSON array
+    }
+
+    // Binds a non-blocking TCP listening socket to `port` on all interfaces.
+    // Returns true on success, false (after release()) when bind or listen fails.
+    bool open(int port)
+    {
+        sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+        SOCKADDR_IN address;
+        address.sin_addr.s_addr = INADDR_ANY;
+        address.sin_family = AF_INET;
+        address.sin_port = htons(port);    // ::htons(port);
+        int reuse = 1;
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof(reuse)) < 0)
+            cerr << "setsockopt(SO_REUSEADDR) failed" << endl;
+
+        // Non-blocking listening socket: ioctlsocket()/FIONBIO on Windows, fcntl()/O_NONBLOCK elsewhere.
+#ifdef _WIN32
+        unsigned long i_mode = 1;
+        int result = ioctlsocket(sock, FIONBIO, &i_mode);
+        if (result != NO_ERROR) {
+            std::cerr << "ioctlsocket(FIONBIO) failed with error: " << result << std::endl;
+        }
+#else // _WIN32
+        int flags = fcntl(sock, F_GETFL, 0);
+        fcntl(sock, F_SETFL, flags | O_NONBLOCK);
+#endif // _WIN32
+
+#ifdef SO_REUSEPORT
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0)
+            cerr << "setsockopt(SO_REUSEPORT) failed" << endl;
+#endif
+        if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
+        {
+            cerr << "error JSON_sender: couldn't bind sock " << sock << " to port " << port << "!" << endl;
+            return release();
+        }
+        if (::listen(sock, 10) == SOCKET_ERROR)
+        {
+            cerr << "error JSON_sender: couldn't listen on sock " << sock << " on port " << port << " !" << endl;
+            return release();
+        }
+        FD_ZERO(&master);
+        FD_SET(sock, &master);
+        maxfd = sock;
+        return true;
+    }
+
+    bool isOpened()   // true while the listening socket is open
+    {
+        return sock != INVALID_SOCKET;
+    }
+
+    // Services the sockets select() reports readable: a pending connection is accepted and sent the HTTP header,
+    // "[\n" and outputbuf; each readable client is sent ", \n" + outputbuf and is closed when the send falls short.
+    // In closing mode the serviced clients and the listening socket are closed. Returns false only if accept() fails.
+    bool write(char const* outputbuf)
+    {
+        fd_set rread = master;
+        struct timeval select_timeout = { 0, 0 };
+        struct timeval socket_timeout = { 0, timeout }; // NOTE(review): Winsock documents SO_RCVTIMEO/SO_SNDTIMEO as a DWORD in ms, not a timeval -- confirm on Windows
+        if (::select(maxfd + 1, &rread, NULL, NULL, &select_timeout) <= 0)
+            return true; // nothing broken, there's just noone listening
+
+        int outlen = static_cast<int>(strlen(outputbuf));
+#ifdef _WIN32
+        for (unsigned i = 0; i<rread.fd_count; i++)
+        {
+            int addrlen = sizeof(SOCKADDR);
+            SOCKET s = rread.fd_array[i];    // fd_set on win is an array, while ...
+#else
+        for (int s = 0; s <= maxfd; s++)
+        {
+            socklen_t addrlen = sizeof(SOCKADDR);
+            if (!FD_ISSET(s, &rread))      // ... on linux it's a bitmask ;)
+                continue;
+#endif
+            if (s == sock) // request on master socket, accept and send main header.
+            {
+                SOCKADDR_IN address = { 0 };
+                SOCKET      client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
+                if (client == INVALID_SOCKET)
+                {
+                    cerr << "error JSON_sender: couldn't accept connection on sock " << sock << " !" << endl;
+                    return false;
+                }
+                if (setsockopt(client, SOL_SOCKET, SO_RCVTIMEO, (char *)&socket_timeout, sizeof(socket_timeout)) < 0) {
+                    cerr << "error JSON_sender: SO_RCVTIMEO setsockopt failed\n";
+                }
+                if (setsockopt(client, SOL_SOCKET, SO_SNDTIMEO, (char *)&socket_timeout, sizeof(socket_timeout)) < 0) {
+                    cerr << "error JSON_sender: SO_SNDTIMEO setsockopt failed\n";
+                }
+                maxfd = (maxfd>client ? maxfd : client);
+                FD_SET(client, &master);
+                _write(client, "HTTP/1.0 200 OK\r\n", 0);
+                _write(client,
+                    "Server: Mozarella/2.2\r\n"
+                    "Accept-Range: bytes\r\n"
+                    "Connection: close\r\n"
+                    "Max-Age: 0\r\n"
+                    "Expires: 0\r\n"
+                    "Cache-Control: no-cache, private\r\n"
+                    "Pragma: no-cache\r\n"
+                    "Content-Type: application/json\r\n"
+                    "\r\n", 0);
+                _write(client, "[\n", 0);   // open JSON array
+                _write(client, outputbuf, outlen);
+                cerr << "JSON_sender: new client " << client << endl;
+            }
+            else // existing client: append the next array element
+            {
+                if (!close_all_sockets) _write(s, ", \n", 0);
+                int n = _write(s, outputbuf, outlen);
+                if (close_all_sockets) {   // closing mode: close exactly once, even after a short send
+                    int result = close_socket(s);
+                    cerr << "JSON_sender: close clinet: " << result << " \n";
+                    FD_CLR(s, &master);
+                }
+                else if (n < (int)outlen) {   // short or failed send: drop this client
+                    cerr << "JSON_sender: kill client " << s << endl;
+                    close_socket(s);
+                    FD_CLR(s, &master);
+                }
+            }
+        }
+        if (close_all_sockets) {
+            int result = close_socket(sock);
+            cerr << "JSON_sender: close acceptor: " << result << " \n\n";
+            sock = INVALID_SOCKET;   // so release() does not close the descriptor a second time
+        }
+        return true;
+    }
+};
+// ----------------------------------------
+
+static std::unique_ptr<JSON_sender> js_ptr;
+static std::mutex mtx;
+
+void delete_json_sender()
+{   // Destroys the shared JSON_sender (if any) while holding the mutex; its destructor closes the sockets.
+    std::lock_guard<std::mutex> lock(mtx);
+    js_ptr.reset();   // was release(), which only gave up ownership: the object leaked and its destructor never ran
+}
+
+void send_json_custom(char const* send_buf, int port, int timeout)
+{
+    // Creates the shared JSON_sender on first use (port and timeout are only read
+    // at that point) and writes send_buf through it, all under the mutex.
+    try {
+        std::lock_guard<std::mutex> guard(mtx);
+        if (js_ptr == nullptr) js_ptr.reset(new JSON_sender(port, timeout));
+        js_ptr->write(send_buf);
+    } catch (...) {
+        cerr << " Error in send_json_custom() function \n";
+    }
+}
+
+void send_json(detection *dets, int nboxes, int classes, char **names, long long int frame_id, int port, int timeout)
+{
+    // Serialises the detections with detection_to_json(), streams the text via
+    // send_json_custom(), then frees the buffer. Any exception is reported on cerr.
+
+    try {
+        char *json_text = detection_to_json(dets, nboxes, classes, names, frame_id, NULL);
+        send_json_custom(json_text, port, timeout);
+        std::cout << " JSON-stream sent. \n";
+        free(json_text);
+    } catch (...) {
+        cerr << " Error in send_json() function \n";
+    }
+}
+// ----------------------------------------
+
+
+#ifdef OPENCV
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/highgui/highgui_c.h>
+#include <opencv2/imgproc/imgproc_c.h>
+#ifndef CV_VERSION_EPOCH
+#include <opencv2/videoio/videoio.hpp>
+#endif
+using namespace cv;
+
+
+
+class MJPG_sender
+{
+    SOCKET sock;
+    SOCKET maxfd;
+    fd_set master;
+    int timeout; // master sock timeout, shutdown after timeout usec.
+    int quality; // jpeg compression [1..100]
+    int close_all_sockets;
+
+    int _write(int sock, char const*const s, int len)
+    {
+        if (len < 1) { len = strlen(s); }
+        return ::send(sock, s, len, 0);
+    }
+
+public:
+
+    MJPG_sender(int port = 0, int _timeout = 400000, int _quality = 30)
+        : sock(INVALID_SOCKET)
+        , timeout(_timeout)
+        , quality(_quality)
+    {
+        close_all_sockets = 0;
+        FD_ZERO(&master);
+        if (port)
+            open(port);
+    }
+
+    ~MJPG_sender()
+    {
+        close_all();
+        release();
+    }
+
+    bool release()
+    {
+        if (sock != INVALID_SOCKET)
+            ::shutdown(sock, 2);
+        sock = (INVALID_SOCKET);
+        return false;
+    }
+
+    void close_all()
+    {
+        close_all_sockets = 1;
+        cv::Mat tmp(cv::Size(10, 10), CV_8UC3);
+        write(tmp);
+    }
+
+    bool open(int port)
+    {
+        sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+        SOCKADDR_IN address;
+        address.sin_addr.s_addr = INADDR_ANY;
+        address.sin_family = AF_INET;
+        address.sin_port = htons(port);    // ::htons(port);
+        int reuse = 1;
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof(reuse)) < 0)
+            cerr << "setsockopt(SO_REUSEADDR) failed" << endl;
+
+        // Non-blocking sockets
+        // Windows: ioctlsocket() and FIONBIO
+        // Linux: fcntl() and O_NONBLOCK
+#ifdef WIN32
+        unsigned long i_mode = 1;
+        int result = ioctlsocket(sock, FIONBIO, &i_mode);
+        if (result != NO_ERROR) {
+            std::cerr << "ioctlsocket(FIONBIO) failed with error: " << result << std::endl;
+        }
+#else // WIN32
+        int flags = fcntl(sock, F_GETFL, 0);
+        fcntl(sock, F_SETFL, flags | O_NONBLOCK);
+#endif // WIN32
+
+#ifdef SO_REUSEPORT
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0)
+            cerr << "setsockopt(SO_REUSEPORT) failed" << endl;
+#endif
+        if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
+        {
+            cerr << "error MJPG_sender: couldn't bind sock " << sock << " to port " << port << "!" << endl;
+            return release();
+        }
+        if (::listen(sock, 10) == SOCKET_ERROR)
+        {
+            cerr << "error MJPG_sender: couldn't listen on sock " << sock << " on port " << port << " !" << endl;
+            return release();
+        }
+        FD_ZERO(&master);
+        FD_SET(sock, &master);
+        maxfd = sock;
+        return true;
+    }
+
+    bool isOpened()
+    {
+        return sock != INVALID_SOCKET;
+    }
+
+    // Services every socket that select() reports as readable: accepts a new
+    // client when the listening socket is ready, and sends `frame` as one JPEG
+    // part of the multipart stream to each already-connected client that is
+    // ready.
+    //
+    // frame: image to JPEG-encode (using the `quality` member) and send.
+    // Returns false only when accept() on the listening socket fails; returns
+    // true in every other case, including when select() finds nothing ready or
+    // reports an error.
+    bool write(const Mat & frame)
+    {
+        fd_set rread = master;
+        struct timeval select_timeout = { 0, 0 };  // zero timeout: select() only polls
+        // NOTE(review): `timeout` lands in the second timeval field - confirm the intended unit.
+        struct timeval socket_timeout = { 0, timeout };
+        if (::select(maxfd + 1, &rread, NULL, NULL, &select_timeout) <= 0)
+            return true; // nothing broken, there's just no one listening
+
+        // Encode the frame once; the same buffer is sent to every ready client.
+        std::vector<uchar> outbuf;
+        std::vector<int> params;
+        params.push_back(IMWRITE_JPEG_QUALITY);
+        params.push_back(quality);
+        cv::imencode(".jpg", frame, outbuf, params);  // return value is not checked
+        // https://docs.opencv.org/3.4/d4/da8/group__imgcodecs.html#ga292d81be8d76901bff7988d18d2b42ac
+        //std::cerr << "cv::imencode call disabled!" << std::endl;
+        int outlen = static_cast<int>(outbuf.size());
+
+        // The loop header differs per platform; both variants visit exactly the
+        // sockets that select() left set in `rread`.
+#ifdef _WIN32
+        for (unsigned i = 0; i<rread.fd_count; i++)
+        {
+            int addrlen = sizeof(SOCKADDR);
+            SOCKET s = rread.fd_array[i];    // fd_set on win is an array, while ...
+#else
+        for (int s = 0; s <= maxfd; s++)
+        {
+            socklen_t addrlen = sizeof(SOCKADDR);
+            if (!FD_ISSET(s, &rread))      // ... on linux it's a bitmask ;)
+                continue;
+#endif
+            if (s == sock) // request on master socket, accept and send main header.
+            {
+                SOCKADDR_IN address = { 0 };
+                SOCKET      client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
+                if (client == SOCKET_ERROR)
+                {
+                    cerr << "error MJPG_sender: couldn't accept connection on sock " << sock << " !" << endl;
+                    return false;
+                }
+                // Timeout failures are only logged; the client is still registered.
+                if (setsockopt(client, SOL_SOCKET, SO_RCVTIMEO, (char *)&socket_timeout, sizeof(socket_timeout)) < 0) {
+                    cerr << "error MJPG_sender: SO_RCVTIMEO setsockopt failed\n";
+                }
+                if (setsockopt(client, SOL_SOCKET, SO_SNDTIMEO, (char *)&socket_timeout, sizeof(socket_timeout)) < 0) {
+                    cerr << "error MJPG_sender: SO_SNDTIMEO setsockopt failed\n";
+                }
+                // Track the client so later select() calls include it.
+                maxfd = (maxfd>client ? maxfd : client);
+                FD_SET(client, &master);
+                // One-time HTTP response header announcing the multipart stream.
+                // NOTE(review): a length of 0 presumably tells _write to use strlen - confirm in _write.
+                _write(client, "HTTP/1.0 200 OK\r\n", 0);
+                _write(client,
+                    "Server: Mozarella/2.2\r\n"
+                    "Accept-Range: bytes\r\n"
+                    "Connection: close\r\n"
+                    "Max-Age: 0\r\n"
+                    "Expires: 0\r\n"
+                    "Cache-Control: no-cache, private\r\n"
+                    "Pragma: no-cache\r\n"
+                    "Content-Type: multipart/x-mixed-replace; boundary=mjpegstream\r\n"
+                    "\r\n", 0);
+                cerr << "MJPG_sender: new client " << client << endl;
+            }
+            else // existing client, just stream pix
+            {
+                // When close_all_sockets is set, the client is closed instead of served.
+                if (close_all_sockets) {
+                    int result = close_socket(s);
+                    cerr << "MJPG_sender: close clinet: " << result << " \n";
+                    continue;
+                }
+
+                // Part header first, then the JPEG payload.
+                char head[400];
+                sprintf(head, "--mjpegstream\r\nContent-Type: image/jpeg\r\nContent-Length: %d\r\n\r\n", outlen);
+                _write(s, head, 0);
+                // NOTE(review): &outbuf[0] assumes imencode produced a non-empty buffer.
+                int n = _write(s, (char*)(&outbuf[0]), outlen);
+                cerr << "known client: " << s << ", sent = " << n << ", must be sent outlen = " << outlen << endl;
+                // A short write drops the client: close it and stop polling it.
+                if (n < (int)outlen)
+                {
+                    cerr << "MJPG_sender: kill client " << s << endl;
+                    //::shutdown(s, 2);
+                    close_socket(s);
+                    FD_CLR(s, &master);
+                }
+            }
+        }
+        // With close_all_sockets set, the listening socket is closed as well.
+        if (close_all_sockets) {
+            int result = close_socket(sock);
+            cerr << "MJPG_sender: close acceptor: " << result << " \n\n";
+        }
+        return true;
+    }
+};
+// ----------------------------------------
+
+// Serializes calls to send_mjpeg(), which locks it for the whole call.
+static std::mutex mtx_mjpeg;
+
+//struct mat_cv : cv::Mat { int a[0]; };
+
+// Pushes one frame to the MJPEG HTTP stream.
+//
+// The sender is a function-local static, so `port`, `timeout` and `quality`
+// only take effect on the first call; later calls reuse that sender. The whole
+// call runs under mtx_mjpeg. Any exception is caught and reported on stderr.
+//
+// mat: pointer that is reinterpreted as a cv::Mat and handed to the sender.
+void send_mjpeg(mat_cv* mat, int port, int timeout, int quality)
+{
+    try {
+        std::lock_guard<std::mutex> guard(mtx_mjpeg);
+        static MJPG_sender sender(port, timeout, quality);
+        const cv::Mat &frame = *(cv::Mat*)mat;
+        sender.write(frame);
+        std::cout << " MJPEG-stream sent. \n";
+    }
+    catch (...) {
+        cerr << " Error in send_mjpeg() function \n";
+    }
+}
+// ----------------------------------------
+
+// Returns the current local system time formatted with "%A %F %T"
+// (weekday name, ISO date, 24-hour time).
+//
+// localtime() returns a pointer to shared storage, so the conversion and the
+// formatting are done under a function-local mutex.
+std::string get_system_frame_time_string()
+{
+    const std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+
+    static std::mutex localtime_mtx;
+    std::lock_guard<std::mutex> guard(localtime_mtx);
+
+    char formatted[256];
+    struct tm *local_tm = localtime(&now);
+    std::strftime(formatted, sizeof(formatted), "%A %F %T", local_tm);
+    return std::string(formatted);
+}
+// ----------------------------------------
+
+
+#ifdef __CYGWIN__
+// Cygwin build: HTTP POST reporting is not available. Prints a notice to
+// stderr, ignores every argument and returns 0.
+int send_http_post_request(char *http_post_host, int server_port, const char *videosource,
+    detection *dets, int nboxes, int classes, char **names, long long int frame_id, int ext_output, int timeout)
+{
+    std::cerr << " send_http_post_request() isn't implemented \n";
+    return 0;
+}
+#else   //  __CYGWIN__
+
+#ifndef   NI_MAXHOST
+#define   NI_MAXHOST 1025
+#endif
+
+#ifndef   NI_NUMERICHOST
+#define NI_NUMERICHOST  0x02
+#endif
+
+//#define CPPHTTPLIB_OPENSSL_SUPPORT
+#include "httplib.h"
+
+// https://webhook.site/
+// https://github.com/yhirose/cpp-httplib
+// send an HTTP POST request
+// Reports the detected classes of one frame to an HTTP endpoint via POST.
+//
+// For every detection, the first class whose probability exceeds a small
+// threshold (and whose name does not start with "dont_show") is added to a
+// plain-text message as "<name>, id: <class id>". Every such class is also
+// printed to stdout. If at least one detection qualified, the current time and
+// `videosource` are appended and the message is POSTed.
+//
+// http_post_host: "host" or "host/path"; without a path, "/" is used.
+// server_port:    TCP port of the endpoint.
+// videosource:    label of the video source; NULL is treated as empty.
+// dets, nboxes:   detections of the frame and their count.
+// classes, names: number of classes and their names.
+// frame_id, ext_output: unused here.
+// timeout:        passed to httplib::Client as its timeout in seconds.
+//
+// Returns 1 when a message was built and a POST was attempted (whether or not
+// it succeeded), 0 when there was nothing to report or no host was given.
+int send_http_post_request(char *http_post_host, int server_port, const char *videosource,
+    detection *dets, int nboxes, int classes, char **names, long long int frame_id, int ext_output, int timeout)
+{
+    const float thresh = 0.005; // get_network_boxes() has already filtered dets by the actual threshold
+
+    // Without a host there is nowhere to send the report.
+    if (!http_post_host || !http_post_host[0]) return 0;
+
+    std::string message;
+
+    for (int i = 0; i < nboxes; ++i) {
+        int class_id = -1;
+        for (int j = 0; j < classes; ++j) {
+            const bool show = strncmp(names[j], "dont_show", 9) != 0;
+            if (dets[i].prob[j] > thresh && show) {
+                if (class_id < 0) class_id = j;  // the first qualifying class labels the detection
+                printf("%s: %.0f%% ", names[j], dets[i].prob[j] * 100);
+            }
+        }
+        if (class_id >= 0) {
+            message += std::string(names[class_id]) + std::string(", id: ") + std::to_string(class_id) + "\n";
+        }
+    }
+
+    if (message.empty()) return 0;
+
+    message += "\ntime:\n" + get_system_frame_time_string() + "\n";
+    message += "videosource:\n" + std::string(videosource ? videosource : "");
+
+    // Split "host/path" at the first slash. A bare host has no slash; calling
+    // substr() with the resulting npos would throw, so fall back to "/".
+    const std::string host_and_path = http_post_host;
+    const std::string::size_type slash_index = host_and_path.find('/');
+    std::string http_host;
+    std::string http_path;
+    if (slash_index == std::string::npos) {
+        http_host = host_and_path;
+        http_path = "/";
+    }
+    else {
+        http_host = host_and_path.substr(0, slash_index);
+        http_path = host_and_path.substr(slash_index);
+    }
+
+    // send HTTP-Post request
+    httplib::Client cli(http_host.c_str(), server_port, timeout);
+    auto res = cli.Post(http_path.c_str(), message, "text/plain");
+    if (!res) {
+        std::cerr << " send_http_post_request(): POST request failed \n";
+    }
+
+    return 1;
+}
+#endif   //  __CYGWIN__
+
+#endif      // OPENCV
+
+// -----------------------------------------------------
+
+#if __cplusplus >= 201103L || _MSC_VER >= 1900  // C++11
+
+#include <chrono>
+#include <iostream>
+
+// Timer state shared by the functions below: start_timer() sets steady_start,
+// stop_timer() sets steady_end, and get_time() adds each measured interval
+// (in seconds) to total_time.
+static std::chrono::steady_clock::time_point steady_start, steady_end;
+static double total_time;
+
+// Returns the steady clock's current time since its epoch, as a count of
+// whole microseconds converted to double.
+double get_time_point() {
+    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+    const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(since_epoch);
+    return micros.count();
+}
+
+// Records the current steady-clock time as the start of the measured interval.
+void start_timer() {
+    steady_start = std::chrono::steady_clock::now();
+}
+
+// Records the current steady-clock time as the end of the measured interval.
+void stop_timer() {
+    steady_end = std::chrono::steady_clock::now();
+}
+
+// Returns the interval between the last start_timer() and stop_timer() calls
+// in seconds. Each call also adds that interval to total_time, so calling it
+// twice for the same interval counts it twice.
+double get_time() {
+    const std::chrono::duration<double> elapsed = steady_end - steady_start;
+    const double seconds = elapsed.count();
+    total_time += seconds;
+    return seconds;
+}
+
+// Stops the timer and prints the measured interval in milliseconds to stdout.
+// Goes through get_time(), so the interval is also added to total_time.
+void stop_timer_and_show() {
+    stop_timer();
+    const double msec = get_time() * 1000;
+    std::cout << " " << msec << " msec" << std::endl;
+}
+
+// Stops the timer and prints `name` followed by the measured interval in
+// milliseconds to stdout. Goes through get_time(), so the interval is also
+// added to total_time.
+void stop_timer_and_show_name(char *name) {
+    stop_timer();
+    std::cout << " " << name << " " << get_time() * 1000 << " msec" << std::endl;
+}
+
+// Prints the sum of all intervals accumulated by get_time(), in milliseconds.
+void show_total_time() {
+    std::cout << " Total: " << total_time * 1000 << " msec" << std::endl;
+}
+
+
+// Starts a new thread running func(arg) and stores an opaque handle to it in
+// *tid. The handle is a heap-allocated std::thread that custom_join() joins
+// and deletes.
+//
+// tid:  receives the handle; must not be NULL.
+// attr: ignored.
+// func: thread entry point; must not be NULL.
+// arg:  passed to func unchanged.
+//
+// Returns 0 on success. Returns -1 when tid or func is NULL, or when the
+// thread cannot be created; in the latter case *tid is set to NULL.
+int custom_create_thread(custom_thread_t * tid, const custom_attr_t * attr, void *(*func) (void *), void *arg)
+{
+    (void)attr;
+    // Validate before dereferencing tid.
+    if (!tid || !func) return -1;
+
+    // This function is called through a C interface, so neither a failed
+    // allocation nor a std::thread start failure may escape as an exception.
+    try {
+        std::thread *ptr = new std::thread(func, arg);
+        *tid = (custom_thread_t)ptr;
+        return 0;
+    }
+    catch (...) {
+        *tid = NULL;
+        return -1;
+    }
+}
+
+// Waits for the thread behind `tid` to finish, then frees the handle.
+//
+// tid:       handle that is reinterpreted as a std::thread pointer.
+// value_ptr: ignored; the thread function's return value is not reported.
+//
+// Returns 0 after a successful join, or -1 (with a message on stdout) when
+// `tid` is NULL.
+int custom_join(custom_thread_t tid, void **value_ptr)
+{
+    std::thread *worker = (std::thread *)tid;
+    if (!worker) {
+        printf(" Error: ptr of thread is NULL in custom_join() \n");
+        return -1;
+    }
+
+    worker->join();
+    delete worker;
+    return 0;
+}
+
+// Reads *obj through a std::atomic<int> view of the same address, using the
+// default (sequentially consistent) load.
+int custom_atomic_load_int(volatile int* obj)
+{
+    const volatile std::atomic<int>* atomic_view = (const volatile std::atomic<int>*)obj;
+    return atomic_view->load();
+}
+
+// Writes `desr` to *obj through a std::atomic<int> view of the same address,
+// using the default (sequentially consistent) store.
+void custom_atomic_store_int(volatile int* obj, int desr)
+{
+    volatile std::atomic<int>* atomic_view = (volatile std::atomic<int>*)obj;
+    atomic_view->store(desr);
+}
+
+// Returns std::thread::hardware_concurrency() converted to int. The standard
+// allows that value to be 0 when the count cannot be determined.
+int get_num_threads()
+{
+    return std::thread::hardware_concurrency();
+}
+
+#if !defined(__MINGW64__)
+// Blocks the calling thread for at least `ms_time` milliseconds.
+void this_thread_sleep_for(int ms_time)
+{
+    std::this_thread::sleep_for(std::chrono::milliseconds(ms_time));
+}
+#else
+// MinGW-w64 build: sleeping is not implemented. Prints a notice to stderr and
+// returns immediately without waiting.
+void this_thread_sleep_for(int ms_time)
+{
+    std::cerr << " this_thread_sleep_for() isn't implemented \n";
+    return;
+}
+#endif
+
+// Calls std::this_thread::yield() for the calling thread.
+void this_thread_yield()
+{
+    std::this_thread::yield();
+}
+
+#else // C++11
+#include <iostream>
+
+// Fallbacks for compilers without C++11 <chrono>: the timer API exists but
+// measures nothing. Getters return 0 and the "show" functions either print a
+// notice or do nothing.
+double get_time_point() { return 0; }
+void start_timer() {}
+void stop_timer() {}
+double get_time() { return 0; }
+void stop_timer_and_show() {
+    std::cout << " stop_timer_and_show() isn't implemented " << std::endl;
+}
+void stop_timer_and_show_name(char *name) { stop_timer_and_show(); }
+// Kept for backward compatibility with code that used this name.
+void total_time() {}
+// Counterpart of the C++11 show_total_time(), so both branches define the same
+// set of timer functions.
+void show_total_time() {}
+#endif // C++11
+
+#include <deque>
+#include <vector>
+#include <iostream>
+#include "blas.h"
+#include "utils.h"
+
+// One candidate pairing between a stored detection and a detection from the
+// current frame, with its similarity score.
+struct similarity_detections_t {
+    int old_id;   // index into the flattened vector of stored detections
+    int new_id;   // index into the current frame's detection array
+    float sim;    // similarity score of the pair
+};
+
+// Returns 1 if any class probability of `det` is strictly greater than
+// `thresh`, otherwise 0.
+int check_prob(detection det, float thresh)
+{
+    const float *p = det.prob;
+    for (int remaining = det.classes; remaining > 0; --remaining, ++p) {
+        if (*p > thresh) return 1;
+    }
+    return 0;
+}
+
+// Returns 1 when both detections have the same best class, otherwise 0.
+//
+// The best class of a detection is the one with the highest probability that
+// is above both `thresh` and 0; a detection with no such class has none and
+// never matches. A mismatch in the number of classes is reported via error().
+int check_classes_id(detection det1, detection det2, float thresh)
+{
+    if (det1.classes != det2.classes) {
+        error("Error: det1.classes != det2.classes", DARKNET_LOC);
+    }
+
+    // Index of the strongest class above the threshold, or -1 if there is none.
+    auto best_class = [thresh](const detection &det, int count) -> int {
+        int best_id = -1;
+        float best_prob = 0;
+        for (int k = 0; k < count; ++k) {
+            const float p = det.prob[k];
+            if (p > thresh && p > best_prob) {
+                best_prob = p;
+                best_id = k;
+            }
+        }
+        return best_id;
+    };
+
+    // Both scans use det1.classes as the bound.
+    const int det1_id = best_class(det1, det1.classes);
+    const int det2_id = best_class(det2, det1.classes);
+
+    return (det2_id != -1 && det1_id == det2_id) ? 1 : 0;
+}
+
+// Assigns fresh track ids to detections that have none yet.
+//
+// A detection gets the next id when its track_id is 0, at least one class
+// probability exceeds `thresh`, and its sort_class is greater than
+// `detection_count`.
+//
+// new_track_id: first id to hand out.
+// Returns the next unused track id.
+int fill_remaining_id(detection *new_dets, int new_dets_num, int new_track_id, float thresh, int detection_count)
+{
+    int next_id = new_track_id;
+    for (int i = 0; i < new_dets_num; ++i) {
+        detection &det = new_dets[i];
+        if (det.track_id != 0) continue;            // already tracked
+        if (!check_prob(det, thresh)) continue;     // no class above the threshold
+        if (det.sort_class <= detection_count) continue;
+        det.track_id = next_id++;
+    }
+    return next_id;
+}
+
+// Returns a newly allocated copy of the first `size` floats of `src`.
+// The copy is allocated with xcalloc(); detection_t releases such copies with
+// free().
+float *make_float_array(float* src, size_t size)
+{
+    const size_t bytes = size * sizeof(float);
+    float *copy = (float*)xcalloc(size, sizeof(float));
+    memcpy(copy, src, bytes);
+    return copy;
+}
+
+// A detection that owns deep copies of its `embeddings`, `prob` and `uc`
+// arrays, so it can outlive the detection it was built from.
+//
+// Construction, copy construction and copy assignment all duplicate the three
+// arrays; the destructor frees them. All other inherited fields, including any
+// other pointers, are copied as-is.
+struct detection_t : detection {
+    int det_count;
+
+    // Builds an owning copy of `det`; det_count starts at 0.
+    detection_t(detection det) : detection(det), det_count(0)
+    {
+        clone_buffers(det);
+    }
+
+    // Copies det_count as well as the detection data.
+    detection_t(detection_t const& det) : detection(det), det_count(det.det_count)
+    {
+        clone_buffers(det);
+    }
+
+    // Deep-copying assignment. Without it the implicit operator would share
+    // the arrays between both objects, leading to a double free and a leak.
+    detection_t& operator=(detection_t const& det)
+    {
+        if (this != &det) {
+            release_buffers();
+            detection::operator=(det);
+            det_count = det.det_count;
+            clone_buffers(det);
+        }
+        return *this;
+    }
+
+    ~detection_t() {
+        release_buffers();
+    }
+
+private:
+    // Replaces the pointers copied from `src` with private duplicates.
+    // Expects the base part of *this to have just been copied from `src`.
+    void clone_buffers(detection const& src)
+    {
+        if (embeddings) embeddings = make_float_array(src.embeddings, embedding_size);
+        if (prob) prob = make_float_array(src.prob, classes);
+        if (uc) uc = make_float_array(src.uc, 4);
+    }
+
+    // Frees the owned arrays.
+    void release_buffers()
+    {
+        if (embeddings) free(embeddings);
+        if (prob) free(prob);
+        if (uc) free(uc);
+    }
+};
+
+
+
+// Assigns track ids to the detections of the current frame by matching them
+// against detections remembered from previous calls.
+//
+// State kept between calls (function-local statics): the next unused track id
+// and a queue holding one vector of stored detections per previous call.
+//
+// new_dets, new_dets_num: detections of the current frame; their track_id,
+//                         sim, sort_class and (at the end) prob are modified.
+// thresh:          class-probability threshold used by check_prob() and
+//                  check_classes_id().
+// sim_thresh:      minimum blended score for a match to be accepted.
+// track_ciou_norm: weight of the box IoU in the blended score; the embedding
+//                  similarity gets weight (1 - track_ciou_norm).
+// deque_size:      the oldest stored vector is dropped once the queue holds
+//                  more than this many vectors.
+// dets_for_track:  passed to fill_remaining_id() as its sort_class bound for
+//                  handing out new ids.
+// dets_for_show:   detections with sort_class below this get all their
+//                  probabilities set to 0.
+void set_track_id(detection *new_dets, int new_dets_num, float thresh, float sim_thresh, float track_ciou_norm, int deque_size, int dets_for_track, int dets_for_show)
+{
+    static int new_track_id = 1;
+    static std::deque<std::vector<detection_t>> old_dets_dq;
+
+    // copy detections from queue of vectors to the one vector
+    std::vector<detection_t> old_dets;
+    for (std::vector<detection_t> &v : old_dets_dq) {
+        for (int i = 0; i < v.size(); ++i) {
+            old_dets.push_back(v[i]);
+        }
+    }
+
+    // One entry per (old, new) pair, laid out row-major by old_id.
+    std::vector<similarity_detections_t> sim_det(old_dets.size() * new_dets_num);
+
+    // calculate similarity
+    // The embedding length of old_dets[0] is used for every pair.
+    for (int old_id = 0; old_id < old_dets.size(); ++old_id) {
+        for (int new_id = 0; new_id < new_dets_num; ++new_id) {
+            const int index = old_id*new_dets_num + new_id;
+            const float sim = cosine_similarity(new_dets[new_id].embeddings, old_dets[old_id].embeddings, old_dets[0].embedding_size);
+            sim_det[index].new_id = new_id;
+            sim_det[index].old_id = old_id;
+            sim_det[index].sim = sim;
+        }
+    }
+
+    // sort similarity
+    // Descending, so the most similar pairs are considered first.
+    std::sort(sim_det.begin(), sim_det.end(), [](similarity_detections_t v1, similarity_detections_t v2) { return v1.sim > v2.sim; });
+    //if(sim_det.size() > 0) printf(" sim_det_first = %f, sim_det_end = %f \n", sim_det.begin()->sim, sim_det.rbegin()->sim);
+
+    // Availability flags: 1 = still free to be matched, 0 = already used.
+    std::vector<int> new_idx(new_dets_num, 1);
+    std::vector<int> old_idx(old_dets.size(), 1);
+    std::vector<int> track_idx(new_track_id, 1);
+
+    // match objects
+    for (int index = 0; index < new_dets_num*old_dets.size(); ++index) {
+        const int new_id = sim_det[index].new_id;
+        const int old_id = sim_det[index].old_id;
+        const int track_id = old_dets[old_id].track_id;
+        const int det_count = old_dets[old_id].sort_class;
+        //printf(" ciou = %f \n", box_ciou(new_dets[new_id].bbox, old_dets[old_id].bbox));
+        if (track_idx[track_id] && new_idx[new_id] && old_idx[old_id] && check_classes_id(new_dets[new_id], old_dets[old_id], thresh)) {
+            // Blend the embedding similarity with the box IoU.
+            float sim = sim_det[index].sim;
+            //float ciou = box_ciou(new_dets[new_id].bbox, old_dets[old_id].bbox);
+            float ciou = box_iou(new_dets[new_id].bbox, old_dets[old_id].bbox);
+            sim = sim * (1 - track_ciou_norm) + ciou * track_ciou_norm;
+            // Accept only if above the threshold and better than any score
+            // this new detection already holds.
+            if (sim_thresh < sim && new_dets[new_id].sim < sim) {
+                new_dets[new_id].sim = sim;
+                new_dets[new_id].track_id = track_id;
+                new_dets[new_id].sort_class = det_count + 1;
+                // new_idx is deliberately left set (see the disabled line), so
+                // a later pair with a higher blended score can re-match it.
+                //new_idx[new_id] = 0;
+                old_idx[old_id] = 0;
+                // Track id 0 means "no track" and is never marked as used.
+                if(track_id) track_idx[track_id] = 0;
+            }
+        }
+    }
+
+    // set new track_id
+    new_track_id = fill_remaining_id(new_dets, new_dets_num, new_track_id, thresh, dets_for_track);
+
+    // store new_detections to the queue of vectors
+    // Only detections with a class probability above thresh are remembered.
+    std::vector<detection_t> new_det_vec;
+    for (int i = 0; i < new_dets_num; ++i) {
+        if (check_prob(new_dets[i], thresh)) {
+            new_det_vec.push_back(new_dets[i]);
+        }
+    }
+
+    // add new
+    old_dets_dq.push_back(new_det_vec);
+    // remove old
+    if (old_dets_dq.size() > deque_size) old_dets_dq.pop_front();
+
+    // remove detections which were detected only on a few frames
+    // This happens after storing, so the stored copies keep their probabilities.
+    for (int i = 0; i < new_dets_num; ++i) {
+        if (new_dets[i].sort_class < dets_for_show) {
+            for (int j = 0; j < new_dets[i].classes; ++j) {
+                new_dets[i].prob[j] = 0;
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/http_stream.h b/darknet-master/src/http_stream.h
new file mode 100644
index 0000000..aace9d7
--- /dev/null
+++ b/darknet-master/src/http_stream.h
@@ -0,0 +1,37 @@
+// C-callable interface to the streaming, HTTP-reporting and threading helpers.
+// Everything is declared with C linkage when compiled as C++.
+#ifndef HTTP_STREAM_H
+#define HTTP_STREAM_H
+#include "darknet.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "image.h"
+#include <stdint.h>
+
+// NOTE(review): presumably serves the detections as JSON on `port`; the
+// definition is not part of this header - confirm there.
+void send_json(detection *dets, int nboxes, int classes, char **names, long long int frame_id, int port, int timeout);
+
+#ifdef OPENCV
+// Sends one frame to the MJPEG stream. `port`, `timeout` and `quality` are
+// only used by the first call, which creates the sender.
+void send_mjpeg(mat_cv* mat, int port, int timeout, int quality);
+
+// POSTs a plain-text summary of the detected classes to `http_post_host`.
+// Returns 1 when a request was attempted, 0 otherwise.
+int send_http_post_request(char *http_post_host, int server_port, const char *videosource,
+    detection *dets, int nboxes, int classes, char **names, long long int frame_id, int ext_output, int timeout);
+
+#endif  // OPENCV
+
+// Opaque handle types for the thread helpers below.
+typedef void* custom_thread_t;
+typedef void* custom_attr_t;
+
+// Starts func(arg) on a new thread and stores its handle in *tid; `attr` is
+// not used by the implementation.
+int custom_create_thread(custom_thread_t * tid, const custom_attr_t * attr, void *(*func) (void *), void *arg);
+// Joins the thread and releases its handle; returns 0 on success, -1 when the
+// handle is NULL. `value_ptr` is not used by the implementation.
+int custom_join(custom_thread_t thread, void **value_ptr);
+
+// Atomic load/store of an int through a std::atomic<int> view of `obj`.
+int custom_atomic_load_int(volatile int* obj);
+void custom_atomic_store_int(volatile int* obj, int desr);
+// Returns std::thread::hardware_concurrency().
+int get_num_threads();
+// Sleeps for `ms_time` milliseconds (not implemented on MinGW-w64 builds).
+void this_thread_sleep_for(int ms_time);
+// Yields the calling thread.
+void this_thread_yield();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HTTP_STREAM_H
diff --git a/darknet-master/src/httplib.h b/darknet-master/src/httplib.h
new file mode 100644
index 0000000..e4678fa
--- /dev/null
+++ b/darknet-master/src/httplib.h
@@ -0,0 +1,4039 @@
+//
+//  httplib.h
+//
+//  Copyright (c) 2019 Yuji Hirose. All rights reserved.
+//  MIT License
+//
+
+#ifndef CPPHTTPLIB_HTTPLIB_H
+#define CPPHTTPLIB_HTTPLIB_H
+
+/*
+ * Configuration
+ */
+
+#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
+#endif
+
+#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND 0
+#endif
+
+#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
+#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
+#endif
+
+#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
+#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5
+#endif
+
+#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
+#define CPPHTTPLIB_READ_TIMEOUT_USECOND 0
+#endif
+
+#ifndef CPPHTTPLIB_REQUEST_URI_MAX_LENGTH
+#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 8192
+#endif
+
+#ifndef CPPHTTPLIB_REDIRECT_MAX_COUNT
+#define CPPHTTPLIB_REDIRECT_MAX_COUNT 20
+#endif
+
+#ifndef CPPHTTPLIB_PAYLOAD_MAX_LENGTH
+#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH (std::numeric_limits<size_t>::max)()
+#endif
+
+#ifndef CPPHTTPLIB_RECV_BUFSIZ
+#define CPPHTTPLIB_RECV_BUFSIZ size_t(4096u)
+#endif
+
+#ifndef CPPHTTPLIB_THREAD_POOL_COUNT
+#define CPPHTTPLIB_THREAD_POOL_COUNT 8
+#endif
+
+/*
+ * Headers
+ */
+
+#ifdef _WIN32
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif //_CRT_SECURE_NO_WARNINGS
+
+#ifndef _CRT_NONSTDC_NO_DEPRECATE
+#define _CRT_NONSTDC_NO_DEPRECATE
+#endif //_CRT_NONSTDC_NO_DEPRECATE
+
+#if defined(_MSC_VER)
+#ifdef _WIN64
+using ssize_t = __int64;
+#else
+using ssize_t = int;
+#endif
+
+#if _MSC_VER < 1900
+#define snprintf _snprintf_s
+#endif
+#endif // _MSC_VER
+
+#ifndef S_ISREG
+#define S_ISREG(m) (((m)&S_IFREG) == S_IFREG)
+#endif // S_ISREG
+
+#ifndef S_ISDIR
+#define S_ISDIR(m) (((m)&S_IFDIR) == S_IFDIR)
+#endif // S_ISDIR
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif // NOMINMAX
+
+#include <io.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+
+#ifndef WSA_FLAG_NO_HANDLE_INHERIT
+#define WSA_FLAG_NO_HANDLE_INHERIT 0x80
+#endif
+
+#ifdef _MSC_VER
+#pragma comment(lib, "ws2_32.lib")
+#endif
+
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif // strcasecmp
+
+using socket_t = SOCKET;
+#ifdef CPPHTTPLIB_USE_POLL
+#define poll(fds, nfds, timeout) WSAPoll(fds, nfds, timeout)
+#endif
+
+#else // not _WIN32
+
+#include <arpa/inet.h>
+#include <cstring>
+#include <netdb.h>
+#include <netinet/in.h>
+#ifdef CPPHTTPLIB_USE_POLL
+#include <poll.h>
+#endif
+#include <pthread.h>
+#include <csignal>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+using socket_t = int;
+
+#ifndef INVALID_SOCKET
+#define INVALID_SOCKET -1
+#endif
+#endif //_WIN32
+
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <errno.h>
+#include <fcntl.h>
+#include <fstream>
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <regex>
+#include <string>
+#include <sys/stat.h>
+#include <thread>
+#include <utility>
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#include <openssl/err.h>
+#include <openssl/ssl.h>
+#include <openssl/x509v3.h>
+
+// #if OPENSSL_VERSION_NUMBER < 0x1010100fL
+// #error Sorry, OpenSSL versions prior to 1.1.1 are not supported
+// #endif
+
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+#include <openssl/crypto.h>
+// Compatibility shim, compiled only when OPENSSL_VERSION_NUMBER is below
+// 0x10100000L: forwards to the M_ASN1_STRING_data macro.
+inline const unsigned char *ASN1_STRING_get0_data(const ASN1_STRING *asn1) {
+  return M_ASN1_STRING_data(asn1);
+}
+#endif
+#endif
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+#include <zlib.h>
+#endif
+
+/*
+ * Declaration
+ */
+namespace httplib {
+
+namespace detail {
+
+// Case-insensitive "less than" for strings; used as the ordering of Headers so
+// that header names compare equal regardless of letter case.
+struct ci {
+  // Returns true when s1 orders before s2, comparing character by character
+  // after lower-casing each one.
+  bool operator()(const std::string &s1, const std::string &s2) const {
+    // Characters are taken as unsigned char: passing a negative char value
+    // to ::tolower is undefined behaviour.
+    return std::lexicographical_compare(
+        s1.begin(), s1.end(), s2.begin(), s2.end(),
+        [](unsigned char c1, unsigned char c2) {
+          return ::tolower(c1) < ::tolower(c2);
+        });
+  }
+};
+
+} // namespace detail
+
+// HTTP protocol versions named by this library.
+enum class HttpVersion { v1_0 = 0, v1_1 };
+
+// Header multimap ordered case-insensitively (see detail::ci); a name may
+// appear more than once.
+using Headers = std::multimap<std::string, std::string, detail::ci>;
+
+// Parameter multimap with the default (case-sensitive) ordering.
+using Params = std::multimap<std::string, std::string>;
+// Regex match results stored in Request::matches.
+using Match = std::smatch;
+
+// Callback that receives a chunk of `data_len` bytes at `data`.
+using DataSink = std::function<void(const char *data, size_t data_len)>;
+
+// Callback taking no arguments; used as the `done` argument of
+// ContentProviderWithCloser.
+using Done = std::function<void()>;
+
+// Callback asked to push content for the given offset/length into `sink`.
+using ContentProvider = std::function<void(size_t offset, size_t length, DataSink sink)>;
+
+// Like ContentProvider, with an additional `done` callback.
+using ContentProviderWithCloser = std::function<void(size_t offset, size_t length, DataSink sink, Done done)>;
+
+// Callback that receives a chunk of content and returns a bool.
+// NOTE(review): presumably returning false aborts the transfer - confirm in the implementation.
+using ContentReceiver = std::function<bool(const char *data, size_t data_length)>;
+
+// Callback that is handed a ContentReceiver and returns a bool.
+using ContentReader = std::function<bool(ContentReceiver receiver)>;
+
+// Progress callback receiving the current and total counts; returns a bool.
+using Progress = std::function<bool(uint64_t current, uint64_t total)>;
+
+struct Response;
+// Callback invoked with a Response; returns a bool.
+using ResponseHandler = std::function<bool(const Response &response)>;
+
+// Describes one file part of a multipart request by its position in a buffer
+// rather than by holding the bytes itself.
+// NOTE(review): offset/length presumably index into Request::body - confirm in the parser.
+struct MultipartFile {
+  std::string filename;
+  std::string content_type;
+  size_t offset{0};
+  size_t length{0};
+};
+// Multimap from a string key to file parts; a key may appear more than once.
+using MultipartFiles = std::multimap<std::string, MultipartFile>;
+
+// One item of a multipart form: a name, the content itself, and an optional
+// filename and content type.
+struct MultipartFormData {
+  std::string name;
+  std::string content;
+  std::string filename;
+  std::string content_type;
+};
+using MultipartFormDataItems = std::vector<MultipartFormData>;
+
+// A pair of signed offsets, and a list of such pairs (see Request::ranges).
+// NOTE(review): presumably (first, last) byte positions with -1 meaning "open" - confirm in the range parser.
+using Range = std::pair<ssize_t, ssize_t>;
+using Ranges = std::vector<Range>;
+
+// An HTTP request, used both by the server (parsed from the wire) and by the
+// client (built before sending). Member functions are defined outside this
+// struct.
+struct Request {
+  std::string method;
+  std::string path;
+  Headers headers;
+  std::string body;
+
+  // for server
+  std::string version;
+  std::string target;
+  Params params;
+  MultipartFiles files;
+  Ranges ranges;
+  Match matches;
+
+  // for client
+  size_t redirect_count = CPPHTTPLIB_REDIRECT_MAX_COUNT;
+  ResponseHandler response_handler;
+  ContentReceiver content_receiver;
+  Progress progress;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+  // Initialized so a default-constructed Request never holds a wild pointer.
+  const SSL *ssl = nullptr;
+#endif
+
+  // Header accessors; `id` selects among repeated headers of the same name.
+  bool has_header(const char *key) const;
+  std::string get_header_value(const char *key, size_t id = 0) const;
+  size_t get_header_value_count(const char *key) const;
+  void set_header(const char *key, const char *val);
+  void set_header(const char *key, const std::string &val);
+
+  // Parameter accessors; `id` selects among repeated parameters.
+  bool has_param(const char *key) const;
+  std::string get_param_value(const char *key, size_t id = 0) const;
+  size_t get_param_value_count(const char *key) const;
+
+  // Multipart file accessors.
+  bool has_file(const char *key) const;
+  MultipartFile get_file_value(const char *key) const;
+
+  // private members...
+  // Initialized so a default-constructed Request has a defined length.
+  size_t content_length = 0;
+  ContentProvider content_provider;
+};
+
+// An HTTP response. Member functions other than the constructor and the
+// destructor are defined outside this struct.
+struct Response {
+  std::string version;
+  int status;
+  Headers headers;
+  std::string body;
+
+  // Header accessors; `id` selects among repeated headers of the same name.
+  bool has_header(const char *key) const;
+  std::string get_header_value(const char *key, size_t id = 0) const;
+  size_t get_header_value_count(const char *key) const;
+  void set_header(const char *key, const char *val);
+  void set_header(const char *key, const std::string &val);
+
+  void set_redirect(const char *url);
+  void set_content(const char *s, size_t n, const char *content_type);
+  void set_content(const std::string &s, const char *content_type);
+
+  // Installs a provider for content of a known `length`; the releaser
+  // defaults to a no-op.
+  void set_content_provider(
+      size_t length,
+      std::function<void(size_t offset, size_t length, DataSink sink)> provider,
+      std::function<void()> resource_releaser = [] {});
+
+  // Installs a provider that takes a `done` callback instead of a length; the
+  // releaser defaults to a no-op.
+  void set_chunked_content_provider(
+      std::function<void(size_t offset, DataSink sink, Done done)> provider,
+      std::function<void()> resource_releaser = [] {});
+
+  // A fresh response has status -1 and a content length of 0.
+  Response() : status(-1), content_length(0) {}
+
+  // Runs the resource releaser, if one is set.
+  // NOTE(review): a copied Response holds the same releaser, so it would run once per copy - confirm responses are never copied.
+  ~Response() {
+    if (content_provider_resource_releaser) {
+      content_provider_resource_releaser();
+    }
+  }
+
+  // private members...
+  size_t content_length;
+  ContentProviderWithCloser content_provider;
+  std::function<void()> content_provider_resource_releaser;
+};
+
+// Abstract byte stream used for reading requests and writing responses.
+// Implemented in this header by SocketStream and BufferStream.
+class Stream {
+public:
+  virtual ~Stream() = default;
+  // NOTE(review): the int results are presumably byte counts with a negative
+  // value on error - confirm in the implementations.
+  virtual int read(char *ptr, size_t size) = 0;
+  virtual int write(const char *ptr, size_t size1) = 0;
+  virtual int write(const char *ptr) = 0;
+  virtual int write(const std::string &s) = 0;
+  virtual std::string get_remote_addr() const = 0;
+
+  // Formatted write; defined outside the class.
+  template <typename... Args>
+  int write_format(const char *fmt, const Args &... args);
+};
+
+// Stream implementation holding a socket and a read timeout, given as separate
+// second and microsecond parts. Member functions are defined outside the
+// class.
+class SocketStream : public Stream {
+public:
+  SocketStream(socket_t sock, time_t read_timeout_sec,
+               time_t read_timeout_usec);
+  ~SocketStream() override;
+
+  int read(char *ptr, size_t size) override;
+  int write(const char *ptr, size_t size) override;
+  int write(const char *ptr) override;
+  int write(const std::string &s) override;
+  std::string get_remote_addr() const override;
+
+private:
+  socket_t sock_;
+  time_t read_timeout_sec_;
+  time_t read_timeout_usec_;
+};
+
+// Stream implementation holding an in-memory string buffer, exposed read-only
+// through get_buffer(). Member functions are defined outside the class.
+class BufferStream : public Stream {
+public:
+  BufferStream() = default;
+  ~BufferStream() override = default;
+
+  int read(char *ptr, size_t size) override;
+  int write(const char *ptr, size_t size) override;
+  int write(const char *ptr) override;
+  int write(const std::string &s) override;
+  std::string get_remote_addr() const override;
+
+  const std::string &get_buffer() const;
+
+private:
+  std::string buffer;
+};
+
+// Interface for running queued callables. The implementations in this header
+// are ThreadPool, Threads and NoThread, selected by
+// CPPHTTPLIB_THREAD_POOL_COUNT.
+class TaskQueue {
+public:
+  TaskQueue() = default;
+  virtual ~TaskQueue() = default;
+  // Hands `fn` over for execution.
+  virtual void enqueue(std::function<void()> fn) = 0;
+  // Called to stop the queue; each implementation defines what that means.
+  virtual void shutdown() = 0;
+};
+
+#if CPPHTTPLIB_THREAD_POOL_COUNT > 0
+// TaskQueue backed by a fixed number of worker threads that take jobs from a
+// shared FIFO list.
+//
+// shutdown() must be called before the pool is destroyed: the destructor is
+// defaulted and does not join the workers.
+class ThreadPool : public TaskQueue {
+public:
+  // Starts `n` worker threads.
+  explicit ThreadPool(size_t n) : shutdown_(false) {
+    while (n) {
+      threads_.emplace_back(worker(*this));
+      n--;
+    }
+  }
+
+  ThreadPool(const ThreadPool &) = delete;
+  ~ThreadPool() override = default;
+
+  // Appends `fn` to the job list and wakes one worker.
+  void enqueue(std::function<void()> fn) override {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // `fn` is already a private copy; move it instead of copying again.
+    jobs_.push_back(std::move(fn));
+    cond_.notify_one();
+  }
+
+  // Lets the workers finish the jobs still queued, then joins them. Calling
+  // it again is harmless: already-joined threads are skipped.
+  void shutdown() override {
+    // Stop all worker threads...
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      shutdown_ = true;
+    }
+
+    cond_.notify_all();
+
+    // Join...
+    for (auto &t : threads_) {
+      if (t.joinable()) { t.join(); }
+    }
+  }
+
+private:
+  // Body of each worker thread: wait for a job or for shutdown, run the job
+  // outside the lock, repeat.
+  struct worker {
+    explicit worker(ThreadPool &pool) : pool_(pool) {}
+
+    void operator()() {
+      for (;;) {
+        std::function<void()> fn;
+        {
+          std::unique_lock<std::mutex> lock(pool_.mutex_);
+
+          pool_.cond_.wait(
+              lock, [&] { return !pool_.jobs_.empty() || pool_.shutdown_; });
+
+          // Exit only once the queue is drained.
+          if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
+
+          // The front element is popped next, so it can be moved from.
+          fn = std::move(pool_.jobs_.front());
+          pool_.jobs_.pop_front();
+        }
+
+        assert(true == static_cast<bool>(fn));
+        fn();
+      }
+    }
+
+    ThreadPool &pool_;
+  };
+  friend struct worker;
+
+  std::vector<std::thread> threads_;
+  std::list<std::function<void()>> jobs_;
+
+  bool shutdown_;  // guarded by mutex_
+
+  std::condition_variable cond_;
+  std::mutex mutex_;
+};
+#elif CPPHTTPLIB_THREAD_POOL_COUNT == 0
+// TaskQueue that runs every job on its own detached thread and counts how many
+// jobs are still running.
+class Threads : public TaskQueue {
+public:
+  Threads() : running_threads_(0) {}
+  virtual ~Threads() {}
+
+  // Starts a detached thread that runs `fn`. The running-thread counter is
+  // incremented before `fn` and decremented after it, both under the mutex.
+  virtual void enqueue(std::function<void()> fn) override {
+    std::thread job([=]() {
+      {
+        std::lock_guard<std::mutex> lock(running_threads_mutex_);
+        ++running_threads_;
+      }
+
+      fn();
+
+      {
+        std::lock_guard<std::mutex> lock(running_threads_mutex_);
+        --running_threads_;
+      }
+    });
+    job.detach();
+  }
+
+  // Polls every 10 ms until the running-thread counter reads zero.
+  virtual void shutdown() override {
+    bool idle = false;
+    while (!idle) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(10));
+      std::lock_guard<std::mutex> lock(running_threads_mutex_);
+      idle = (running_threads_ == 0);
+    }
+  }
+
+private:
+  std::mutex running_threads_mutex_;
+  int running_threads_;
+};
+#else
+// TaskQueue that uses no threads: enqueue() runs the job immediately on the
+// calling thread and shutdown() does nothing.
+class NoThread : public TaskQueue {
+public:
+  NoThread() {}
+  virtual ~NoThread() {}
+
+  virtual void enqueue(std::function<void()> fn) override {
+    fn();
+  }
+
+  virtual void shutdown() override {
+  }
+};
+#endif
+
+// HTTP server interface. This class body only declares the members; the
+// definitions are outside it.
+class Server {
+public:
+  // Callback types: a plain request handler, a handler that additionally
+  // receives a ContentReader, and a logger that sees request and response.
+  using Handler = std::function<void(const Request &, Response &)>;
+  using HandlerWithContentReader = std::function<void(const Request &, Response &,
+                             const ContentReader &content_reader)>;
+  using Logger = std::function<void(const Request &, const Response &)>;
+
+  Server();
+
+  virtual ~Server();
+
+  virtual bool is_valid() const;
+
+  // Handler registration, one function per HTTP method. The handlers are kept
+  // as (std::regex, handler) pairs (see Handlers below).
+  // NOTE(review): each returns Server&, presumably *this for chaining - confirm in the definitions.
+  Server &Get(const char *pattern, Handler handler);
+  Server &Post(const char *pattern, Handler handler);
+  Server &Post(const char *pattern, HandlerWithContentReader handler);
+  Server &Put(const char *pattern, Handler handler);
+  Server &Put(const char *pattern, HandlerWithContentReader handler);
+  Server &Patch(const char *pattern, Handler handler);
+  Server &Patch(const char *pattern, HandlerWithContentReader handler);
+  Server &Delete(const char *pattern, Handler handler);
+  Server &Options(const char *pattern, Handler handler);
+
+  // Static-file configuration.
+  bool set_base_dir(const char *dir, const char *mount_point = nullptr);
+  void set_file_request_handler(Handler handler);
+
+  void set_error_handler(Handler handler);
+  void set_logger(Logger logger);
+
+  // Setters for the limits stored in the protected members below.
+  void set_keep_alive_max_count(size_t count);
+  void set_read_timeout(time_t sec, time_t usec);
+  void set_payload_max_length(size_t length);
+
+  // Binding and listening can be done in separate steps, or together through
+  // listen().
+  bool bind_to_port(const char *host, int port, int socket_flags = 0);
+  int bind_to_any_port(const char *host, int socket_flags = 0);
+  bool listen_after_bind();
+
+  bool listen(const char *host, int port, int socket_flags = 0);
+
+  bool is_running() const;
+  void stop();
+
+  // Factory for the TaskQueue; public so callers can replace it.
+  std::function<TaskQueue *(void)> new_task_queue;
+
+protected:
+  bool process_request(Stream &strm, bool last_connection,
+                       bool &connection_close,
+                       const std::function<void(Request &)>& setup_request);
+
+  size_t keep_alive_max_count_;
+  time_t read_timeout_sec_;
+  time_t read_timeout_usec_;
+  size_t payload_max_length_;
+
+private:
+  // Registered handlers, stored as (pattern, handler) pairs.
+  using Handlers = std::vector<std::pair<std::regex, Handler>>;
+  using HandersForContentReader = std::vector<std::pair<std::regex, HandlerWithContentReader>>;
+
+  socket_t create_server_socket(const char *host, int port,
+                                int socket_flags) const;
+  int bind_internal(const char *host, int port, int socket_flags);
+  bool listen_internal();
+
+  bool routing(Request &req, Response &res, Stream &strm, bool last_connection);
+  bool handle_file_request(Request &req, Response &res);
+  bool dispatch_request(Request &req, Response &res, Handlers &handlers);
+  bool dispatch_request_for_content_reader(Request &req, Response &res,
+                                           ContentReader content_reader,
+                                           HandersForContentReader &handlers);
+
+  bool parse_request_line(const char *s, Request &req);
+  bool write_response(Stream &strm, bool last_connection, const Request &req,
+                      Response &res);
+  bool write_content_with_provider(Stream &strm, const Request &req,
+                                   Response &res, const std::string &boundary,
+                                   const std::string &content_type);
+  bool read_content(Stream &strm, bool last_connection, Request &req,
+                    Response &res);
+  bool read_content_with_content_receiver(Stream &strm, bool last_connection,
+                                          Request &req, Response &res,
+                                          ContentReceiver reveiver);
+
+  virtual bool process_and_close_socket(socket_t sock);
+
+  // Running flag and listening socket are atomics.
+  std::atomic<bool> is_running_;
+  std::atomic<socket_t> svr_sock_;
+  std::vector<std::pair<std::string, std::string>> base_dirs_;
+  Handler file_request_handler_;
+  // One handler table per HTTP method (plus content-reader variants).
+  Handlers get_handlers_;
+  Handlers post_handlers_;
+  HandersForContentReader post_handlers_for_content_reader;
+  Handlers put_handlers_;
+  HandersForContentReader put_handlers_for_content_reader;
+  Handlers patch_handlers_;
+  HandersForContentReader patch_handlers_for_content_reader;
+  Handlers delete_handlers_;
+  Handlers options_handlers_;
+  Handler error_handler_;
+  Logger logger_;
+};
+
+// Client declaration. The verb helpers (Get/Head/Post/Put/Patch/Delete/
+// Options) each return a std::shared_ptr<Response>; their definitions are not
+// part of this declaration.
+class Client {
+public:
+  // `port` defaults to 80 and `timeout_sec` to 300.
+  explicit Client(const char *host, int port = 80, time_t timeout_sec = 300);
+
+  virtual ~Client();
+
+  virtual bool is_valid() const;
+
+  // --- GET overloads: optional headers, response handler, content receiver
+  // and progress callback in various combinations.
+  std::shared_ptr<Response> Get(const char *path);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers);
+
+  std::shared_ptr<Response> Get(const char *path, Progress progress);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers,
+                                Progress progress);
+
+  std::shared_ptr<Response> Get(const char *path,
+                                ContentReceiver content_receiver);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers,
+                                ContentReceiver content_receiver);
+
+  std::shared_ptr<Response>
+  Get(const char *path, ContentReceiver content_receiver, Progress progress);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers,
+                                ContentReceiver content_receiver,
+                                Progress progress);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers,
+                                ResponseHandler response_handler,
+                                ContentReceiver content_receiver);
+
+  std::shared_ptr<Response> Get(const char *path, const Headers &headers,
+                                ResponseHandler response_handler,
+                                ContentReceiver content_receiver,
+                                Progress progress);
+
+  // --- HEAD overloads.
+  std::shared_ptr<Response> Head(const char *path);
+
+  std::shared_ptr<Response> Head(const char *path, const Headers &headers);
+
+  // --- POST overloads: string body, content provider, form parameters or
+  // multipart items. `compress` defaults to false everywhere.
+  std::shared_ptr<Response> Post(const char *path, const std::string &body,
+                                 const char *content_type,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, const Headers &headers,
+                                 const std::string &body,
+                                 const char *content_type,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, size_t content_length,
+                                 ContentProvider content_provider,
+                                 const char *content_type,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, const Headers &headers,
+                                 size_t content_length,
+                                 ContentProvider content_provider,
+                                 const char *content_type,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, const Params &params,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, const Headers &headers,
+                                 const Params &params, bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path,
+                                 const MultipartFormDataItems &items,
+                                 bool compress = false);
+
+  std::shared_ptr<Response> Post(const char *path, const Headers &headers,
+                                 const MultipartFormDataItems &items,
+                                 bool compress = false);
+
+  // --- PUT overloads: string body or content provider.
+  std::shared_ptr<Response> Put(const char *path, const std::string &body,
+                                const char *content_type,
+                                bool compress = false);
+
+  std::shared_ptr<Response> Put(const char *path, const Headers &headers,
+                                const std::string &body,
+                                const char *content_type,
+                                bool compress = false);
+
+  std::shared_ptr<Response> Put(const char *path, size_t content_length,
+                                ContentProvider content_provider,
+                                const char *content_type,
+                                bool compress = false);
+
+  std::shared_ptr<Response> Put(const char *path, const Headers &headers,
+                                size_t content_length,
+                                ContentProvider content_provider,
+                                const char *content_type,
+                                bool compress = false);
+
+  // --- PATCH overloads: string body or content provider.
+  std::shared_ptr<Response> Patch(const char *path, const std::string &body,
+                                  const char *content_type,
+                                  bool compress = false);
+
+  std::shared_ptr<Response> Patch(const char *path, const Headers &headers,
+                                  const std::string &body,
+                                  const char *content_type,
+                                  bool compress = false);
+
+  std::shared_ptr<Response> Patch(const char *path, size_t content_length,
+                                  ContentProvider content_provider,
+                                  const char *content_type,
+                                  bool compress = false);
+
+  std::shared_ptr<Response> Patch(const char *path, const Headers &headers,
+                                  size_t content_length,
+                                  ContentProvider content_provider,
+                                  const char *content_type,
+                                  bool compress = false);
+
+  // --- DELETE overloads: with or without a body.
+  std::shared_ptr<Response> Delete(const char *path);
+
+  std::shared_ptr<Response> Delete(const char *path, const std::string &body,
+                                   const char *content_type);
+
+  std::shared_ptr<Response> Delete(const char *path, const Headers &headers);
+
+  std::shared_ptr<Response> Delete(const char *path, const Headers &headers,
+                                   const std::string &body,
+                                   const char *content_type);
+
+  // --- OPTIONS overloads.
+  std::shared_ptr<Response> Options(const char *path);
+
+  std::shared_ptr<Response> Options(const char *path, const Headers &headers);
+
+  // Sends an already-built request and fills `res`.
+  bool send(const Request &req, Response &res);
+
+  // Batch form: takes several requests and an output vector of responses.
+  bool send(const std::vector<Request> &requests,
+            std::vector<Response> &responses);
+
+  void set_keep_alive_max_count(size_t count);
+  void set_read_timeout(time_t sec, time_t usec);
+
+  void follow_location(bool on);
+
+protected:
+  bool process_request(Stream &strm, const Request &req, Response &res,
+                       bool last_connection, bool &connection_close);
+
+  const std::string host_;
+  const int port_;
+  time_t timeout_sec_;
+  const std::string host_and_port_;
+  size_t keep_alive_max_count_;
+  time_t read_timeout_sec_;
+  time_t read_timeout_usec_;
+  // NOTE(review): declared size_t although the setter follow_location() takes
+  // a bool; presumably used as a flag. Confirm before changing the type.
+  size_t follow_location_;
+
+private:
+  socket_t create_client_socket() const;
+  bool read_response_line(Stream &strm, Response &res);
+  void write_request(Stream &strm, const Request &req, bool last_connection);
+  bool redirect(const Request &req, Response &res);
+
+  std::shared_ptr<Response>
+  send_with_content_provider(const char *method, const char *path,
+                             const Headers &headers, const std::string &body,
+                             size_t content_length,
+                             ContentProvider content_provider,
+                             const char *content_type, bool compress);
+
+  // Virtual hook: SSLClient declares an override with the same signature.
+  virtual bool process_and_close_socket(
+      socket_t sock, size_t request_count,
+      std::function<bool(Stream &strm, bool last_connection,
+                         bool &connection_close)>
+          callback);
+
+  virtual bool is_ssl() const;
+};
+
+// Appends a GET request for `path`, carrying a copy of `headers`, to
+// `requests`.
+inline void Get(std::vector<Request> &requests, const char *path,
+                const Headers &headers) {
+  Request pending;
+  pending.headers = headers;
+  pending.path = path;
+  pending.method = "GET";
+  requests.push_back(std::move(pending));
+}
+
+// Appends a GET request for `path` with no extra headers to `requests`.
+inline void Get(std::vector<Request> &requests, const char *path) {
+  const Headers no_headers;
+  Get(requests, path, no_headers);
+}
+
+// Appends a POST request for `path` to `requests`. The request carries a copy
+// of `headers` plus a "Content-Type" entry set to `content_type`, and `body`
+// as its payload.
+inline void Post(std::vector<Request> &requests, const char *path,
+                 const Headers &headers, const std::string &body,
+                 const char *content_type) {
+  Request pending;
+  pending.method = "POST";
+  pending.path = path;
+  pending.body = body;
+  // Copy the caller's headers first, then add the content type on top.
+  pending.headers = headers;
+  pending.headers.emplace("Content-Type", content_type);
+  requests.push_back(std::move(pending));
+}
+
+// Appends a POST request for `path` with no extra headers to `requests`.
+inline void Post(std::vector<Request> &requests, const char *path,
+                 const std::string &body, const char *content_type) {
+  const Headers no_headers;
+  Post(requests, path, no_headers, body, content_type);
+}
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+// Stream implementation declared for use with an OpenSSL `SSL` handle. It
+// stores the socket, the SSL pointer and the read timeout it was constructed
+// with; the member definitions are not part of this declaration.
+class SSLSocketStream : public Stream {
+public:
+  SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec,
+                  time_t read_timeout_usec);
+  virtual ~SSLSocketStream();
+
+  // Stream interface.
+  virtual int read(char *ptr, size_t size);
+  virtual int write(const char *ptr, size_t size);
+  virtual int write(const char *ptr);
+  virtual int write(const std::string &s);
+  virtual std::string get_remote_addr() const;
+
+private:
+  socket_t sock_;
+  SSL *ssl_;
+  time_t read_timeout_sec_;
+  time_t read_timeout_usec_;
+};
+
+// Server subclass holding an OpenSSL context (`SSL_CTX`). It is constructed
+// from a certificate path and a private key path; the two client CA
+// arguments are optional and default to nullptr.
+class SSLServer : public Server {
+public:
+  SSLServer(const char *cert_path, const char *private_key_path,
+            const char *client_ca_cert_file_path = nullptr,
+            const char *client_ca_cert_dir_path = nullptr);
+
+  virtual ~SSLServer();
+
+  virtual bool is_valid() const;
+
+private:
+  // Overrides the per-connection hook declared in Server.
+  virtual bool process_and_close_socket(socket_t sock);
+
+  SSL_CTX *ctx_;
+  // NOTE(review): presumably guards use of ctx_ — confirm in the definitions.
+  std::mutex ctx_mutex_;
+};
+
+// Client subclass holding an OpenSSL context (`SSL_CTX`). The port defaults
+// to 443; the client certificate and key paths are optional.
+class SSLClient : public Client {
+public:
+  SSLClient(const char *host, int port = 443, time_t timeout_sec = 300,
+            const char *client_cert_path = nullptr,
+            const char *client_key_path = nullptr);
+
+  virtual ~SSLClient();
+
+  virtual bool is_valid() const;
+
+  // Sets the CA certificate file and, optionally, a CA certificate directory.
+  void set_ca_cert_path(const char *ca_cert_file_path,
+                        const char *ca_cert_dir_path = nullptr);
+  // Turns server certificate verification on or off (off by default, see
+  // server_certificate_verification_ below).
+  void enable_server_certificate_verification(bool enabled);
+
+  long get_openssl_verify_result() const;
+
+  SSL_CTX *ssl_context() const noexcept;
+
+private:
+  // Overrides of the virtual hooks declared in Client.
+  virtual bool process_and_close_socket(
+      socket_t sock, size_t request_count,
+      std::function<bool(Stream &strm, bool last_connection,
+                         bool &connection_close)>
+          callback);
+  virtual bool is_ssl() const;
+
+  // Host name checks against the server certificate.
+  bool verify_host(X509 *server_cert) const;
+  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
+  bool verify_host_with_common_name(X509 *server_cert) const;
+  bool check_host_name(const char *pattern, size_t pattern_len) const;
+
+  SSL_CTX *ctx_;
+  std::mutex ctx_mutex_;
+  std::vector<std::string> host_components_;
+  std::string ca_cert_file_path_;
+  std::string ca_cert_dir_path_;
+  bool server_certificate_verification_ = false;
+  long verify_result_ = 0;
+};
+#endif
+
+/*
+ * Implementation
+ */
+
+namespace detail {
+
+// Converts one hexadecimal digit character to its value.
+// Returns true and stores 0..15 in `v` when `c` is [0-9A-Fa-f]; otherwise
+// returns false and leaves `v` untouched.
+inline bool is_hex(char c, int &v) {
+  // The 0x20 lower bound keeps negative/control values away from isdigit().
+  if (0x20 <= c && isdigit(c)) {
+    v = c - '0';
+    return true;
+  }
+  if (c >= 'A' && c <= 'F') {
+    v = 10 + (c - 'A');
+    return true;
+  }
+  if (c >= 'a' && c <= 'f') {
+    v = 10 + (c - 'a');
+    return true;
+  }
+  return false;
+}
+
+// Parses exactly `cnt` hexadecimal digits of `s` starting at index `i` into
+// `val`. Returns false when `i` is out of range, when the string ends before
+// `cnt` digits were read, or when a non-hex character is met.
+inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt,
+                          int &val) {
+  if (i >= s.size()) { return false; }
+
+  val = 0;
+  while (cnt != 0) {
+    const char ch = s[i];
+    // s[s.size()] is '\0', so this also stops at the end of the string.
+    if (!ch) { return false; }
+
+    int digit = 0;
+    if (!is_hex(ch, digit)) { return false; }
+    val = val * 16 + digit;
+
+    ++i;
+    --cnt;
+  }
+  return true;
+}
+
+// Formats `n` as lowercase hexadecimal without a prefix or padding
+// (0 becomes "0").
+inline std::string from_i_to_hex(size_t n) {
+  const char *digits = "0123456789abcdef";
+  std::string hex;
+  for (;;) {
+    // Nibbles come out least-significant first, so prepend each one.
+    hex.insert(hex.begin(), digits[n & 15]);
+    n >>= 4;
+    if (n == 0) { break; }
+  }
+  return hex;
+}
+
+// Encodes code point `code` as UTF-8 into `buff` (which must have room for up
+// to 4 bytes) and returns the number of bytes written. Returns 0 for the
+// surrogate range D800-DFFF and for values of 0x110000 and above.
+inline size_t to_utf8(int code, char *buff) {
+  if (code < 0x0080) {
+    buff[0] = (code & 0x7F);
+    return 1;
+  }
+
+  if (code < 0x0800) {
+    buff[0] = (0xC0 | ((code >> 6) & 0x1F));
+    buff[1] = (0x80 | (code & 0x3F));
+    return 2;
+  }
+
+  // D800 - DFFF is invalid...
+  if (code >= 0xD800 && code < 0xE000) { return 0; }
+
+  if (code < 0x10000) {
+    buff[0] = (0xE0 | ((code >> 12) & 0xF));
+    buff[1] = (0x80 | ((code >> 6) & 0x3F));
+    buff[2] = (0x80 | (code & 0x3F));
+    return 3;
+  }
+
+  if (code < 0x110000) {
+    buff[0] = (0xF0 | ((code >> 18) & 0x7));
+    buff[1] = (0x80 | ((code >> 12) & 0x3F));
+    buff[2] = (0x80 | ((code >> 6) & 0x3F));
+    buff[3] = (0x80 | (code & 0x3F));
+    return 4;
+  }
+
+  // Beyond the Unicode range.
+  return 0;
+}
+
+// NOTE: This code is based on the following Stack Overflow post:
+// https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c
+// Returns the standard Base64 encoding of `in`, padded with '=' to a
+// multiple of four characters.
+inline std::string base64_encode(const std::string &in) {
+  static const auto lookup =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+  std::string out;
+  // Every 3 input bytes become 4 output characters (including padding).
+  out.reserve(((in.size() + 2) / 3) * 4);
+
+  // The accumulator is unsigned: it is shifted left on every byte and never
+  // masked, which would overflow a signed int (undefined behaviour) after a
+  // few bytes. Only the low bits are ever read, so wrap-around is harmless.
+  unsigned int val = 0;
+  int valb = -6;
+
+  for (uint8_t c : in) {
+    val = (val << 8) + c;
+    valb += 8;
+    while (valb >= 0) {
+      out.push_back(lookup[(val >> valb) & 0x3F]);
+      valb -= 6;
+    }
+  }
+
+  // Flush the remaining 2 or 4 bits, left-aligned in a final sextet.
+  if (valb > -6) { out.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]); }
+
+  while (out.size() % 4) {
+    out.push_back('=');
+  }
+
+  return out;
+}
+
+// Returns true when `path` can be stat()'ed and is a regular file.
+inline bool is_file(const std::string &path) {
+  struct stat info;
+  if (stat(path.c_str(), &info) < 0) { return false; }
+  return S_ISREG(info.st_mode);
+}
+
+// Returns true when `path` can be stat()'ed and is a directory.
+inline bool is_dir(const std::string &path) {
+  struct stat info;
+  if (stat(path.c_str(), &info) < 0) { return false; }
+  return S_ISDIR(info.st_mode);
+}
+
+// Returns false when the '/'-separated `path` uses ".." to climb above its
+// starting level, true otherwise. "." components and repeated slashes are
+// ignored.
+inline bool is_valid_path(const std::string &path) {
+  const auto total = path.size();
+  size_t depth = 0;
+  size_t pos = 0;
+
+  for (;;) {
+    // Skip any run of slashes before the next component.
+    while (pos < total && path[pos] == '/') {
+      ++pos;
+    }
+    if (pos >= total) { break; }
+
+    // Measure the component.
+    const auto start = pos;
+    while (pos < total && path[pos] != '/') {
+      ++pos;
+    }
+    const auto length = pos - start;
+    assert(length > 0);
+
+    if (path.compare(start, length, "..") == 0) {
+      if (depth == 0) { return false; }
+      --depth;
+    } else if (path.compare(start, length, ".") != 0) {
+      ++depth;
+    }
+  }
+
+  return true;
+}
+
+// Reads the whole file at `path` into `out` in binary mode.
+// If the file cannot be opened or its size cannot be determined, `out` is
+// left empty instead of resizing to size_t(-1), which would throw.
+inline void read_file(const std::string &path, std::string &out) {
+  out.clear();
+
+  std::ifstream fs(path, std::ios_base::binary);
+  if (!fs) { return; }
+
+  fs.seekg(0, std::ios_base::end);
+  const std::streamoff size = fs.tellg(); // -1 on failure
+  if (size <= 0) { return; }
+
+  fs.seekg(0);
+  out.resize(static_cast<size_t>(size));
+  fs.read(&out[0], static_cast<std::streamsize>(size));
+}
+
+// Returns the trailing alphanumeric extension of `path` without the dot, or
+// an empty string when `path` does not end in ".<alnum>+".
+inline std::string file_extension(const std::string &path) {
+  const std::regex pattern("\\.([a-zA-Z0-9]+)$");
+  std::smatch found;
+  return std::regex_search(path, found, pattern) ? found[1].str()
+                                                 : std::string();
+}
+
+// Splits the character range starting at `b` on delimiter `d` and calls
+// `fn(begin, end)` for each piece. The range ends at `e`, or at the first
+// '\0' when `e` is null. An empty range produces no calls.
+template <class Fn> void split(const char *b, const char *e, char d, Fn fn) {
+  int pos = 0;
+  int start = 0;
+
+  const auto at_end = [&]() { return e ? (b + pos == e) : (b[pos] == '\0'); };
+
+  for (; !at_end(); ++pos) {
+    if (b[pos] != d) { continue; }
+    fn(&b[start], &b[pos]);
+    start = pos + 1;
+  }
+
+  // Emit the trailing piece (possibly empty) for any non-empty input.
+  if (pos != 0) { fn(&b[start], &b[pos]); }
+}
+
+// NOTE: until the read size reaches `fixed_buffer_size`, `fixed_buffer` is
+// used to store the data. The caller can allocate it on the stack for
+// performance.
+// Reads one line at a time from a Stream. A line is stored in the
+// caller-supplied fixed buffer while it fits, and spills over into a
+// std::string once it does not.
+class stream_line_reader {
+public:
+  stream_line_reader(Stream &strm, char *fixed_buffer, size_t fixed_buffer_size)
+      : strm_(strm), fixed_buffer_(fixed_buffer),
+        fixed_buffer_size_(fixed_buffer_size) {}
+
+  // Pointer to the current line's characters.
+  const char *ptr() const {
+    return growable_buffer_.empty() ? fixed_buffer_ : growable_buffer_.data();
+  }
+
+  // Length of the current line.
+  size_t size() const {
+    return growable_buffer_.empty() ? fixed_buffer_used_size_
+                                    : growable_buffer_.size();
+  }
+
+  // Reads bytes one at a time up to and including the next '\n'.
+  // Returns false on a read error, or when the stream ends before any byte
+  // was read; a partial last line without '\n' still returns true.
+  bool getline() {
+    fixed_buffer_used_size_ = 0;
+    growable_buffer_.clear();
+
+    size_t received = 0;
+    for (;;) {
+      char ch;
+      const auto n = strm_.read(&ch, 1);
+
+      if (n < 0) { return false; }
+      if (n == 0) { return received != 0; }
+
+      append(ch);
+      ++received;
+
+      if (ch == '\n') { return true; }
+    }
+  }
+
+private:
+  void append(char c) {
+    if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) {
+      // Still room in the fixed buffer; keep it NUL-terminated.
+      fixed_buffer_[fixed_buffer_used_size_++] = c;
+      fixed_buffer_[fixed_buffer_used_size_] = '\0';
+      return;
+    }
+
+    // Fixed buffer is full: move its contents over on first overflow.
+    if (growable_buffer_.empty()) {
+      assert(fixed_buffer_[fixed_buffer_used_size_] == '\0');
+      growable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_);
+    }
+    growable_buffer_ += c;
+  }
+
+  Stream &strm_;
+  char *fixed_buffer_;
+  const size_t fixed_buffer_size_;
+  size_t fixed_buffer_used_size_ = 0;
+  std::string growable_buffer_;
+};
+
+// Closes `sock` with the platform's close call and returns its result.
+inline int close_socket(socket_t sock) {
+#ifdef _WIN32
+  const auto rc = closesocket(sock);
+#else
+  const auto rc = close(sock);
+#endif
+  return rc;
+}
+
+// Waits up to `sec` seconds plus `usec` microseconds for `sock` to become
+// readable. Returns the raw result of poll() or select().
+inline int select_read(socket_t sock, time_t sec, time_t usec) {
+#ifdef CPPHTTPLIB_USE_POLL
+  struct pollfd waiter;
+  waiter.fd = sock;
+  waiter.events = POLLIN;
+
+  // poll() takes its timeout in milliseconds.
+  const auto timeout_ms = static_cast<int>(sec * 1000 + usec / 1000);
+
+  return poll(&waiter, 1, timeout_ms);
+#else
+  fd_set readable;
+  FD_ZERO(&readable);
+  FD_SET(sock, &readable);
+
+  timeval limit;
+  limit.tv_sec = static_cast<long>(sec);
+  limit.tv_usec = static_cast<long>(usec);
+
+  return select(static_cast<int>(sock + 1), &readable, nullptr, nullptr,
+                &limit);
+#endif
+}
+
+// Waits up to `sec` seconds plus `usec` microseconds for `sock` to become
+// readable or writable, then checks SO_ERROR. Returns true only when the
+// socket became ready and reports no pending error.
+inline bool wait_until_socket_is_ready(socket_t sock, time_t sec, time_t usec) {
+  // True when SO_ERROR can be queried and is zero.
+  const auto no_pending_error = [sock]() -> bool {
+    int error = 0;
+    socklen_t len = sizeof(error);
+    return getsockopt(sock, SOL_SOCKET, SO_ERROR,
+                      reinterpret_cast<char *>(&error), &len) >= 0 &&
+           !error;
+  };
+
+#ifdef CPPHTTPLIB_USE_POLL
+  struct pollfd waiter;
+  waiter.fd = sock;
+  waiter.events = POLLIN | POLLOUT;
+
+  const auto timeout_ms = static_cast<int>(sec * 1000 + usec / 1000);
+
+  if (poll(&waiter, 1, timeout_ms) <= 0) { return false; }
+  if (!(waiter.revents & (POLLIN | POLLOUT))) { return false; }
+  return no_pending_error();
+#else
+  fd_set readable;
+  FD_ZERO(&readable);
+  FD_SET(sock, &readable);
+
+  auto writable = readable;
+  auto exceptional = readable;
+
+  timeval limit;
+  limit.tv_sec = static_cast<long>(sec);
+  limit.tv_usec = static_cast<long>(usec);
+
+  if (select(static_cast<int>(sock + 1), &readable, &writable, &exceptional,
+             &limit) <= 0) {
+    return false;
+  }
+  if (!(FD_ISSET(sock, &readable) || FD_ISSET(sock, &writable))) {
+    return false;
+  }
+  return no_pending_error();
+#endif
+}
+
+// Runs `callback(stream, last_connection, connection_close)` on `sock` up to
+// `keep_alive_max_count` times, then closes the socket and returns the last
+// callback result (false if the callback never ran).
+//
+// When `is_client_request` is false, each iteration first waits for the
+// socket to become readable within the keep-alive timeout. The loop stops
+// early when the callback fails or sets `connection_close`.
+template <typename T>
+inline bool process_and_close_socket(bool is_client_request, socket_t sock,
+                                     size_t keep_alive_max_count,
+                                     time_t read_timeout_sec,
+                                     time_t read_timeout_usec, T callback) {
+  assert(keep_alive_max_count > 0);
+
+  bool ok = false;
+
+  if (keep_alive_max_count <= 1) {
+    // Single-shot connection: always the last one.
+    SocketStream strm(sock, read_timeout_sec, read_timeout_usec);
+    auto ignored_connection_close = false;
+    ok = callback(strm, true, ignored_connection_close);
+  } else {
+    for (auto remaining = keep_alive_max_count;
+         remaining > 0 &&
+         (is_client_request ||
+          detail::select_read(sock, CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND,
+                              CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0);
+         --remaining) {
+      SocketStream strm(sock, read_timeout_sec, read_timeout_usec);
+      const auto last_connection = remaining == 1;
+      auto connection_close = false;
+
+      ok = callback(strm, last_connection, connection_close);
+      if (!ok || connection_close) { break; }
+    }
+  }
+
+  close_socket(sock);
+  return ok;
+}
+
+// Shuts down both directions of `sock` and returns shutdown()'s result.
+inline int shutdown_socket(socket_t sock) {
+#ifdef _WIN32
+  const int how = SD_BOTH;
+#else
+  const int how = SHUT_RDWR;
+#endif
+  return shutdown(sock, how);
+}
+
+// Resolves `host`:`port` with getaddrinfo() and, for each candidate address,
+// creates a stream socket and hands it to `fn(sock, addrinfo)` (which binds
+// or connects). Returns the first socket for which `fn` succeeds, or
+// INVALID_SOCKET when resolution fails or no candidate works.
+//
+// `socket_flags` is passed through as the getaddrinfo() `ai_flags` hint.
+template <typename Fn>
+socket_t create_socket(const char *host, int port, Fn fn,
+                       int socket_flags = 0) {
+#ifdef _WIN32
+#define SO_SYNCHRONOUS_NONALERT 0x20
+#define SO_OPENTYPE 0x7008
+
+  int opt = SO_SYNCHRONOUS_NONALERT;
+  setsockopt(INVALID_SOCKET, SOL_SOCKET, SO_OPENTYPE, (char *)&opt,
+             sizeof(opt));
+#endif
+
+  // Get address info
+  struct addrinfo hints;
+  struct addrinfo *result;
+
+  memset(&hints, 0, sizeof(struct addrinfo));
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+  hints.ai_flags = socket_flags;
+  hints.ai_protocol = 0;
+
+  auto service = std::to_string(port);
+
+  if (getaddrinfo(host, service.c_str(), &hints, &result)) {
+    return INVALID_SOCKET;
+  }
+
+  for (auto rp = result; rp; rp = rp->ai_next) {
+    // Create a socket
+#ifdef _WIN32
+    auto sock = WSASocketW(rp->ai_family, rp->ai_socktype, rp->ai_protocol,
+                           nullptr, 0, WSA_FLAG_NO_HANDLE_INHERIT);
+#else
+    auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
+#endif
+    if (sock == INVALID_SOCKET) { continue; }
+
+#ifndef _WIN32
+    if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) {
+      // Close the descriptor before trying the next address; skipping it
+      // with a bare `continue` would leak it.
+      close_socket(sock);
+      continue;
+    }
+#endif
+
+    // Make 'reuse address' option available
+    int yes = 1;
+    setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<char *>(&yes),
+               sizeof(yes));
+#ifdef SO_REUSEPORT
+    setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, reinterpret_cast<char *>(&yes),
+               sizeof(yes));
+#endif
+
+    // bind or connect
+    if (fn(sock, *rp)) {
+      freeaddrinfo(result);
+      return sock;
+    }
+
+    close_socket(sock);
+  }
+
+  freeaddrinfo(result);
+  return INVALID_SOCKET;
+}
+
+// Switches `sock` between non-blocking (true) and blocking (false) mode.
+// Failures of the underlying calls are not reported.
+inline void set_nonblocking(socket_t sock, bool nonblocking) {
+#ifdef _WIN32
+  auto mode = nonblocking ? 1UL : 0UL;
+  ioctlsocket(sock, FIONBIO, &mode);
+#else
+  const auto current = fcntl(sock, F_GETFL, 0);
+  const auto updated =
+      nonblocking ? (current | O_NONBLOCK) : (current & (~O_NONBLOCK));
+  fcntl(sock, F_SETFL, updated);
+#endif
+}
+
+inline bool is_connection_error() {
+#ifdef _WIN32
+  return WSAGetLastError() != WSAEWOULDBLOCK;
+#else
+  return errno != EINPROGRESS;
+#endif
+}
+
+// Returns the numeric address of the peer connected to `sock`, or an empty
+// string when the peer address cannot be obtained or formatted.
+inline std::string get_remote_addr(socket_t sock) {
+  struct sockaddr_storage peer;
+  socklen_t peer_len = sizeof(peer);
+
+  if (getpeername(sock, reinterpret_cast<struct sockaddr *>(&peer),
+                  &peer_len) != 0) {
+    return std::string();
+  }
+
+  std::array<char, NI_MAXHOST> text{};
+  if (getnameinfo(reinterpret_cast<struct sockaddr *>(&peer), peer_len,
+                  text.data(), text.size(), nullptr, 0, NI_NUMERICHOST) != 0) {
+    return std::string();
+  }
+
+  return text.data();
+}
+
+// Maps the file extension of `path` to a media type string, or returns
+// nullptr when the extension is not in the table. The comparison is
+// case-sensitive.
+inline const char *find_content_type(const std::string &path) {
+  struct Entry {
+    const char *ext;
+    const char *media_type;
+  };
+
+  static const Entry table[] = {
+      {"txt", "text/plain"},
+      {"html", "text/html"},
+      {"htm", "text/html"},
+      {"css", "text/css"},
+      // The registered media type for JPEG is image/jpeg, not image/jpg.
+      {"jpeg", "image/jpeg"},
+      {"jpg", "image/jpeg"},
+      {"png", "image/png"},
+      {"gif", "image/gif"},
+      {"svg", "image/svg+xml"},
+      {"ico", "image/x-icon"},
+      {"json", "application/json"},
+      {"pdf", "application/pdf"},
+      {"js", "application/javascript"},
+      {"xml", "application/xml"},
+      {"xhtml", "application/xhtml+xml"},
+  };
+
+  const auto ext = file_extension(path);
+  for (const auto &entry : table) {
+    if (ext == entry.ext) { return entry.media_type; }
+  }
+  return nullptr;
+}
+
+// Returns the reason phrase for an HTTP status code. Codes that are not
+// listed fall back to "Internal Server Error".
+inline const char *status_message(int status) {
+  switch (status) {
+  case 100: return "Continue";
+  case 101: return "Switching Protocols";
+  case 200: return "OK";
+  case 201: return "Created";
+  case 202: return "Accepted";
+  case 204: return "No Content";
+  case 206: return "Partial Content";
+  case 301: return "Moved Permanently";
+  case 302: return "Found";
+  case 303: return "See Other";
+  case 304: return "Not Modified";
+  case 307: return "Temporary Redirect";
+  case 308: return "Permanent Redirect";
+  case 400: return "Bad Request";
+  case 401: return "Unauthorized";
+  case 403: return "Forbidden";
+  case 404: return "Not Found";
+  case 405: return "Method Not Allowed";
+  case 408: return "Request Timeout";
+  case 409: return "Conflict";
+  case 413: return "Payload Too Large";
+  case 414: return "Request-URI Too Long";
+  case 415: return "Unsupported Media Type";
+  case 416: return "Range Not Satisfiable";
+  case 429: return "Too Many Requests";
+  case 501: return "Not Implemented";
+  case 502: return "Bad Gateway";
+  case 503: return "Service Unavailable";
+  case 504: return "Gateway Timeout";
+
+  default:
+  case 500: return "Internal Server Error";
+  }
+}
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+// Returns true for content types this library compresses: anything starting
+// with "text/" plus a fixed list of exact matches.
+inline bool can_compress(const std::string &content_type) {
+  if (content_type.compare(0, 5, "text/") == 0) { return true; }
+
+  static const char *const exact_types[] = {
+      "image/svg+xml",    "application/javascript", "application/json",
+      "application/xml",  "application/xhtml+xml",
+  };
+  for (const auto *type : exact_types) {
+    if (content_type == type) { return true; }
+  }
+  return false;
+}
+
+// Compresses `content` in place with zlib (windowBits 31, i.e. gzip
+// wrapper). Returns false if the deflate stream cannot be initialised, true
+// otherwise.
+inline bool compress(std::string &content) {
+  z_stream zs;
+  zs.zalloc = Z_NULL;
+  zs.zfree = Z_NULL;
+  zs.opaque = Z_NULL;
+
+  auto rc = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8,
+                         Z_DEFAULT_STRATEGY);
+  if (rc != Z_OK) { return false; }
+
+  zs.avail_in = content.size();
+  zs.next_in =
+      const_cast<Bytef *>(reinterpret_cast<const Bytef *>(content.data()));
+
+  std::string deflated;
+
+  std::array<char, 16384> chunk{};
+  for (;;) {
+    zs.avail_out = chunk.size();
+    zs.next_out = reinterpret_cast<Bytef *>(chunk.data());
+    rc = deflate(&zs, Z_FINISH);
+    assert(rc != Z_STREAM_ERROR);
+    deflated.append(chunk.data(), chunk.size() - zs.avail_out);
+    // A completely filled chunk means there may be more output pending.
+    if (zs.avail_out != 0) { break; }
+  }
+
+  assert(rc == Z_STREAM_END);
+  assert(zs.avail_in == 0);
+
+  content.swap(deflated);
+
+  deflateEnd(&zs);
+  return true;
+}
+
+// Wrapper around a zlib inflate stream. Construct once, check is_valid(),
+// then feed input through decompress(); the destructor calls inflateEnd().
+class decompressor {
+public:
+  decompressor() {
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+
+    // 15 is the value of wbits, which should be at the maximum possible value
+    // to ensure that any gzip stream can be decoded. The offset of 16 specifies
+    // that the stream to decompress will be formatted with a gzip wrapper.
+    is_valid_ = inflateInit2(&strm, 16 + 15) == Z_OK;
+  }
+
+  ~decompressor() { inflateEnd(&strm); }
+
+  // True when inflateInit2() succeeded in the constructor.
+  bool is_valid() const { return is_valid_; }
+
+  // Inflates `data_length` bytes from `data`, passing each piece of output to
+  // `callback(buffer, length)`. Returns false on a zlib data/memory error or
+  // when the callback returns false.
+  template <typename T>
+  bool decompress(const char *data, size_t data_length, T callback) {
+    int ret = Z_OK;
+
+    strm.avail_in = data_length;
+    strm.next_in = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(data));
+
+    std::array<char, 16384> buff{};
+    do {
+      strm.avail_out = buff.size();
+      strm.next_out = reinterpret_cast<Bytef*>(buff.data());
+
+      ret = inflate(&strm, Z_NO_FLUSH);
+      assert(ret != Z_STREAM_ERROR);
+      switch (ret) {
+      case Z_NEED_DICT:
+      case Z_DATA_ERROR:
+      // NOTE(review): the stream is ended here and again in the destructor.
+      case Z_MEM_ERROR: inflateEnd(&strm); return false;
+      }
+
+      if (!callback(buff.data(), buff.size() - strm.avail_out)) { return false; }
+      // A completely filled buffer means more output may be pending.
+    } while (strm.avail_out == 0);
+
+    return ret == Z_OK || ret == Z_STREAM_END;
+  }
+
+private:
+  bool is_valid_;
+  z_stream strm;
+};
+#endif
+
+// Returns true when `headers` contains at least one entry for `key`.
+inline bool has_header(const Headers &headers, const char *key) {
+  const auto it = headers.find(key);
+  const bool missing = (it == headers.end());
+  return !missing;
+}
+
+// Returns the value of the `id`-th entry (0-based) stored under `key`, or
+// `def` when `key` has fewer than `id + 1` entries.
+//
+// The lookup is limited to the entries for `key`: stepping from find() with
+// std::advance could run past end() (undefined behaviour) or land on an
+// entry that belongs to a different key.
+inline const char *get_header_value(const Headers &headers, const char *key,
+                                    size_t id = 0, const char *def = nullptr) {
+  const auto range = headers.equal_range(key);
+  auto it = range.first;
+  for (; id > 0 && it != range.second; --id) {
+    ++it;
+  }
+  if (it != range.second) { return it->second.c_str(); }
+  return def;
+}
+
+// Returns the first value stored under `key` parsed as a base-10 unsigned
+// integer with strtoull(), or `def` when the header is absent.
+inline uint64_t get_header_value_uint64(const Headers &headers, const char *key,
+                                        int def = 0) {
+  const auto found = headers.find(key);
+  if (found == headers.end()) { return def; }
+  return std::strtoull(found->second.data(), nullptr, 10);
+}
+
+// Reads header lines from `strm` until the blank "\r\n" line and stores each
+// "name: value" pair in `headers`. Lines that do not match the pattern are
+// skipped. Returns false if the stream ends or fails before the blank line.
+inline bool read_headers(Stream &strm, Headers &headers) {
+  static std::regex re(R"((.+?):\s*(.+?)\s*\r\n)");
+
+  const auto bufsiz = 2048;
+  char buf[bufsiz];
+
+  stream_line_reader line_reader(strm, buf, bufsiz);
+
+  while (true) {
+    if (!line_reader.getline()) { return false; }
+
+    const char *line = line_reader.ptr();
+    if (strcmp(line, "\r\n") == 0) { return true; }
+
+    std::cmatch fields;
+    if (!std::regex_match(line, fields, re)) { continue; }
+
+    const auto name = std::string(fields[1]);
+    const auto value = std::string(fields[2]);
+    headers.emplace(name, value);
+  }
+}
+
+// Reads exactly `len` bytes from `strm` in buffer-sized pieces, passing each
+// piece to `out`. After each piece `progress(received, len)` is called when
+// set. Returns false on a short read or when either callback returns false.
+inline bool read_content_with_length(Stream &strm, uint64_t len,
+                                     Progress progress, ContentReceiver out) {
+  char buf[CPPHTTPLIB_RECV_BUFSIZ];
+
+  for (uint64_t received = 0; received < len;) {
+    const auto remaining = static_cast<size_t>(len - received);
+    const auto n = strm.read(buf, std::min(remaining, CPPHTTPLIB_RECV_BUFSIZ));
+    if (n <= 0) { return false; }
+
+    if (!out(buf, n)) { return false; }
+
+    received += n;
+
+    if (progress && !progress(received, len)) { return false; }
+  }
+
+  return true;
+}
+
+// Reads and discards up to `len` bytes from `strm`, stopping early if a read
+// returns zero or fails.
+inline void skip_content_with_length(Stream &strm, uint64_t len) {
+  char buf[CPPHTTPLIB_RECV_BUFSIZ];
+  for (uint64_t discarded = 0; discarded < len;) {
+    const auto remaining = static_cast<size_t>(len - discarded);
+    const auto n = strm.read(buf, std::min(remaining, CPPHTTPLIB_RECV_BUFSIZ));
+    if (n <= 0) { return; }
+    discarded += n;
+  }
+}
+
+// Reads from `strm` until it reports end of stream, passing each piece to
+// `out`. Returns true at end of stream; false on a read error or when `out`
+// returns false.
+inline bool read_content_without_length(Stream &strm, ContentReceiver out) {
+  char buf[CPPHTTPLIB_RECV_BUFSIZ];
+  while (true) {
+    const auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ);
+    if (n < 0) { return false; }
+    if (n == 0) { return true; }
+    if (!out(buf, n)) { return false; }
+  }
+}
+
+// Reads a chunked body from `strm`, passing the data of each chunk to `out`.
+// Returns false on a read failure, on a malformed chunk-size line, or when
+// the line after the terminating zero-size chunk is not "\r\n".
+//
+// The chunk size comes from the peer, so it is parsed without exceptions:
+// the line must start with a hex digit and fit the integer type. std::stoi
+// would throw on garbage and silently accept a negative size.
+inline bool read_content_chunked(Stream &strm, ContentReceiver out) {
+  const auto bufsiz = 16;
+  char buf[bufsiz];
+
+  stream_line_reader line_reader(strm, buf, bufsiz);
+
+  // Parses the hexadecimal size at the start of the current line.
+  const auto parse_chunk_len = [&](unsigned long long &len) -> bool {
+    const char *line = line_reader.ptr();
+    if (!isxdigit(static_cast<unsigned char>(*line))) { return false; }
+    errno = 0;
+    char *end = nullptr;
+    len = std::strtoull(line, &end, 16);
+    return end != line && errno != ERANGE;
+  };
+
+  if (!line_reader.getline()) { return false; }
+
+  unsigned long long chunk_len = 0;
+  if (!parse_chunk_len(chunk_len)) { return false; }
+
+  while (chunk_len > 0) {
+    if (!read_content_with_length(strm, chunk_len, nullptr, out)) {
+      return false;
+    }
+
+    if (!line_reader.getline()) { return false; }
+
+    // NOTE(review): a chunk not followed by "\r\n" ends the loop and is
+    // reported as success below; kept as in the original.
+    if (strcmp(line_reader.ptr(), "\r\n")) { break; }
+
+    if (!line_reader.getline()) { return false; }
+
+    if (!parse_chunk_len(chunk_len)) { return false; }
+  }
+
+  if (chunk_len == 0) {
+    // Read the terminator line after the last chunk.
+    if (!line_reader.getline() || strcmp(line_reader.ptr(), "\r\n"))
+      return false;
+  }
+
+  return true;
+}
+
+// Returns true when the first "Transfer-Encoding" header equals "chunked",
+// compared case-insensitively.
+inline bool is_chunked_transfer_encoding(const Headers &headers) {
+  const char *encoding = get_header_value(headers, "Transfer-Encoding", 0, "");
+  return strcasecmp(encoding, "chunked") == 0;
+}
+
+// Reads the message body of `x` from `strm` and delivers it to `receiver`.
+//
+// The framing is chosen from the headers of `x`: chunked transfer encoding,
+// then Content-Length, otherwise read until end of stream. A gzip
+// Content-Encoding is inflated when zlib support is compiled in and rejected
+// with status 415 when it is not.
+//
+// On failure returns false and sets `status`: 500 if the decompressor cannot
+// be created, 413 if Content-Length exceeds `payload_max_length`, else 400.
+template <typename T>
+bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
+                  Progress progress, ContentReceiver receiver) {
+
+  ContentReceiver out = [&](const char *buf, size_t n) {
+    return receiver(buf, n);
+  };
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+  detail::decompressor inflater;
+
+  if (!inflater.is_valid()) {
+    status = 500;
+    return false;
+  }
+
+  if (x.get_header_value("Content-Encoding") == "gzip") {
+    // Route incoming data through the inflater before the receiver.
+    out = [&](const char *buf, size_t n) {
+      return inflater.decompress(
+          buf, n, [&](const char *plain, size_t plain_len) {
+            return receiver(plain, plain_len);
+          });
+    };
+  }
+#else
+  if (x.get_header_value("Content-Encoding") == "gzip") {
+    status = 415;
+    return false;
+  }
+#endif
+
+  auto ok = true;
+  auto too_large = false;
+
+  if (is_chunked_transfer_encoding(x.headers)) {
+    ok = read_content_chunked(strm, out);
+  } else if (has_header(x.headers, "Content-Length")) {
+    const auto len = get_header_value_uint64(x.headers, "Content-Length", 0);
+    if (len > payload_max_length) {
+      // Drain the oversized body so the connection stays in sync.
+      too_large = true;
+      skip_content_with_length(strm, len);
+      ok = false;
+    } else if (len > 0) {
+      ok = read_content_with_length(strm, len, progress, out);
+    }
+  } else {
+    ok = read_content_without_length(strm, out);
+  }
+
+  if (!ok) { status = too_large ? 413 : 400; }
+
+  return ok;
+}
+
+// Writes the header fields of `info`, then those in `headers`, each as
+// "Name: value\r\n", followed by the empty line that ends the header section.
+// Returns the number of bytes written, or the negative value returned by the
+// first write that failed.
+template <typename T>
+inline int write_headers(Stream &strm, const T &info, const Headers &headers) {
+  int total = 0;
+
+  for (const auto &field : info.headers) {
+    auto n = strm.write_format("%s: %s\r\n", field.first.c_str(),
+                               field.second.c_str());
+    if (n < 0) { return n; }
+    total += n;
+  }
+
+  for (const auto &field : headers) {
+    auto n = strm.write_format("%s: %s\r\n", field.first.c_str(),
+                               field.second.c_str());
+    if (n < 0) { return n; }
+    total += n;
+  }
+
+  // Blank line separating the headers from the body.
+  auto tail = strm.write("\r\n");
+  if (tail < 0) { return tail; }
+  return total + tail;
+}
+
+// Writes `length` bytes of provider-generated content, starting at `offset`,
+// to `strm`.
+//
+// `content_provider` is called repeatedly with the current offset and the
+// number of bytes still missing. It delivers data through the first callback
+// (the sink) and may abort the transfer through the second one.
+//
+// Returns the number of bytes handed to the stream, or -1 when a write failed
+// or the provider aborted.
+inline ssize_t write_content(Stream &strm,
+                             ContentProviderWithCloser content_provider,
+                             size_t offset, size_t length) {
+  size_t begin_offset = offset;
+  size_t end_offset = offset + length;
+  while (offset < end_offset) {
+    // Latched so that a failed write cannot be hidden by a later successful
+    // one when the provider calls the sink several times in one invocation.
+    auto failed = false;
+    content_provider(
+        offset, end_offset - offset,
+        [&](const char *d, size_t l) {
+          if (failed) { return; }
+          if (strm.write(d, l) < 0) {
+            failed = true;
+            return;
+          }
+          offset += l;
+        },
+        [&](void) { failed = true; });
+    if (failed) { return -1; }
+  }
+  return static_cast<ssize_t>(offset - begin_offset);
+}
+
+// Streams provider-generated content to `strm` using chunked transfer
+// encoding.
+//
+// `content_provider` is called repeatedly with the current offset. Each call
+// of its first callback (the sink) is emitted as one chunk; a zero-length sink
+// call or a call of the second callback (done) emits the terminating chunk
+// and ends the transfer.
+//
+// Returns the total number of bytes written to the stream, or -1 when any
+// write failed.
+inline ssize_t
+write_content_chunked(Stream &strm,
+                      ContentProviderWithCloser content_provider) {
+  size_t offset = 0;
+  auto data_available = true;
+  auto failed = false;
+  ssize_t total_written_length = 0;
+
+  // Every write goes through here so that byte counts accumulate (instead of
+  // the last write overwriting the previous ones) and the first failure
+  // sticks and suppresses further writes.
+  auto send = [&](const std::string &data) {
+    if (failed) { return; }
+    auto n = strm.write(data);
+    if (n < 0) {
+      failed = true;
+    } else {
+      total_written_length += n;
+    }
+  };
+
+  while (data_available && !failed) {
+    content_provider(
+        offset, 0,
+        [&](const char *d, size_t l) {
+          // Nothing may follow the terminating chunk.
+          if (!data_available) { return; }
+          data_available = l > 0;
+          offset += l;
+
+          // Emit chunked response header and footer for each chunk
+          send(from_i_to_hex(l) + "\r\n" + std::string(d, l) + "\r\n");
+        },
+        [&](void) {
+          // Avoid a second terminator if a zero-length chunk already ended
+          // the body.
+          if (!data_available) { return; }
+          data_available = false;
+          send("0\r\n\r\n");
+        });
+  }
+
+  return failed ? -1 : total_written_length;
+}
+
+// Re-issues `req` against `path` through `cli`. The follow-up request copies
+// method, headers, body and callbacks from `req` and carries a redirect
+// budget reduced by one. `res` is only overwritten when the follow-up request
+// succeeds; the result of `cli.send` is returned.
+template <typename T>
+inline bool redirect(T &cli, const Request &req, Response &res,
+                     const std::string &path) {
+  Request follow_up;
+  follow_up.path = path;
+  follow_up.method = req.method;
+  follow_up.headers = req.headers;
+  follow_up.body = req.body;
+  follow_up.redirect_count = req.redirect_count - 1;
+  follow_up.progress = req.progress;
+  follow_up.response_handler = req.response_handler;
+  follow_up.content_receiver = req.content_receiver;
+
+  Response follow_up_res;
+  auto ok = cli.send(follow_up, follow_up_res);
+  if (ok) { res = follow_up_res; }
+  return ok;
+}
+
+// Percent-encodes `s` for use in a URL. Space, '+', CR, LF, apostrophe, ',',
+// ':' and ';' are escaped, as is every byte >= 0x80 (two upper-case hex
+// digits). All other bytes are copied unchanged. Processing stops at the
+// first NUL byte.
+inline std::string encode_url(const std::string &s) {
+  static const char hex_digits[] = "0123456789ABCDEF";
+
+  std::string encoded;
+
+  for (auto i = 0; s[i]; i++) {
+    const char ch = s[i];
+    if (ch == ' ') {
+      encoded += "%20";
+    } else if (ch == '+') {
+      encoded += "%2B";
+    } else if (ch == '\r') {
+      encoded += "%0D";
+    } else if (ch == '\n') {
+      encoded += "%0A";
+    } else if (ch == '\'') {
+      encoded += "%27";
+    } else if (ch == ',') {
+      encoded += "%2C";
+    } else if (ch == ':') {
+      encoded += "%3A";
+    } else if (ch == ';') {
+      encoded += "%3B";
+    } else {
+      const auto byte = static_cast<uint8_t>(ch);
+      if (byte < 0x80) {
+        encoded += ch;
+      } else {
+        // Non-ASCII byte: emit '%' followed by its two hex digits.
+        encoded += '%';
+        encoded += hex_digits[byte >> 4];
+        encoded += hex_digits[byte & 0x0F];
+      }
+    }
+  }
+
+  return encoded;
+}
+
+// Reverses URL encoding: '+' becomes a space, "%XX" becomes the byte with that
+// hex value and "%uXXXX" becomes the UTF-8 encoding of that code unit. A '%'
+// that does not start a valid escape is copied through unchanged.
+inline std::string decode_url(const std::string &s) {
+  std::string decoded;
+  const auto size = s.size();
+
+  size_t i = 0;
+  while (i < size) {
+    const char ch = s[i];
+
+    if (ch == '+') {
+      decoded += ' ';
+    } else if (ch != '%' || i + 1 >= size) {
+      // Ordinary character, or a '%' with nothing after it.
+      decoded += ch;
+    } else if (s[i + 1] == 'u') {
+      int code = 0;
+      if (from_hex_to_i(s, i + 2, 4, code)) {
+        // 4 digits Unicode codes
+        char utf8[4];
+        size_t utf8_len = to_utf8(code, utf8);
+        if (utf8_len > 0) { decoded.append(utf8, utf8_len); }
+        i += 5; // skip 'u0000'
+      } else {
+        decoded += ch;
+      }
+    } else {
+      int code = 0;
+      if (from_hex_to_i(s, i + 1, 2, code)) {
+        // 2 digits hex codes
+        decoded += static_cast<char>(code);
+        i += 2; // skip '00'
+      } else {
+        decoded += ch;
+      }
+    }
+
+    ++i;
+  }
+
+  return decoded;
+}
+
+// Parses "key=value&key=value" text (a URL query or a form-urlencoded body)
+// and adds one entry per '&'-separated field to `params`.
+//
+// Each field is split at its FIRST '=' only, so a value may itself contain
+// '=' characters. Both the key and the value are URL-decoded. A field without
+// '=' is stored with an empty value.
+inline void parse_query_text(const std::string &s, Params &params) {
+  split(&s[0], &s[s.size()], '&', [&](const char *b, const char *e) {
+    // Locate the separator between key and value.
+    auto sep = b;
+    while (sep != e && *sep != '=') {
+      ++sep;
+    }
+
+    std::string key(b, sep);
+    std::string val;
+    if (sep != e) { val.assign(sep + 1, e); }
+
+    params.emplace(decode_url(key), decode_url(val));
+  });
+}
+
+// Extracts the "boundary" parameter from a Content-Type header value.
+//
+// Accepts both the bare form (boundary=abc) and the quoted form
+// (boundary="abc"). For the bare form the value ends at the next ';' so that
+// parameters following the boundary are not swallowed, and trailing blanks
+// are dropped.
+//
+// Returns false when there is no boundary parameter, when a quoted value is
+// not terminated, or when the resulting boundary is empty.
+inline bool parse_multipart_boundary(const std::string &content_type,
+                                     std::string &boundary) {
+  auto pos = content_type.find("boundary=");
+  if (pos == std::string::npos) { return false; }
+
+  pos += 9; // strlen("boundary=")
+
+  if (pos < content_type.size() && content_type[pos] == '"') {
+    // Quoted value: everything up to the closing quote.
+    auto end = content_type.find('"', pos + 1);
+    if (end == std::string::npos) { return false; }
+    boundary = content_type.substr(pos + 1, end - pos - 1);
+  } else {
+    // Bare value: stop at the next parameter separator, if any.
+    auto end = content_type.find(';', pos);
+    boundary = content_type.substr(
+        pos, end == std::string::npos ? std::string::npos : end - pos);
+
+    auto last = boundary.find_last_not_of(" \t");
+    boundary.erase(last == std::string::npos ? 0 : last + 1);
+  }
+
+  return !boundary.empty();
+}
+
+// Parses a multipart/form-data `body` delimited by `boundary` and records
+// every part in `files`, keyed by the `name` of its Content-Disposition
+// header. The part data is not copied: each entry stores the offset and
+// length of the data inside `body`, plus the filename and content type taken
+// from the part headers.
+//
+// Returns false as soon as the expected structure is violated: the body does
+// not start with "--" + boundary, or a CRLF / closing delimiter that must
+// follow cannot be found.
+inline bool parse_multipart_formdata(const std::string &boundary,
+                                     const std::string &body,
+                                     MultipartFiles &files) {
+  static std::string dash = "--";
+  static std::string crlf = "\r\n";
+
+  static std::regex re_content_type("Content-Type: (.*?)$",
+                                    std::regex_constants::icase);
+
+  static std::regex re_content_disposition(
+      "Content-Disposition: form-data; name=\"(.*?)\"(?:; filename=\"(.*?)\")?",
+      std::regex_constants::icase);
+
+  auto dash_boundary = dash + boundary;
+
+  // The body has to begin with the first delimiter line.
+  auto pos = body.find(dash_boundary);
+  if (pos != 0) { return false; }
+
+  pos += dash_boundary.size();
+
+  // Skip the rest of the delimiter line.
+  auto next_pos = body.find(crlf, pos);
+  if (next_pos == std::string::npos) { return false; }
+
+  pos = next_pos + crlf.size();
+
+  // One iteration per part.
+  while (pos < body.size()) {
+    next_pos = body.find(crlf, pos);
+    if (next_pos == std::string::npos) { return false; }
+
+    std::string name;
+    MultipartFile file;
+
+    auto header = body.substr(pos, (next_pos - pos));
+
+    // Part headers: one per line until an empty line (pos == next_pos).
+    while (pos != next_pos) {
+      std::smatch m;
+      if (std::regex_match(header, m, re_content_type)) {
+        file.content_type = m[1];
+      } else if (std::regex_match(header, m, re_content_disposition)) {
+        name = m[1];
+        file.filename = m[2];
+      }
+
+      pos = next_pos + crlf.size();
+
+      next_pos = body.find(crlf, pos);
+      if (next_pos == std::string::npos) { return false; }
+
+      header = body.substr(pos, (next_pos - pos));
+    }
+
+    // Step over the empty line that ends the headers.
+    pos = next_pos + crlf.size();
+
+    // The part data runs up to the CRLF that precedes the next delimiter.
+    next_pos = body.find(crlf + dash_boundary, pos);
+
+    if (next_pos == std::string::npos) { return false; }
+
+    file.offset = pos;
+    file.length = next_pos - pos;
+
+    pos = next_pos + crlf.size() + dash_boundary.size();
+
+    // Skip the remainder of the delimiter line. A delimiter line without a
+    // trailing CRLF fails here, before the current part is stored.
+    next_pos = body.find(crlf, pos);
+    if (next_pos == std::string::npos) { return false; }
+
+    files.emplace(name, file);
+
+    pos = next_pos + crlf.size();
+  }
+
+  return true;
+}
+
+// Parses a "Range: bytes=a-b, c-d, ..." header value and appends one
+// (first, last) pair per range to `ranges`. An omitted bound is stored as -1.
+// Returns false when the value does not have the expected form, when a range
+// has first > last, or when a number cannot be converted.
+inline bool parse_range_header(const std::string &s, Ranges &ranges) {
+  try {
+    static auto re_first_range =
+        std::regex(R"(bytes=(\d*-\d*(?:,\s*\d*-\d*)*))");
+    std::smatch whole;
+    if (!std::regex_match(s, whole, re_first_range)) { return false; }
+
+    // Capture group 1 is the comma separated list of range specs.
+    auto spec_pos = whole.position(1);
+    auto spec_len = whole.length(1);
+
+    detail::split(
+        &s[spec_pos], &s[spec_pos + spec_len], ',',
+        [&](const char *b, const char *e) {
+          static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))");
+          std::cmatch part;
+          if (!std::regex_match(b, e, part, re_another_range)) { return; }
+
+          const auto first_text = part.str(1);
+          const ssize_t first =
+              first_text.empty()
+                  ? -1
+                  : static_cast<ssize_t>(std::stoll(first_text));
+
+          const auto last_text = part.str(2);
+          const ssize_t last =
+              last_text.empty() ? -1
+                                : static_cast<ssize_t>(std::stoll(last_text));
+
+          // Only a fully specified range can be checked for ordering.
+          if (first != -1 && last != -1 && first > last) {
+            throw std::runtime_error("invalid range error");
+          }
+          ranges.emplace_back(std::make_pair(first, last));
+        });
+    return true;
+  } catch (...) { return false; }
+}
+
+// Returns a lower-cased copy of the characters in [beg, end).
+inline std::string to_lower(const char *beg, const char *end) {
+  std::string out;
+  for (auto it = beg; it != end; ++it) {
+    // tolower() requires a value representable as unsigned char (or EOF);
+    // passing a negative plain char, i.e. any byte >= 0x80 where char is
+    // signed, is undefined behaviour.
+    out += static_cast<char>(::tolower(static_cast<unsigned char>(*it)));
+  }
+  return out;
+}
+
+// Builds a multipart boundary string: a fixed prefix followed by 16 random
+// characters drawn from [0-9A-Za-z].
+inline std::string make_multipart_data_boundary() {
+  static const char alphabet[] =
+      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+  std::random_device entropy;
+  std::mt19937 rng(entropy());
+
+  std::string boundary = "--cpp-httplib-multipart-data-";
+
+  // sizeof includes the terminating NUL, hence the "- 1".
+  for (auto remaining = 16; remaining > 0; --remaining) {
+    boundary += alphabet[rng() % (sizeof(alphabet) - 1)];
+  }
+
+  return boundary;
+}
+
+// Resolves range number `index` of `req` against content of `content_length`
+// bytes and returns it as (offset, length).
+//
+// A bound of -1 means "not given":
+//   (-1, -1)  the whole content
+//   (-1,  n)  the last n bytes (suffix range)
+//   ( a, -1)  from byte a to the end
+// Bounds are clamped to the content: a last position at or beyond the end
+// means "up to the last byte", and a suffix longer than the content selects
+// the whole content. A range that starts at or beyond the end yields
+// (content_length, 0), so the result never points outside the content.
+inline std::pair<size_t, size_t>
+get_range_offset_and_length(const Request &req, size_t content_length,
+                            size_t index) {
+  auto r = req.ranges[index];
+  const auto total = static_cast<ssize_t>(content_length);
+
+  if (r.first == -1 && r.second == -1) {
+    return std::make_pair(0, content_length);
+  }
+
+  if (r.first == -1) {
+    // Suffix range: r.second is a byte count, not a position.
+    r.first = r.second >= total ? 0 : total - r.second;
+    r.second = total - 1;
+  }
+
+  if (r.second == -1 || r.second >= total) { r.second = total - 1; }
+
+  if (r.first > r.second) {
+    return std::pair<size_t, size_t>(content_length, 0);
+  }
+
+  return std::make_pair(r.first, r.second - r.first + 1);
+}
+
+// Formats a Content-Range value "bytes <first>-<last>/<total>" for the range
+// that starts at `offset` and spans `length` bytes of `content_length`.
+inline std::string make_content_range_header_field(size_t offset, size_t length,
+                                                   size_t content_length) {
+  const auto first = std::to_string(offset);
+  const auto last = std::to_string(offset + length - 1);
+  const auto total = std::to_string(content_length);
+  return "bytes " + first + "-" + last + "/" + total;
+}
+
+// Walks over the multipart/byteranges representation of `res` for the ranges
+// requested in `req` and reports every piece to a callback:
+//   stoken(std::string)     - computed text (boundary, header values)
+//   ctoken(const char *)    - fixed literals
+//   content(offset, length) - the payload of one range; returning false
+//                             aborts the walk
+// Returns false when `content` aborted, true otherwise.
+//
+// The total content size is taken from the in-memory body when there is one
+// and from `res.content_length` otherwise, so that responses served through a
+// content provider (whose body is empty) get correct offsets and
+// Content-Range headers.
+template <typename SToken, typename CToken, typename Content>
+bool process_multipart_ranges_data(const Request &req, Response &res,
+                                   const std::string &boundary,
+                                   const std::string &content_type,
+                                   SToken stoken, CToken ctoken,
+                                   Content content) {
+  const size_t total_length =
+      res.body.empty() ? res.content_length : res.body.size();
+
+  for (size_t i = 0; i < req.ranges.size(); i++) {
+    // Delimiter line opening this part.
+    ctoken("--");
+    stoken(boundary);
+    ctoken("\r\n");
+    if (!content_type.empty()) {
+      ctoken("Content-Type: ");
+      stoken(content_type);
+      ctoken("\r\n");
+    }
+
+    auto offsets = detail::get_range_offset_and_length(req, total_length, i);
+    auto offset = offsets.first;
+    auto length = offsets.second;
+
+    ctoken("Content-Range: ");
+    stoken(make_content_range_header_field(offset, length, total_length));
+    ctoken("\r\n");
+    ctoken("\r\n");
+    if (!content(offset, length)) { return false; }
+    ctoken("\r\n");
+  }
+
+  // Closing delimiter.
+  ctoken("--");
+  stoken(boundary);
+  ctoken("--\r\n");
+
+  return true;
+}
+
+// Renders the complete multipart/byteranges body for `req` into a string,
+// taking the range payloads from `res.body`.
+inline std::string make_multipart_ranges_data(const Request &req, Response &res,
+                                              const std::string &boundary,
+                                              const std::string &content_type) {
+  std::string rendered;
+
+  auto append_text = [&](const std::string &token) { rendered += token; };
+  auto append_literal = [&](const char *token) { rendered += token; };
+  auto append_slice = [&](size_t offset, size_t length) {
+    rendered += res.body.substr(offset, length);
+    return true;
+  };
+
+  process_multipart_ranges_data(req, res, boundary, content_type, append_text,
+                                append_literal, append_slice);
+
+  return rendered;
+}
+
+// Computes the size in bytes of the multipart/byteranges body for `req`
+// without building it: token lengths and range lengths are simply added up.
+inline size_t
+get_multipart_ranges_data_length(const Request &req, Response &res,
+                                 const std::string &boundary,
+                                 const std::string &content_type) {
+  size_t total = 0;
+
+  auto count_text = [&](const std::string &token) { total += token.size(); };
+  auto count_literal = [&](const char *token) { total += strlen(token); };
+  auto count_slice = [&](size_t /*offset*/, size_t length) {
+    total += length;
+    return true;
+  };
+
+  process_multipart_ranges_data(req, res, boundary, content_type, count_text,
+                                count_literal, count_slice);
+
+  return total;
+}
+
+// Writes the multipart/byteranges body for `req` to `strm`, pulling the range
+// payloads from the content provider of `res`. Returns false when writing a
+// range payload fails; the results of the delimiter and header writes are
+// not checked.
+inline bool write_multipart_ranges_data(Stream &strm, const Request &req,
+                                        Response &res,
+                                        const std::string &boundary,
+                                        const std::string &content_type) {
+  auto write_text = [&](const std::string &token) { strm.write(token); };
+  auto write_literal = [&](const char *token) { strm.write(token); };
+  auto write_slice = [&](size_t offset, size_t length) {
+    return detail::write_content(strm, res.content_provider, offset, length) >=
+           0;
+  };
+
+  return process_multipart_ranges_data(req, res, boundary, content_type,
+                                       write_text, write_literal, write_slice);
+}
+
+// Resolves range number `index` of `req` against the content length announced
+// by `res` and returns it as (offset, length).
+//
+// Delegates to the content-length overload so that suffix ranges and open
+// ranges (a bound of -1) are interpreted the same way everywhere, instead of
+// turning a -1 start into a huge unsigned offset.
+inline std::pair<size_t, size_t>
+get_range_offset_and_length(const Request &req, const Response &res,
+                            size_t index) {
+  const size_t content_length = res.content_length;
+  return get_range_offset_and_length(req, content_length, index);
+}
+
+#ifdef _WIN32
+// Windows only: ties Winsock start-up and clean-up to the lifetime of an
+// object. The constructor calls WSAStartup and the destructor calls
+// WSACleanup.
+class WSInit {
+public:
+  WSInit() {
+    WSADATA wsaData;
+    // The result of WSAStartup is not checked here.
+    WSAStartup(0x0002, &wsaData);
+  }
+
+  ~WSInit() { WSACleanup(); }
+};
+
+// File-scope instance: start-up runs during static initialisation and
+// clean-up during static destruction.
+static WSInit wsinit_;
+#endif
+
+} // namespace detail
+
+// Header utilities
+// Builds a "Range" request header from `ranges`, e.g. "bytes=0-99, 200-".
+// A bound of -1 is left out of the text.
+inline std::pair<std::string, std::string> make_range_header(Ranges ranges) {
+  std::string field = "bytes=";
+  auto is_first = true;
+  for (auto r : ranges) {
+    // Separate consecutive ranges.
+    if (!is_first) { field += ", "; }
+    is_first = false;
+
+    if (r.first != -1) { field += std::to_string(r.first); }
+    field += '-';
+    if (r.second != -1) { field += std::to_string(r.second); }
+  }
+  return std::make_pair("Range", field);
+}
+
+// Builds an "Authorization" header for HTTP Basic authentication: the word
+// "Basic" followed by the base64 encoding of "username:password".
+inline std::pair<std::string, std::string>
+make_basic_authentication_header(const std::string &username,
+                                 const std::string &password) {
+  const auto credentials = username + ":" + password;
+  const auto field = "Basic " + detail::base64_encode(credentials);
+  return std::make_pair("Authorization", field);
+}
+
+// Request implementation
+// True when the request carries at least one header named `key`.
+inline bool Request::has_header(const char *key) const {
+  return detail::has_header(headers, key);
+}
+
+// Returns the value of the `id`-th header named `key`, or an empty string
+// (the default passed to the helper) when there is none.
+inline std::string Request::get_header_value(const char *key, size_t id) const {
+  return detail::get_header_value(headers, key, id, "");
+}
+
+// Number of headers named `key`.
+inline size_t Request::get_header_value_count(const char *key) const {
+  auto r = headers.equal_range(key);
+  return std::distance(r.first, r.second);
+}
+
+// Adds a header; existing headers with the same name are left in place.
+inline void Request::set_header(const char *key, const char *val) {
+  headers.emplace(key, val);
+}
+
+// Same as above for a std::string value.
+inline void Request::set_header(const char *key, const std::string &val) {
+  headers.emplace(key, val);
+}
+
+// True when the request has at least one parameter named `key`.
+inline bool Request::has_param(const char *key) const {
+  return params.find(key) != params.end();
+}
+
+// Returns the value of the `id`-th parameter named `key`, or an empty string
+// when the key is missing or has fewer than `id + 1` values.
+inline std::string Request::get_param_value(const char *key, size_t id) const {
+  // Stay inside the range of entries for `key`: advancing an iterator
+  // obtained from find() could step past end() or into another key's values.
+  auto range = params.equal_range(key);
+  auto it = range.first;
+  for (size_t i = 0; i < id && it != range.second; ++i) {
+    ++it;
+  }
+  if (it != range.second) { return it->second; }
+  return std::string();
+}
+
+// Number of parameters named `key`.
+inline size_t Request::get_param_value_count(const char *key) const {
+  auto r = params.equal_range(key);
+  return std::distance(r.first, r.second);
+}
+
+// True when the request has a multipart file entry named `key`.
+inline bool Request::has_file(const char *key) const {
+  return files.find(key) != files.end();
+}
+
+// Returns the multipart file entry named `key`, or a default-constructed
+// MultipartFile when there is none.
+inline MultipartFile Request::get_file_value(const char *key) const {
+  auto it = files.find(key);
+  if (it != files.end()) { return it->second; }
+  return MultipartFile();
+}
+
+// Response implementation
+// True when the response carries at least one header named `key`.
+inline bool Response::has_header(const char *key) const {
+  return headers.find(key) != headers.end();
+}
+
+// Returns the value of the `id`-th header named `key`, or an empty string
+// (the default passed to the helper) when there is none.
+inline std::string Response::get_header_value(const char *key,
+                                              size_t id) const {
+  return detail::get_header_value(headers, key, id, "");
+}
+
+// Number of headers named `key`.
+inline size_t Response::get_header_value_count(const char *key) const {
+  auto r = headers.equal_range(key);
+  return std::distance(r.first, r.second);
+}
+
+// Adds a header; existing headers with the same name are left in place.
+inline void Response::set_header(const char *key, const char *val) {
+  headers.emplace(key, val);
+}
+
+// Same as above for a std::string value.
+inline void Response::set_header(const char *key, const std::string &val) {
+  headers.emplace(key, val);
+}
+
+// Turns the response into a redirect: adds a "Location" header with `url` and
+// sets the status to 302.
+inline void Response::set_redirect(const char *url) {
+  set_header("Location", url);
+  status = 302;
+}
+
+// Uses the first `n` bytes at `s` as the body and adds a "Content-Type"
+// header with `content_type`.
+inline void Response::set_content(const char *s, size_t n,
+                                  const char *content_type) {
+  body.assign(s, n);
+  set_header("Content-Type", content_type);
+}
+
+// Uses a copy of `s` as the body and adds a "Content-Type" header with
+// `content_type`.
+inline void Response::set_content(const std::string &s,
+                                  const char *content_type) {
+  body = s;
+  set_header("Content-Type", content_type);
+}
+
+// Registers a callback that produces a body of known size. `length` must be
+// greater than zero (checked with assert). `provider` is wrapped so that it
+// fits the internal four-argument provider signature; the extra "done"
+// argument is ignored. `resource_releaser` is stored alongside it.
+inline void Response::set_content_provider(
+    size_t length,
+    std::function<void(size_t offset, size_t length, DataSink sink)> provider,
+    std::function<void()> resource_releaser) {
+  assert(length > 0);
+  content_length = length;
+  content_provider = [provider](size_t offset, size_t length, DataSink sink,
+                                Done) { provider(offset, length, sink); };
+  content_provider_resource_releaser = resource_releaser;
+}
+
+// Registers a callback that produces a body of unknown size. The content
+// length is set to 0 and `provider` is wrapped so that it fits the internal
+// four-argument provider signature; the length argument is dropped.
+// `resource_releaser` is stored alongside it.
+inline void Response::set_chunked_content_provider(
+    std::function<void(size_t offset, DataSink sink, Done done)> provider,
+    std::function<void()> resource_releaser) {
+  content_length = 0;
+  content_provider = [provider](size_t offset, size_t, DataSink sink,
+                                Done done) { provider(offset, sink, done); };
+  content_provider_resource_releaser = resource_releaser;
+}
+
+// Stream implementation
+// Formats `fmt` with `args` printf-style and writes the result to the stream.
+//
+// A 2048 byte stack buffer is tried first; when the text does not fit, a heap
+// buffer is doubled until it does. Returns the result of write(), or the
+// non-positive formatter result when formatting produced nothing or failed.
+//
+// NOTE(review): on the pre-2015 MSVC branch _snprintf_s reports truncation
+// as a negative value, so over-long text is presumably rejected there rather
+// than retried with a larger buffer - confirm on that toolchain.
+template <typename... Args>
+inline int Stream::write_format(const char *fmt, const Args &... args) {
+  std::array<char, 2048> buf;
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  // _snprintf_s needs the raw character buffer and its size.
+  auto n = _snprintf_s(buf.data(), buf.size(), buf.size() - 1, fmt, args...);
+#else
+  auto n = snprintf(buf.data(), buf.size() - 1, fmt, args...);
+#endif
+  if (n <= 0) { return n; }
+
+  if (n >= static_cast<int>(buf.size()) - 1) {
+    // The text was truncated: retry with a growing heap buffer.
+    std::vector<char> growable_buf(buf.size());
+
+    while (n >= static_cast<int>(growable_buf.size() - 1)) {
+      growable_buf.resize(growable_buf.size() * 2);
+#if defined(_MSC_VER) && _MSC_VER < 1900
+      n = _snprintf_s(&growable_buf[0], growable_buf.size(),
+                      growable_buf.size() - 1, fmt, args...);
+#else
+      n = snprintf(&growable_buf[0], growable_buf.size() - 1, fmt, args...);
+#endif
+    }
+    return write(&growable_buf[0], n);
+  } else {
+    return write(buf.data(), n);
+  }
+}
+
+// Socket stream implementation
+// Wraps the socket `sock`. The two timeout values are stored and used by
+// read().
+inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec,
+                                  time_t read_timeout_usec)
+    : sock_(sock), read_timeout_sec_(read_timeout_sec),
+      read_timeout_usec_(read_timeout_usec) {}
+
+inline SocketStream::~SocketStream() {}
+
+// Waits for the socket to become readable within the configured timeout and
+// then receives up to `size` bytes into `ptr`. Returns the result of recv(),
+// or -1 when the wait did not report readable data.
+inline int SocketStream::read(char *ptr, size_t size) {
+  if (detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0) {
+    return recv(sock_, ptr, static_cast<int>(size), 0);
+  }
+  return -1;
+}
+
+// Sends `size` bytes from `ptr` with a single send() call and returns its
+// result.
+inline int SocketStream::write(const char *ptr, size_t size) {
+  return send(sock_, ptr, static_cast<int>(size), 0);
+}
+
+// Sends a NUL-terminated string.
+inline int SocketStream::write(const char *ptr) {
+  return write(ptr, strlen(ptr));
+}
+
+// Sends the contents of `s`.
+inline int SocketStream::write(const std::string &s) {
+  return write(s.data(), s.size());
+}
+
+// Address of the peer, as reported by detail::get_remote_addr for the socket.
+inline std::string SocketStream::get_remote_addr() const {
+  return detail::get_remote_addr(sock_);
+}
+
+// Buffer stream implementation
+// Copies up to `size` bytes from the internal buffer into `ptr` and returns
+// the number of bytes copied.
+// NOTE(review): no read position is passed, so every call copies from the
+// start of the buffer - confirm that repeated reads are not expected to
+// advance.
+inline int BufferStream::read(char *ptr, size_t size) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return static_cast<int>(buffer._Copy_s(ptr, size, size));
+#else
+  return static_cast<int>(buffer.copy(ptr, size));
+#endif
+}
+
+// Appends `size` bytes from `ptr` to the internal buffer and returns `size`.
+inline int BufferStream::write(const char *ptr, size_t size) {
+  buffer.append(ptr, size);
+  return static_cast<int>(size);
+}
+
+// Appends a NUL-terminated string.
+inline int BufferStream::write(const char *ptr) {
+  return write(ptr, strlen(ptr));
+}
+
+// Appends the contents of `s`.
+inline int BufferStream::write(const std::string &s) {
+  return write(s.data(), s.size());
+}
+
+// A buffer has no peer, so the remote address is always empty.
+inline std::string BufferStream::get_remote_addr() const { return ""; }
+
+// Read-only access to everything written so far.
+inline const std::string &BufferStream::get_buffer() const { return buffer; }
+
+// HTTP server implementation
+// Creates a server that is not running yet: limits and timeouts start from
+// the CPPHTTPLIB_* compile-time defaults and there is no listening socket.
+inline Server::Server()
+    : keep_alive_max_count_(CPPHTTPLIB_KEEPALIVE_MAX_COUNT),
+      read_timeout_sec_(CPPHTTPLIB_READ_TIMEOUT_SECOND),
+      read_timeout_usec_(CPPHTTPLIB_READ_TIMEOUT_USECOND),
+      payload_max_length_(CPPHTTPLIB_PAYLOAD_MAX_LENGTH), is_running_(false),
+      svr_sock_(INVALID_SOCKET) {
+#ifndef _WIN32
+  // Ignore SIGPIPE for the whole process (non-Windows builds only).
+  signal(SIGPIPE, SIG_IGN);
+#endif
+  // Factory for the task queue; the implementation is selected at compile
+  // time from CPPHTTPLIB_THREAD_POOL_COUNT:
+  //   > 0  ThreadPool with that many threads
+  //   == 0 Threads
+  //   < 0  NoThread
+  new_task_queue = [] {
+#if CPPHTTPLIB_THREAD_POOL_COUNT > 0
+    return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT);
+#elif CPPHTTPLIB_THREAD_POOL_COUNT == 0
+    return new Threads();
+#else
+    return new NoThread();
+#endif
+  };
+}
+
+inline Server::~Server() {}
+
+// Route registration. Each function compiles `pattern` into a std::regex,
+// appends it together with `handler` to the list for that HTTP method and
+// returns *this so that calls can be chained.
+
+// Registers a handler for GET requests.
+inline Server &Server::Get(const char *pattern, Handler handler) {
+  get_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a handler for POST requests.
+inline Server &Server::Post(const char *pattern, Handler handler) {
+  post_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a POST handler of the content-reader kind.
+inline Server &Server::Post(const char *pattern,
+                            HandlerWithContentReader handler) {
+  post_handlers_for_content_reader.push_back(
+      std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a handler for PUT requests.
+inline Server &Server::Put(const char *pattern, Handler handler) {
+  put_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a PUT handler of the content-reader kind.
+inline Server &Server::Put(const char *pattern,
+                           HandlerWithContentReader handler) {
+  put_handlers_for_content_reader.push_back(
+      std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a handler for PATCH requests.
+inline Server &Server::Patch(const char *pattern, Handler handler) {
+  patch_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a PATCH handler of the content-reader kind.
+inline Server &Server::Patch(const char *pattern,
+                             HandlerWithContentReader handler) {
+  patch_handlers_for_content_reader.push_back(
+      std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a handler for DELETE requests.
+inline Server &Server::Delete(const char *pattern, Handler handler) {
+  delete_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Registers a handler for OPTIONS requests.
+inline Server &Server::Options(const char *pattern, Handler handler) {
+  options_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+  return *this;
+}
+
+// Serves the files below `dir` under the URL prefix `mount_point` ("/" when
+// null). Returns false, without registering anything, when `dir` is not a
+// directory or the mount point does not start with '/'.
+inline bool Server::set_base_dir(const char *dir, const char *mount_point) {
+  if (!detail::is_dir(dir)) { return false; }
+
+  std::string mnt = mount_point ? mount_point : "/";
+  if (mnt.empty() || mnt[0] != '/') { return false; }
+
+  base_dirs_.emplace_back(mnt, dir);
+  return true;
+}
+
+// Installs the handler that handle_file_request() calls after a static file
+// has been loaded into the response.
+inline void Server::set_file_request_handler(Handler handler) {
+  file_request_handler_ = std::move(handler);
+}
+
+// Installs the handler that write_response() calls for responses with a
+// status of 400 or above.
+inline void Server::set_error_handler(Handler handler) {
+  error_handler_ = std::move(handler);
+}
+
+// Installs the logger that write_response() calls after a response has been
+// written.
+inline void Server::set_logger(Logger logger) { logger_ = std::move(logger); }
+
+// Stores the keep-alive request limit.
+inline void Server::set_keep_alive_max_count(size_t count) {
+  keep_alive_max_count_ = count;
+}
+
+// Stores the read timeout as seconds plus microseconds.
+inline void Server::set_read_timeout(time_t sec, time_t usec) {
+  read_timeout_sec_ = sec;
+  read_timeout_usec_ = usec;
+}
+
+// Stores the payload size limit that is passed to detail::read_content.
+inline void Server::set_payload_max_length(size_t length) {
+  payload_max_length_ = length;
+}
+
+// Binds to `host`:`port`. Returns false when bind_internal reports a negative
+// value.
+inline bool Server::bind_to_port(const char *host, int port, int socket_flags) {
+  return !(bind_internal(host, port, socket_flags) < 0);
+}
+
+// Binds to `host` on port 0 and returns whatever bind_internal reports.
+inline int Server::bind_to_any_port(const char *host, int socket_flags) {
+  const int any_port = 0;
+  return bind_internal(host, any_port, socket_flags);
+}
+
+// Starts listening on a socket that has already been bound.
+inline bool Server::listen_after_bind() { return listen_internal(); }
+
+// Binds to `host`:`port` and, if that worked, starts listening.
+inline bool Server::listen(const char *host, int port, int socket_flags) {
+  if (!bind_to_port(host, port, socket_flags)) { return false; }
+  return listen_internal();
+}
+
+// Reports the running flag.
+inline bool Server::is_running() const { return is_running_; }
+
+// Stops a running server: the listening socket is swapped for INVALID_SOCKET
+// and the previous socket is shut down and closed. Does nothing when the
+// server is not running.
+inline void Server::stop() {
+  if (!is_running_) { return; }
+
+  assert(svr_sock_ != INVALID_SOCKET);
+  socket_t sock = svr_sock_.exchange(INVALID_SOCKET);
+  detail::shutdown_socket(sock);
+  detail::close_socket(sock);
+}
+
+// Parses an HTTP request line ("METHOD target HTTP/1.x\r\n") into `req`:
+// method, raw target, URL-decoded path, version and, when a query string is
+// present, the query parameters. Returns false when the line does not match.
+inline bool Server::parse_request_line(const char *s, Request &req) {
+  static std::regex re(
+      "(GET|HEAD|POST|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH|PRI) "
+      "(([^?]+)(?:\\?(.*?))?) (HTTP/1\\.[01])\r\n");
+
+  std::cmatch m;
+  if (!std::regex_match(s, m, re)) { return false; }
+
+  req.method = std::string(m[1]);
+  req.target = std::string(m[2]);
+  req.path = detail::decode_url(m[3]);
+  req.version = std::string(m[5]);
+
+  // Parse query text
+  if (m[4].length() > 0) { detail::parse_query_text(m[4], req.params); }
+
+  return true;
+}
+
+// Writes the complete response `res` for `req` to `strm`: status line,
+// headers and, unless the request method is HEAD, the body.
+//
+// Before writing, the response is completed: the error handler runs for
+// statuses of 400 and above, Connection / Content-Type / Accept-Ranges
+// headers are filled in, requested byte ranges are applied (one range as a
+// sliced body or provider window, several ranges as multipart/byteranges),
+// the body is gzip-compressed when zlib support is compiled in and the
+// client accepts it, and Content-Length or chunked Transfer-Encoding is set.
+//
+// `last_connection` forces "Connection: close". Returns false when writing
+// to the stream fails; the logger only runs after a successful write.
+inline bool Server::write_response(Stream &strm, bool last_connection,
+                                   const Request &req, Response &res) {
+  assert(res.status != -1);
+
+  if (400 <= res.status && error_handler_) { error_handler_(req, res); }
+
+  // Response line
+  // The stream writers report failure with a negative value, so their result
+  // is compared against zero; `!result` would treat -1 as success.
+  if (strm.write_format("HTTP/1.1 %d %s\r\n", res.status,
+                        detail::status_message(res.status)) <= 0) {
+    return false;
+  }
+
+  // Headers
+  if (last_connection || req.get_header_value("Connection") == "close") {
+    res.set_header("Connection", "close");
+  }
+
+  if (!last_connection && req.get_header_value("Connection") == "Keep-Alive") {
+    res.set_header("Connection", "Keep-Alive");
+  }
+
+  if (!res.has_header("Content-Type")) {
+    res.set_header("Content-Type", "text/plain");
+  }
+
+  if (!res.has_header("Accept-Ranges")) {
+    res.set_header("Accept-Ranges", "bytes");
+  }
+
+  std::string content_type;
+  std::string boundary;
+
+  if (req.ranges.size() > 1) {
+    // Several ranges: the response becomes multipart/byteranges and the
+    // original content type moves into the individual parts.
+    boundary = detail::make_multipart_data_boundary();
+
+    auto it = res.headers.find("Content-Type");
+    if (it != res.headers.end()) {
+      content_type = it->second;
+      res.headers.erase(it);
+    }
+
+    res.headers.emplace("Content-Type",
+                        "multipart/byteranges; boundary=" + boundary);
+  }
+
+  if (res.body.empty()) {
+    if (res.content_length > 0) {
+      // Body comes from a content provider of known size.
+      size_t length = 0;
+      if (req.ranges.empty()) {
+        length = res.content_length;
+      } else if (req.ranges.size() == 1) {
+        auto offsets =
+            detail::get_range_offset_and_length(req, res.content_length, 0);
+        auto offset = offsets.first;
+        length = offsets.second;
+        auto content_range = detail::make_content_range_header_field(
+            offset, length, res.content_length);
+        res.set_header("Content-Range", content_range);
+      } else {
+        length = detail::get_multipart_ranges_data_length(req, res, boundary,
+                                                          content_type);
+      }
+      res.set_header("Content-Length", std::to_string(length));
+    } else {
+      if (res.content_provider) {
+        // Provider of unknown size: stream it in chunks.
+        res.set_header("Transfer-Encoding", "chunked");
+      } else {
+        res.set_header("Content-Length", "0");
+      }
+    }
+  } else {
+    if (req.ranges.empty()) {
+      ;
+    } else if (req.ranges.size() == 1) {
+      auto offsets =
+          detail::get_range_offset_and_length(req, res.body.size(), 0);
+      auto offset = offsets.first;
+      auto length = offsets.second;
+      auto content_range = detail::make_content_range_header_field(
+          offset, length, res.body.size());
+      res.set_header("Content-Range", content_range);
+      res.body = res.body.substr(offset, length);
+    } else {
+      res.body =
+          detail::make_multipart_ranges_data(req, res, boundary, content_type);
+    }
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+    // TODO: 'Accept-Encoding' has gzip, not gzip;q=0
+    const auto &encodings = req.get_header_value("Accept-Encoding");
+    if (encodings.find("gzip") != std::string::npos &&
+        detail::can_compress(res.get_header_value("Content-Type"))) {
+      if (detail::compress(res.body)) {
+        res.set_header("Content-Encoding", "gzip");
+      }
+    }
+#endif
+
+    auto length = std::to_string(res.body.size());
+    res.set_header("Content-Length", length);
+  }
+
+  if (detail::write_headers(strm, res, Headers()) <= 0) { return false; }
+
+  // Body
+  if (req.method != "HEAD") {
+    if (!res.body.empty()) {
+      if (strm.write(res.body) <= 0) { return false; }
+    } else if (res.content_provider) {
+      if (!write_content_with_provider(strm, req, res, boundary,
+                                       content_type)) {
+        return false;
+      }
+    }
+  }
+
+  // Log
+  if (logger_) { logger_(req, res); }
+
+  return true;
+}
+
+// Writes the body of `res` by pulling it from its content provider.
+//   no content length - chunked transfer
+//   no ranges         - the whole content
+//   exactly one range - just that window of the content
+//   several ranges    - a multipart/byteranges body
+// Returns false when writing fails.
+inline bool
+Server::write_content_with_provider(Stream &strm, const Request &req,
+                                    Response &res, const std::string &boundary,
+                                    const std::string &content_type) {
+  if (!res.content_length) {
+    return !(detail::write_content_chunked(strm, res.content_provider) < 0);
+  }
+
+  if (req.ranges.empty()) {
+    return !(detail::write_content(strm, res.content_provider, 0,
+                                   res.content_length) < 0);
+  }
+
+  if (req.ranges.size() != 1) {
+    return detail::write_multipart_ranges_data(strm, req, res, boundary,
+                                               content_type);
+  }
+
+  auto window = detail::get_range_offset_and_length(req, res.content_length, 0);
+  return !(detail::write_content(strm, res.content_provider, window.first,
+                                 window.second) < 0);
+}
+
+// Reads the request body into `req.body` and, depending on the Content-Type,
+// parses it into `req.params` (form-urlencoded) or `req.files` (multipart).
+//
+// When reading or multipart parsing fails, a response is written straight
+// away and the result of write_response() is returned; the status comes from
+// detail::read_content or is set to 400 for a bad multipart body. Returns
+// true otherwise.
+inline bool Server::read_content(Stream &strm, bool last_connection,
+                                 Request &req, Response &res) {
+  // Collects the incoming data in req.body, refusing to grow past the
+  // maximum size a string can have.
+  auto append_to_body = [&](const char *buf, size_t n) {
+    if (req.body.size() + n > req.body.max_size()) { return false; }
+    req.body.append(buf, n);
+    return true;
+  };
+
+  if (!detail::read_content(strm, req, payload_max_length_, res.status,
+                            Progress(), append_to_body)) {
+    return write_response(strm, last_connection, req, res);
+  }
+
+  const auto &content_type = req.get_header_value("Content-Type");
+
+  // find(...) == 0 means "starts with".
+  if (content_type.find("application/x-www-form-urlencoded") == 0) {
+    detail::parse_query_text(req.body, req.params);
+    return true;
+  }
+
+  if (content_type.find("multipart/form-data") != 0) { return true; }
+
+  std::string boundary;
+  if (detail::parse_multipart_boundary(content_type, boundary) &&
+      detail::parse_multipart_formdata(boundary, req.body, req.files)) {
+    return true;
+  }
+
+  res.status = 400;
+  return write_response(strm, last_connection, req, res);
+}
+
+// Streams the request body to `receiver` instead of buffering it.
+//
+// Returns true when the body was read successfully. On failure a response
+// is written and the result of `write_response` is returned.
+inline bool
+Server::read_content_with_content_receiver(Stream &strm, bool last_connection,
+                                           Request &req, Response &res,
+                                           ContentReceiver receiver) {
+  auto forward = [&](const char *buf, size_t n) { return receiver(buf, n); };
+
+  if (detail::read_content(strm, req, payload_max_length_, res.status,
+                           Progress(), forward)) {
+    return true;
+  }
+
+  return write_response(strm, last_connection, req, res);
+}
+
+// Tries to serve `req.path` as a static file from one of the mounted
+// directories in `base_dirs_`.
+//
+// For each mount point that prefixes the request path, the remainder is
+// validated, mapped under the base directory ("index.html" is appended to
+// paths ending in '/'), and, if it names a file, loaded into `res.body`
+// with status 200. Returns true when a file was served, false otherwise.
+inline bool Server::handle_file_request(Request &req, Response &res) {
+  for (const auto &entry : base_dirs_) {
+    const auto &mount_point = entry.first;
+    const auto &base_dir = entry.second;
+
+    // Only mount points that are a prefix of the path are candidates.
+    if (req.path.find(mount_point) != 0) { continue; }
+
+    std::string sub_path = "/" + req.path.substr(mount_point.size());
+    if (!detail::is_valid_path(sub_path)) { continue; }
+
+    auto path = base_dir + sub_path;
+    if (path.back() == '/') { path += "index.html"; }
+
+    if (!detail::is_file(path)) { continue; }
+
+    detail::read_file(path, res.body);
+
+    auto type = detail::find_content_type(path);
+    if (type) { res.set_header("Content-Type", type); }
+
+    res.status = 200;
+    if (file_request_handler_) { file_request_handler_(req, res); }
+    return true;
+  }
+
+  return false;
+}
+
+// Creates a socket for `host`:`port`, binds it and puts it in listening
+// mode. The result of `detail::create_socket` is returned unchanged.
+inline socket_t Server::create_server_socket(const char *host, int port,
+                                             int socket_flags) const {
+  auto bind_and_listen = [](socket_t sock, struct addrinfo &ai) -> bool {
+    if (::bind(sock, ai.ai_addr, static_cast<int>(ai.ai_addrlen)) != 0) {
+      return false;
+    }
+    // 5 is the backlog: the maximum length of the pending-connection queue.
+    return ::listen(sock, 5) == 0;
+  };
+
+  return detail::create_socket(host, port, bind_and_listen, socket_flags);
+}
+
+// Creates the listening socket and reports the port it is bound to.
+//
+// With a non-zero `port` that value is returned as is. With `port == 0`
+// the port actually assigned to the socket is looked up via `getsockname`.
+// Returns -1 if the server is not valid, the socket cannot be created, the
+// lookup fails, or the address family is neither AF_INET nor AF_INET6.
+inline int Server::bind_internal(const char *host, int port, int socket_flags) {
+  if (!is_valid()) { return -1; }
+
+  svr_sock_ = create_server_socket(host, port, socket_flags);
+  if (svr_sock_ == INVALID_SOCKET) { return -1; }
+
+  // An explicit port needs no lookup.
+  if (port != 0) { return port; }
+
+  struct sockaddr_storage address;
+  socklen_t len = sizeof(address);
+  if (getsockname(svr_sock_, reinterpret_cast<struct sockaddr *>(&address),
+                  &len) == -1) {
+    return -1;
+  }
+
+  switch (address.ss_family) {
+  case AF_INET:
+    return ntohs(reinterpret_cast<struct sockaddr_in *>(&address)->sin_port);
+  case AF_INET6:
+    return ntohs(
+        reinterpret_cast<struct sockaddr_in6 *>(&address)->sin6_port);
+  default:
+    return -1;
+  }
+}
+
+// Accept loop: waits for incoming connections on `svr_sock_` and hands each
+// accepted socket to the task queue.
+//
+// The loop ends when `svr_sock_` becomes INVALID_SOCKET (closed by the
+// 'stop' method) or when `accept` fails for a reason other than EMFILE.
+// Returns false only in the latter case, after closing the server socket.
+inline bool Server::listen_internal() {
+  bool ok = true;
+  is_running_ = true;
+
+  {
+    std::unique_ptr<TaskQueue> task_queue(new_task_queue());
+
+    // The server socket is invalidated by the 'stop' method.
+    while (svr_sock_ != INVALID_SOCKET) {
+      // Wait up to 100000 usec; 0 means the wait timed out, so re-check the
+      // socket and wait again.
+      if (detail::select_read(svr_sock_, 0, 100000) == 0) { continue; }
+
+      socket_t sock = accept(svr_sock_, nullptr, nullptr);
+
+      if (sock != INVALID_SOCKET) {
+        task_queue->enqueue([=]() { process_and_close_socket(sock); });
+        continue;
+      }
+
+      if (errno == EMFILE) {
+        // The per-process limit of open file descriptors has been reached.
+        // Try to accept new connections after a short sleep.
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        continue;
+      }
+
+      // If the socket is still valid this is a genuine failure; otherwise
+      // the server socket was closed by the user.
+      if (svr_sock_ != INVALID_SOCKET) {
+        detail::close_socket(svr_sock_);
+        ok = false;
+      }
+      break;
+    }
+
+    task_queue->shutdown();
+  }
+
+  is_running_ = false;
+  return ok;
+}
+
+// Picks the handler for `req` and runs it.
+//
+// Order of precedence: static files (GET only), content-reader handlers
+// (POST/PUT/PATCH, which receive the body as a stream), then the regular
+// handlers, for which POST/PUT/PATCH/PRI bodies are first read into
+// `req.body`. Returns true when a handler served the request. A method
+// that none of the branches recognise sets status 400 and returns false.
+inline bool Server::routing(Request &req, Response &res, Stream &strm, bool last_connection) {
+  const auto &method = req.method;
+  const bool is_post = method == "POST";
+  const bool is_put = method == "PUT";
+  const bool is_patch = method == "PATCH";
+  const bool has_streamable_body = is_post || is_put || is_patch;
+
+  // File handler
+  if (method == "GET" && handle_file_request(req, res)) { return true; }
+
+  // Content reader handler
+  if (has_streamable_body) {
+    ContentReader content_reader = [&](ContentReceiver receiver) {
+      return read_content_with_content_receiver(strm, last_connection, req, res, receiver);
+    };
+
+    auto &reader_handlers =
+        is_post ? post_handlers_for_content_reader
+                : (is_put ? put_handlers_for_content_reader
+                          : patch_handlers_for_content_reader);
+
+    if (dispatch_request_for_content_reader(req, res, content_reader,
+                                            reader_handlers)) {
+      return true;
+    }
+  }
+
+  // Read content into `req.body`
+  if (has_streamable_body || method == "PRI") {
+    if (!read_content(strm, last_connection, req, res)) { return false; }
+  }
+
+  // Regular handler
+  if (method == "GET" || method == "HEAD") {
+    return dispatch_request(req, res, get_handlers_);
+  }
+  if (is_post) { return dispatch_request(req, res, post_handlers_); }
+  if (is_put) { return dispatch_request(req, res, put_handlers_); }
+  if (method == "DELETE") {
+    return dispatch_request(req, res, delete_handlers_);
+  }
+  if (method == "OPTIONS") {
+    return dispatch_request(req, res, options_handlers_);
+  }
+  if (is_patch) { return dispatch_request(req, res, patch_handlers_); }
+
+  res.status = 400;
+  return false;
+}
+
+// Runs the first handler in `handlers` whose pattern matches the whole of
+// `req.path`; the regex captures are stored in `req.matches`.
+// Returns true if a handler ran, false if no pattern matched.
+inline bool Server::dispatch_request(Request &req, Response &res,
+                                     Handlers &handlers) {
+  for (auto it = handlers.begin(); it != handlers.end(); ++it) {
+    if (!std::regex_match(req.path, req.matches, it->first)) { continue; }
+
+    it->second(req, res);
+    return true;
+  }
+  return false;
+}
+
+// Same lookup as for regular handlers, but the matching handler also
+// receives `content_reader` so it can pull the request body itself.
+// Returns true if a handler ran, false if no pattern matched.
+inline bool
+Server::dispatch_request_for_content_reader(Request &req, Response &res,
+                                            ContentReader content_reader,
+                                            HandersForContentReader &handlers) {
+  for (auto it = handlers.begin(); it != handlers.end(); ++it) {
+    if (!std::regex_match(req.path, req.matches, it->first)) { continue; }
+
+    it->second(req, res, content_reader);
+    return true;
+  }
+  return false;
+}
+
+// Reads one HTTP request from `strm`, routes it and writes the response.
+//
+// Parameters:
+//   strm              - connection stream to read from and write to.
+//   last_connection   - forwarded to `write_response`.
+//   connection_close  - set to true when the request asks for the
+//                       connection to be closed ("Connection: close", or
+//                       HTTP/1.0 without "Connection: Keep-Alive").
+//   setup_request     - optional hook invoked on the parsed request before
+//                       routing.
+//
+// Error responses generated here: 414 when the request line exceeds
+// CPPHTTPLIB_REQUEST_URI_MAX_LENGTH, 400 when the request line or headers
+// cannot be parsed, 416 when the Range header is malformed.
+//
+// Returns false when no request line could be read (connection closed by
+// the client); otherwise the result of `write_response`.
+inline bool
+Server::process_request(Stream &strm, bool last_connection,
+                        bool &connection_close,
+                        const std::function<void(Request &)>& setup_request) {
+  std::array<char, 2048> buf{};
+
+  detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
+
+  // Connection has been closed on client
+  if (!line_reader.getline()) { return false; }
+
+  Request req;
+  Response res;
+
+  res.version = "HTTP/1.1";
+
+  // Check if the request URI doesn't exceed the limit
+  if (line_reader.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
+    // Consume the headers so they are not left unread on the stream.
+    Headers dummy;
+    detail::read_headers(strm, dummy);
+    res.status = 414;
+    return write_response(strm, last_connection, req, res);
+  }
+
+  // Request line and headers
+  if (!parse_request_line(line_reader.ptr(), req) ||
+      !detail::read_headers(strm, req.headers)) {
+    res.status = 400;
+    return write_response(strm, last_connection, req, res);
+  }
+
+  if (req.get_header_value("Connection") == "close") {
+    connection_close = true;
+  }
+
+  if (req.version == "HTTP/1.0" &&
+      req.get_header_value("Connection") != "Keep-Alive") {
+    connection_close = true;
+  }
+
+  req.set_header("REMOTE_ADDR", strm.get_remote_addr());
+
+  if (req.has_header("Range")) {
+    const auto &range_header_value = req.get_header_value("Range");
+    if (!detail::parse_range_header(range_header_value, req.ranges)) {
+      // A malformed Range header must not be applied. Drop whatever was
+      // parsed before the failure so the error response is written without
+      // any range, and report "Range Not Satisfiable".
+      req.ranges.clear();
+      res.status = 416;
+      return write_response(strm, last_connection, req, res);
+    }
+  }
+
+  if (setup_request) { setup_request(req); }
+
+  // Routing: fill in a default status when the handler did not set one.
+  if (routing(req, res, strm, last_connection)) {
+    if (res.status == -1) { res.status = req.ranges.empty() ? 200 : 206; }
+  } else {
+    if (res.status == -1) { res.status = 404; }
+  }
+
+  return write_response(strm, last_connection, req, res);
+}
+
+inline bool Server::is_valid() const { return true; } // Always true here; checked by bind_internal before a socket is created.
+
+// Serves the requests arriving on an accepted socket via
+// `detail::process_and_close_socket`, using this server's keep-alive count
+// and read timeouts. Each request is handled by `process_request` without
+// a setup hook.
+inline bool Server::process_and_close_socket(socket_t sock) {
+  auto handle_request = [this](Stream &strm, bool last_connection,
+                               bool &connection_close) {
+    return process_request(strm, last_connection, connection_close, nullptr);
+  };
+
+  return detail::process_and_close_socket(
+      false, sock, keep_alive_max_count_, read_timeout_sec_, read_timeout_usec_,
+      handle_request);
+}
+
+// HTTP client implementation
+// Stores the target `host`/`port` and the connect timeout, precomputes the
+// "host:port" string used for the Host header, and initialises keep-alive
+// and read-timeout settings from the CPPHTTPLIB_* defaults. Redirect
+// following starts disabled.
+// NOTE(review): `host_and_port_` is built from `host_` and `port_`, so it
+// relies on those members being declared before it — confirm in the class.
+inline Client::Client(const char *host, int port, time_t timeout_sec)
+    : host_(host),
+      port_(port),
+      timeout_sec_(timeout_sec),
+      host_and_port_(host_ + ":" + std::to_string(port_)),
+      keep_alive_max_count_(CPPHTTPLIB_KEEPALIVE_MAX_COUNT),
+      read_timeout_sec_(CPPHTTPLIB_READ_TIMEOUT_SECOND),
+      read_timeout_usec_(CPPHTTPLIB_READ_TIMEOUT_USECOND),
+      follow_location_(false) {
+}
+
+inline Client::~Client() {} // Intentionally empty: the destructor performs no explicit cleanup.
+
+inline bool Client::is_valid() const { return true; } // Always true for the plain (non-SSL) client.
+
+// Opens a socket to `host_`:`port_`.
+//
+// The connect is issued in non-blocking mode so it can be bounded by
+// `timeout_sec_`; once connected the socket is switched back to blocking.
+// The result of `detail::create_socket` is returned unchanged.
+inline socket_t Client::create_client_socket() const {
+  auto connect_with_timeout = [=](socket_t sock,
+                                  struct addrinfo &ai) -> bool {
+    detail::set_nonblocking(sock, true);
+
+    const auto rc = connect(sock, ai.ai_addr, static_cast<int>(ai.ai_addrlen));
+
+    // A negative result is only fatal for a real connection error or if the
+    // socket does not become ready within the timeout.
+    const bool failed =
+        rc < 0 &&
+        (detail::is_connection_error() ||
+         !detail::wait_until_socket_is_ready(sock, timeout_sec_, 0));
+    if (failed) {
+      detail::close_socket(sock);
+      return false;
+    }
+
+    detail::set_nonblocking(sock, false);
+    return true;
+  };
+
+  return detail::create_socket(host_.c_str(), port_, connect_with_timeout);
+}
+
+// Reads and parses the HTTP status line ("HTTP/1.x <code> <reason>\r\n").
+//
+// On success `res.version` and `res.status` are filled in and true is
+// returned. Returns false when no line can be read or when the line is not
+// a well-formed status line, so callers never continue with an unset
+// status or version.
+inline bool Client::read_response_line(Stream &strm, Response &res) {
+  // Zero-initialised, matching the server-side request reader.
+  std::array<char, 2048> buf{};
+
+  detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
+
+  if (!line_reader.getline()) { return false; }
+
+  // The status code is exactly three digits, which also keeps std::stoi
+  // below from ever seeing an out-of-range number.
+  const static std::regex re("(HTTP/1\\.[01]) (\\d{3}) .*\r\n");
+
+  std::cmatch m;
+  if (!std::regex_match(line_reader.ptr(), m, re)) { return false; }
+
+  res.version = std::string(m[1]);
+  res.status = std::stoi(std::string(m[2]));
+
+  return true;
+}
+
+// Sends a single request on a fresh connection and stores the reply in
+// `res`.
+//
+// Returns false for an empty path, when no socket can be created, or when
+// the exchange fails. When redirect following is enabled and the status is
+// strictly between 300 and 400, the result of `redirect` is returned.
+inline bool Client::send(const Request &req, Response &res) {
+  if (req.path.empty()) { return false; }
+
+  const auto sock = create_client_socket();
+  if (sock == INVALID_SOCKET) { return false; }
+
+  auto exchange = [&](Stream &strm, bool last_connection,
+                      bool &connection_close) {
+    return process_request(strm, req, res, last_connection, connection_close);
+  };
+
+  if (!process_and_close_socket(sock, 1, exchange)) { return false; }
+
+  const bool is_redirect = 300 < res.status && res.status < 400;
+  if (follow_location_ && is_redirect) { return redirect(req, res); }
+
+  return true;
+}
+
+// Sends `requests` in order, appending each successful reply to
+// `responses`.
+//
+// Each connection is offered all requests still pending; when a connection
+// ends before they are all sent, a new one is opened for the rest. Returns
+// false as soon as a socket cannot be created or a request fails.
+inline bool Client::send(const std::vector<Request> &requests,
+                         std::vector<Response> &responses) {
+  size_t next = 0;
+
+  while (next < requests.size()) {
+    auto sock = create_client_socket();
+    if (sock == INVALID_SOCKET) { return false; }
+
+    // Invoked once per request on the current connection; consumes the
+    // next pending request each time it runs.
+    auto send_next = [&](Stream &strm, bool last_connection,
+                         bool &connection_close) -> bool {
+      const auto &req = requests[next];
+      next++;
+      Response res;
+
+      if (req.path.empty()) { return false; }
+
+      auto ok =
+          process_request(strm, req, res, last_connection, connection_close);
+
+      const bool is_redirect = 300 < res.status && res.status < 400;
+      if (ok && follow_location_ && is_redirect) { ok = redirect(req, res); }
+
+      if (ok) { responses.emplace_back(std::move(res)); }
+
+      return ok;
+    };
+
+    const auto pending = requests.size() - next;
+    if (!process_and_close_socket(sock, pending, send_next)) { return false; }
+  }
+
+  return true;
+}
+
+// Follows the redirect announced by the "location" header of `res`.
+//
+// The location is split into scheme, host and path. Missing parts are
+// inherited from the current connection: scheme from this client's own
+// scheme, host from `host_`, and an empty path becomes "/". A redirect to
+// the same scheme and host reuses this client; otherwise a new plain or
+// SSL client is created for the target.
+//
+// Returns false when no redirects are left (`req.redirect_count == 0`),
+// when the header is absent, or when an https target is requested without
+// OpenSSL support; otherwise the result of `detail::redirect`.
+// NOTE(review): a port inside the location's authority is passed on as
+// part of the host string — confirm the clients accept that.
+inline bool Client::redirect(const Request &req, Response &res) {
+  if (req.redirect_count == 0) { return false; }
+
+  auto location = res.get_header_value("location");
+  if (location.empty()) { return false; }
+
+  // scheme, authority, path+query; the fragment is dropped. Compiled once.
+  const static std::regex re(
+      R"(^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*(?:\?[^#]*)?)(?:#.*)?)");
+
+  const std::string scheme = is_ssl() ? "https" : "http";
+
+  std::smatch m;
+  if (!std::regex_match(location, m, re)) { return false; }
+
+  auto next_scheme = m[1].str();
+  auto next_host = m[2].str();
+  auto next_path = m[3].str();
+
+  // A relative location carries no scheme. Without this default it never
+  // compares equal to the current scheme and would be sent to a fresh
+  // plain-HTTP client on the default port.
+  if (next_scheme.empty()) { next_scheme = scheme; }
+  if (next_host.empty()) { next_host = host_; }
+  if (next_path.empty()) { next_path = "/"; }
+
+  if (next_scheme == scheme && next_host == host_) {
+    return detail::redirect(*this, req, res, next_path);
+  }
+
+  if (next_scheme == "https") {
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    SSLClient cli(next_host.c_str());
+    cli.follow_location(true);
+    return detail::redirect(cli, req, res, next_path);
+#else
+    return false;
+#endif
+  }
+
+  Client cli(next_host.c_str());
+  cli.follow_location(true);
+  return detail::redirect(cli, req, res, next_path);
+}
+
+// Serialises `req` and writes it to `strm`.
+//
+// The request line and headers are assembled in a buffer and flushed with
+// one write. Headers the caller did not supply are added: Connection: close
+// (when `last_connection`), Host, Accept, User-Agent, Content-Length, and
+// Content-Type for requests with a body. The body is then taken from
+// `req.body` or, if that is empty, pulled from `req.content_provider`
+// until `req.content_length` bytes have been written.
+inline void Client::write_request(Stream &strm, const Request &req,
+                                  bool last_connection) {
+  BufferStream bstrm;
+
+  // Request line
+  auto path = detail::encode_url(req.path);
+
+  bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str());
+
+  // Additional headers
+  Headers headers;
+  if (last_connection) { headers.emplace("Connection", "close"); }
+
+  if (!req.has_header("Host")) {
+    // The port is omitted from Host when it is the scheme's default.
+    const int default_port = is_ssl() ? 443 : 80;
+    if (port_ == default_port) {
+      headers.emplace("Host", host_);
+    } else {
+      headers.emplace("Host", host_and_port_);
+    }
+  }
+
+  if (!req.has_header("Accept")) { headers.emplace("Accept", "*/*"); }
+
+  if (!req.has_header("User-Agent")) {
+    headers.emplace("User-Agent", "cpp-httplib/0.2");
+  }
+
+  if (req.body.empty()) {
+    if (req.content_provider) {
+      auto length = std::to_string(req.content_length);
+      headers.emplace("Content-Length", length);
+    } else {
+      headers.emplace("Content-Length", "0");
+    }
+  } else {
+    if (!req.has_header("Content-Type")) {
+      headers.emplace("Content-Type", "text/plain");
+    }
+
+    if (!req.has_header("Content-Length")) {
+      auto length = std::to_string(req.body.size());
+      headers.emplace("Content-Length", length);
+    }
+  }
+
+  detail::write_headers(bstrm, req, headers);
+
+  // Flush buffer
+  auto &data = bstrm.get_buffer();
+  strm.write(data.data(), data.size());
+
+  // Body
+  if (req.body.empty()) {
+    if (req.content_provider) {
+      size_t offset = 0;
+      size_t end_offset = req.content_length;
+      // Cleared when the stream rejects a write. Adding a failed write's
+      // result to the unsigned `offset` would corrupt it and could keep
+      // this loop running forever.
+      bool stream_ok = true;
+      while (stream_ok && offset < end_offset) {
+        req.content_provider(offset, end_offset - offset,
+                             [&](const char *d, size_t l) {
+                               if (!stream_ok) { return; }
+                               auto written_length = strm.write(d, l);
+                               if (written_length <= 0) {
+                                 stream_ok = false;
+                                 return;
+                               }
+                               offset += static_cast<size_t>(written_length);
+                             });
+      }
+    }
+  } else {
+    strm.write(req.body);
+  }
+}
+
+// Shared implementation behind the body-carrying verbs (used by Post, Put
+// and Patch).
+//
+// Builds a request for `method`/`path` with `headers` and a Content-Type
+// header, taking the body either from `body` or from `content_provider`
+// (`content_length` bytes). With zlib support and `compress` set, the body
+// is first materialised, gzip-compressed and tagged with Content-Encoding;
+// nullptr is returned if compression fails. Without zlib support
+// `compress` is ignored.
+//
+// Returns the response, or nullptr when sending fails.
+inline std::shared_ptr<Response> Client::send_with_content_provider(
+    const char *method, const char *path, const Headers &headers,
+    const std::string &body, size_t content_length,
+    ContentProvider content_provider, const char *content_type, bool compress) {
+#ifndef CPPHTTPLIB_ZLIB_SUPPORT
+  (void)compress;
+#endif
+
+  Request req;
+  req.method = method;
+  req.headers = headers;
+  req.path = path;
+
+  // A null content type must not reach std::string construction; Delete
+  // applies the same guard.
+  if (content_type) { req.headers.emplace("Content-Type", content_type); }
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+  if (compress) {
+    if (content_provider) {
+      // Compression needs the whole body, so drain the provider first.
+      size_t offset = 0;
+      while (offset < content_length) {
+        content_provider(offset, content_length - offset,
+                         [&](const char *data, size_t data_len) {
+                           req.body.append(data, data_len);
+                           offset += data_len;
+                         });
+      }
+    } else {
+      req.body = body;
+    }
+
+    if (!detail::compress(req.body)) { return nullptr; }
+    req.headers.emplace("Content-Encoding", "gzip");
+  } else
+#endif
+  {
+    if (content_provider) {
+      req.content_length = content_length;
+      req.content_provider = content_provider;
+    } else {
+      req.body = body;
+    }
+  }
+
+  auto res = std::make_shared<Response>();
+
+  return send(req, *res) ? res : nullptr;
+}
+
+// Performs one request/response exchange on `strm`.
+//
+// Writes the request, reads the status line and headers, sets
+// `connection_close` when the reply says "Connection: close" or is
+// HTTP/1.0, lets `req.response_handler` veto the exchange, and, except for
+// HEAD, reads the body into `res.body` or into `req.content_receiver` when
+// one is set. Returns false if any step fails.
+inline bool Client::process_request(Stream &strm, const Request &req,
+                                    Response &res, bool last_connection,
+                                    bool &connection_close) {
+  // Send request
+  write_request(strm, req, last_connection);
+
+  // Receive response and headers
+  if (!read_response_line(strm, res)) { return false; }
+  if (!detail::read_headers(strm, res.headers)) { return false; }
+
+  const bool server_closes = res.get_header_value("Connection") == "close" ||
+                             res.version == "HTTP/1.0";
+  if (server_closes) { connection_close = true; }
+
+  if (req.response_handler && !req.response_handler(res)) { return false; }
+
+  // A HEAD reply carries no body.
+  if (req.method == "HEAD") { return true; }
+
+  // Body
+  ContentReceiver out;
+  if (req.content_receiver) {
+    out = [&](const char *buf, size_t n) {
+      return req.content_receiver(buf, n);
+    };
+  } else {
+    out = [&](const char *buf, size_t n) {
+      if (res.body.size() + n > res.body.max_size()) { return false; }
+      res.body.append(buf, n);
+      return true;
+    };
+  }
+
+  int dummy_status;
+  return detail::read_content(strm, res, std::numeric_limits<size_t>::max(),
+                              dummy_status, req.progress, out);
+}
+
+// Runs `callback` for up to `request_count` requests on `sock`, capped at
+// `keep_alive_max_count_`, through `detail::process_and_close_socket`.
+inline bool Client::process_and_close_socket(
+    socket_t sock, size_t request_count,
+    std::function<bool(Stream &strm, bool last_connection,
+                       bool &connection_close)>
+        callback) {
+  if (keep_alive_max_count_ < request_count) {
+    request_count = keep_alive_max_count_;
+  }
+
+  return detail::process_and_close_socket(true, sock, request_count,
+                                          read_timeout_sec_, read_timeout_usec_,
+                                          callback);
+}
+
+inline bool Client::is_ssl() const { return false; } // Plain client: selects the "http" scheme and default port 80 elsewhere in this class.
+
+// GET `path` with no extra headers and no progress callback.
+inline std::shared_ptr<Response> Client::Get(const char *path) {
+  const Headers no_headers;
+  return Get(path, no_headers, Progress());
+}
+
+// GET `path` with no extra headers, reporting progress to `progress`.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             Progress progress) {
+  const Headers no_headers;
+  return Get(path, no_headers, std::move(progress));
+}
+
+// GET `path` with `headers` and no progress callback.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             const Headers &headers) {
+  return Get(path, headers, Progress());
+}
+
+// GET `path` with `headers`, buffering the body in the returned response
+// and reporting progress to `progress`. Returns nullptr when sending fails.
+inline std::shared_ptr<Response>
+Client::Get(const char *path, const Headers &headers, Progress progress) {
+  Request req;
+  req.method = "GET";
+  req.path = path;
+  req.headers = headers;
+  req.progress = std::move(progress);
+
+  auto res = std::make_shared<Response>();
+  if (!send(req, *res)) { return nullptr; }
+  return res;
+}
+
+// GET `path`, streaming the body to `content_receiver`; no extra headers,
+// response handler or progress callback.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             ContentReceiver content_receiver) {
+  const Headers no_headers;
+  return Get(path, no_headers, nullptr, std::move(content_receiver),
+             Progress());
+}
+
+// GET `path`, streaming the body to `content_receiver` and reporting
+// progress to `progress`; no extra headers or response handler.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             ContentReceiver content_receiver,
+                                             Progress progress) {
+  const Headers no_headers;
+  return Get(path, no_headers, nullptr, std::move(content_receiver), progress);
+}
+
+// GET `path` with `headers`, streaming the body to `content_receiver`; no
+// response handler or progress callback.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             const Headers &headers,
+                                             ContentReceiver content_receiver) {
+  return Get(path, headers, nullptr, std::move(content_receiver), Progress());
+}
+
+// GET `path` with `headers`, streaming the body to `content_receiver` and
+// reporting progress to `progress`; no response handler.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             const Headers &headers,
+                                             ContentReceiver content_receiver,
+                                             Progress progress) {
+  auto result =
+      Get(path, headers, nullptr, std::move(content_receiver), progress);
+  return result;
+}
+
+// GET `path` with `headers`; `response_handler` sees the response before
+// the body is read, which is streamed to `content_receiver`. No progress
+// callback.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             const Headers &headers,
+                                             ResponseHandler response_handler,
+                                             ContentReceiver content_receiver) {
+  return Get(path, headers, std::move(response_handler), content_receiver,
+             Progress());
+}
+
+// Most general GET: `headers`, a `response_handler` consulted before the
+// body is read, a `content_receiver` for the body, and a `progress`
+// callback. Returns nullptr when sending fails.
+inline std::shared_ptr<Response> Client::Get(const char *path,
+                                             const Headers &headers,
+                                             ResponseHandler response_handler,
+                                             ContentReceiver content_receiver,
+                                             Progress progress) {
+  Request req;
+  req.method = "GET";
+  req.path = path;
+  req.headers = headers;
+  req.response_handler = std::move(response_handler);
+  req.content_receiver = std::move(content_receiver);
+  req.progress = std::move(progress);
+
+  auto res = std::make_shared<Response>();
+  if (!send(req, *res)) { return nullptr; }
+  return res;
+}
+
+// HEAD `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Head(const char *path) {
+  const Headers no_headers;
+  return Head(path, no_headers);
+}
+
+// HEAD `path` with `headers`. Returns nullptr when sending fails.
+inline std::shared_ptr<Response> Client::Head(const char *path,
+                                              const Headers &headers) {
+  Request req;
+  req.method = "HEAD";
+  req.path = path;
+  req.headers = headers;
+
+  auto res = std::make_shared<Response>();
+  if (!send(req, *res)) { return nullptr; }
+  return res;
+}
+
+// POST `body` of type `content_type` to `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Post(const char *path,
+                                              const std::string &body,
+                                              const char *content_type,
+                                              bool compress) {
+  const Headers no_headers;
+  return Post(path, no_headers, body, content_type, compress);
+}
+
+// POST `body` of type `content_type` to `path` with `headers`.
+inline std::shared_ptr<Response>
+Client::Post(const char *path, const Headers &headers, const std::string &body,
+             const char *content_type, bool compress) {
+  // The body is supplied directly, so there is no provider and no length.
+  const size_t no_length = 0;
+  return send_with_content_provider("POST", path, headers, body, no_length,
+                                    nullptr, content_type, compress);
+}
+
+// POST `params` as a url-encoded form to `path` with no extra headers.
+inline std::shared_ptr<Response>
+Client::Post(const char *path, const Params &params, bool compress) {
+  const Headers no_headers;
+  return Post(path, no_headers, params, compress);
+}
+
+// POST a body of `content_length` bytes pulled from `content_provider` to
+// `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Post(const char *path,
+                                              size_t content_length,
+                                              ContentProvider content_provider,
+                                              const char *content_type,
+                                              bool compress) {
+  const Headers no_headers;
+  return Post(path, no_headers, content_length, content_provider, content_type,
+              compress);
+}
+
+// POST a body of `content_length` bytes pulled from `content_provider` to
+// `path` with `headers`.
+inline std::shared_ptr<Response>
+Client::Post(const char *path, const Headers &headers, size_t content_length,
+             ContentProvider content_provider, const char *content_type,
+             bool compress) {
+  // The provider supplies the data, so the string body stays empty.
+  const std::string no_body;
+  return send_with_content_provider("POST", path, headers, no_body,
+                                    content_length, content_provider,
+                                    content_type, compress);
+}
+
+// POST `params` as an "application/x-www-form-urlencoded" body to `path`
+// with `headers`. Pairs are joined as key=value with '&'; only the values
+// are passed through `detail::encode_url`.
+inline std::shared_ptr<Response> Client::Post(const char *path,
+                                              const Headers &headers,
+                                              const Params &params,
+                                              bool compress) {
+  std::string query;
+  for (const auto &param : params) {
+    // After the first pair the string always holds at least "=".
+    if (!query.empty()) { query += "&"; }
+    query += param.first;
+    query += "=";
+    query += detail::encode_url(param.second);
+  }
+
+  return Post(path, headers, query, "application/x-www-form-urlencoded",
+              compress);
+}
+
+// POST `items` as multipart/form-data to `path` with no extra headers.
+inline std::shared_ptr<Response>
+Client::Post(const char *path, const MultipartFormDataItems &items,
+             bool compress) {
+  const Headers no_headers;
+  return Post(path, no_headers, items, compress);
+}
+
+// POST `items` as a multipart/form-data body to `path` with `headers`.
+//
+// Each item becomes one part with a Content-Disposition header (plus
+// filename and Content-Type when set) followed by its content; the body is
+// closed with the terminating boundary line.
+inline std::shared_ptr<Response>
+Client::Post(const char *path, const Headers &headers,
+             const MultipartFormDataItems &items, bool compress) {
+  const auto boundary = detail::make_multipart_data_boundary();
+  const std::string delimiter = "--" + boundary;
+
+  std::string body;
+
+  for (const auto &item : items) {
+    body += delimiter + "\r\n";
+
+    body += "Content-Disposition: form-data; name=\"" + item.name + "\"";
+    if (!item.filename.empty()) {
+      body += "; filename=\"" + item.filename + "\"";
+    }
+    body += "\r\n";
+
+    if (!item.content_type.empty()) {
+      body += "Content-Type: " + item.content_type + "\r\n";
+    }
+
+    // Blank line separates the part headers from the part content.
+    body += "\r\n";
+    body += item.content + "\r\n";
+  }
+
+  body += delimiter + "--\r\n";
+
+  const std::string content_type = "multipart/form-data; boundary=" + boundary;
+  return Post(path, headers, body, content_type.c_str(), compress);
+}
+
+// PUT `body` of type `content_type` to `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Put(const char *path,
+                                             const std::string &body,
+                                             const char *content_type,
+                                             bool compress) {
+  const Headers no_headers;
+  return Put(path, no_headers, body, content_type, compress);
+}
+
+// PUT `body` of type `content_type` to `path` with `headers`.
+inline std::shared_ptr<Response>
+Client::Put(const char *path, const Headers &headers, const std::string &body,
+            const char *content_type, bool compress) {
+  // The body is supplied directly, so there is no provider and no length.
+  const size_t no_length = 0;
+  return send_with_content_provider("PUT", path, headers, body, no_length,
+                                    nullptr, content_type, compress);
+}
+
+// PUT a body of `content_length` bytes pulled from `content_provider` to
+// `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Put(const char *path,
+                                             size_t content_length,
+                                             ContentProvider content_provider,
+                                             const char *content_type,
+                                             bool compress) {
+  const Headers no_headers;
+  return Put(path, no_headers, content_length, content_provider, content_type,
+             compress);
+}
+
+// PUT a body of `content_length` bytes pulled from `content_provider` to
+// `path` with `headers`.
+inline std::shared_ptr<Response>
+Client::Put(const char *path, const Headers &headers, size_t content_length,
+            ContentProvider content_provider, const char *content_type,
+            bool compress) {
+  // The provider supplies the data, so the string body stays empty.
+  const std::string no_body;
+  return send_with_content_provider("PUT", path, headers, no_body,
+                                    content_length, content_provider,
+                                    content_type, compress);
+}
+
+// PATCH `path` with `body` of type `content_type` and no extra headers.
+inline std::shared_ptr<Response> Client::Patch(const char *path,
+                                               const std::string &body,
+                                               const char *content_type,
+                                               bool compress) {
+  const Headers no_headers;
+  return Patch(path, no_headers, body, content_type, compress);
+}
+
+// PATCH `path` with `body` of type `content_type` and `headers`.
+inline std::shared_ptr<Response>
+Client::Patch(const char *path, const Headers &headers, const std::string &body,
+              const char *content_type, bool compress) {
+  // The body is supplied directly, so there is no provider and no length.
+  const size_t no_length = 0;
+  return send_with_content_provider("PATCH", path, headers, body, no_length,
+                                    nullptr, content_type, compress);
+}
+
+// PATCH `path` with a body of `content_length` bytes pulled from
+// `content_provider` and no extra headers.
+inline std::shared_ptr<Response> Client::Patch(const char *path,
+                                               size_t content_length,
+                                               ContentProvider content_provider,
+                                               const char *content_type,
+                                               bool compress) {
+  const Headers no_headers;
+  return Patch(path, no_headers, content_length, content_provider,
+               content_type, compress);
+}
+
+// PATCH `path` with a body of `content_length` bytes pulled from
+// `content_provider` and `headers`.
+inline std::shared_ptr<Response>
+Client::Patch(const char *path, const Headers &headers, size_t content_length,
+              ContentProvider content_provider, const char *content_type,
+              bool compress) {
+  // The provider supplies the data, so the string body stays empty.
+  const std::string no_body;
+  return send_with_content_provider("PATCH", path, headers, no_body,
+                                    content_length, content_provider,
+                                    content_type, compress);
+}
+
+// DELETE `path` with no headers, no body and no content type.
+inline std::shared_ptr<Response> Client::Delete(const char *path) {
+  const Headers no_headers;
+  const std::string no_body;
+  return Delete(path, no_headers, no_body, nullptr);
+}
+
+// DELETE `path` with `body` of type `content_type` and no extra headers.
+inline std::shared_ptr<Response> Client::Delete(const char *path,
+                                                const std::string &body,
+                                                const char *content_type) {
+  const Headers no_headers;
+  return Delete(path, no_headers, body, content_type);
+}
+
+// DELETE `path` with `headers`, no body and no content type.
+inline std::shared_ptr<Response> Client::Delete(const char *path,
+                                                const Headers &headers) {
+  const std::string no_body;
+  return Delete(path, headers, no_body, nullptr);
+}
+
+// DELETE `path` with `headers` and an optional `body`. A Content-Type
+// header is added only when `content_type` is non-null. Returns nullptr
+// when sending fails.
+inline std::shared_ptr<Response> Client::Delete(const char *path,
+                                                const Headers &headers,
+                                                const std::string &body,
+                                                const char *content_type) {
+  Request req;
+  req.method = "DELETE";
+  req.path = path;
+  req.headers = headers;
+  req.body = body;
+
+  if (content_type != nullptr) {
+    req.headers.emplace("Content-Type", content_type);
+  }
+
+  auto res = std::make_shared<Response>();
+  if (!send(req, *res)) { return nullptr; }
+  return res;
+}
+
+// OPTIONS `path` with no extra headers.
+inline std::shared_ptr<Response> Client::Options(const char *path) {
+  const Headers no_headers;
+  return Options(path, no_headers);
+}
+
+// OPTIONS `path` with `headers`. Returns nullptr when sending fails.
+inline std::shared_ptr<Response> Client::Options(const char *path,
+                                                 const Headers &headers) {
+  Request req;
+  req.method = "OPTIONS";
+  req.headers = headers;
+  req.path = path;
+
+  auto res = std::make_shared<Response>();
+  if (!send(req, *res)) { return nullptr; }
+  return res;
+}
+
+inline void Client::set_keep_alive_max_count(size_t count) { // Upper bound on requests sent over one connection.
+  keep_alive_max_count_ = count;
+}
+
+inline void Client::set_read_timeout(time_t sec, time_t usec) { // Read timeout = sec seconds plus usec microseconds.
+  read_timeout_sec_ = sec;
+  read_timeout_usec_ = usec;
+}
+
+inline void Client::follow_location(bool on) { follow_location_ = on; } // Enables/disables following of 3xx redirects in send().
+
+/*
+ * SSL Implementation
+ */
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+namespace detail {
+
+// Runs an SSL session on `sock` and always closes the socket afterwards.
+//
+// An SSL object is created from `ctx` (creation and destruction are
+// guarded by `ctx_mutex`), attached to the socket, prepared by `setup`,
+// and handshaken with `SSL_connect_or_accept`. `callback` is then invoked
+// once per request, up to `keep_alive_max_count` times. On the server side
+// (`is_client_request == false`) each further request must arrive within
+// the keep-alive timeout.
+//
+// Returns the result of the last `callback` invocation, or false when SSL
+// creation, `setup` or the handshake fails.
+template <typename U, typename V, typename T>
+inline bool process_and_close_socket_ssl(
+    bool is_client_request, socket_t sock, size_t keep_alive_max_count,
+    time_t read_timeout_sec, time_t read_timeout_usec, SSL_CTX *ctx,
+    std::mutex &ctx_mutex, U SSL_connect_or_accept, V setup, T callback) {
+  assert(keep_alive_max_count > 0);
+
+  SSL *ssl = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(ctx_mutex);
+    ssl = SSL_new(ctx);
+  }
+
+  if (!ssl) {
+    close_socket(sock);
+    return false;
+  }
+
+  auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
+  SSL_set_bio(ssl, bio, bio);
+
+  // Tear-down shared by the failure and the normal exit path.
+  auto release = [&]() {
+    SSL_shutdown(ssl);
+    {
+      std::lock_guard<std::mutex> guard(ctx_mutex);
+      SSL_free(ssl);
+    }
+
+    close_socket(sock);
+  };
+
+  if (!setup(ssl)) {
+    release();
+    return false;
+  }
+
+  bool ret = false;
+
+  if (SSL_connect_or_accept(ssl) == 1) {
+    if (keep_alive_max_count > 1) {
+      for (auto remaining = keep_alive_max_count; remaining > 0; --remaining) {
+        // A client sends immediately; a server waits for the next request.
+        const bool ready =
+            is_client_request ||
+            detail::select_read(sock, CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND,
+                                CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0;
+        if (!ready) { break; }
+
+        SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec);
+        auto last_connection = remaining == 1;
+        auto connection_close = false;
+
+        ret = callback(ssl, strm, last_connection, connection_close);
+        if (!ret || connection_close) { break; }
+      }
+    } else {
+      SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec);
+      auto dummy_connection_close = false;
+      ret = callback(ssl, strm, true, dummy_connection_close);
+    }
+  }
+
+  release();
+
+  return ret;
+}
+
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+static std::shared_ptr<std::vector<std::mutex>> openSSL_locks_;
+
+// Installs a mutex-based locking callback for OpenSSL builds older than
+// 1.1.0 and removes it again on destruction.
+class SSLThreadLocks {
+public:
+  SSLThreadLocks() {
+    const auto lock_count = CRYPTO_num_locks();
+    openSSL_locks_ = std::make_shared<std::vector<std::mutex>>(lock_count);
+    CRYPTO_set_locking_callback(locking_callback);
+  }
+
+  ~SSLThreadLocks() { CRYPTO_set_locking_callback(nullptr); }
+
+private:
+  // Locks or unlocks the mutex selected by `type`, depending on whether
+  // CRYPTO_LOCK is set in `mode`.
+  static void locking_callback(int mode, int type, const char * /*file*/,
+                               int /*line*/) {
+    std::mutex &target = (*openSSL_locks_)[type];
+    if ((mode & CRYPTO_LOCK) == 0) {
+      target.unlock();
+      return;
+    }
+    target.lock();
+  }
+};
+
+#endif
+
+// Performs OpenSSL library initialization in its constructor. A static
+// instance (`sslinit_`) is defined right after this class.
+class SSLInit {
+public:
+  SSLInit() {
+#if OPENSSL_VERSION_NUMBER < 0x1010001fL
+    // Older OpenSSL: explicit error-string loading and library init.
+    SSL_load_error_strings();
+    SSL_library_init();
+#else
+    OPENSSL_init_ssl(
+        OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
+#endif
+  }
+
+  ~SSLInit() {
+#if OPENSSL_VERSION_NUMBER < 0x1010001fL
+    // Releases the error strings loaded in the constructor.
+    ERR_free_strings();
+#endif
+  }
+
+private:
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+  // Installs the locking callback for the lifetime of this object.
+  SSLThreadLocks thread_init_;
+#endif
+};
+
+static SSLInit sslinit_;
+
+} // namespace detail
+
+// SSL socket stream implementation
+// Stores the socket, the SSL handle and the read timeout for later use by
+// read() and write().
+inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL *ssl,
+                                        time_t read_timeout_sec,
+                                        time_t read_timeout_usec)
+    : sock_(sock),
+      ssl_(ssl),
+      read_timeout_sec_(read_timeout_sec),
+      read_timeout_usec_(read_timeout_usec) {
+}
+
+// Nothing is released here: the destructor leaves the socket and the SSL
+// handle untouched.
+inline SSLSocketStream::~SSLSocketStream() {
+}
+
+// Reads up to `size` bytes into `ptr`.
+// Data already buffered inside OpenSSL is read immediately; otherwise the
+// socket must become readable within the configured timeout.
+// Returns the SSL_read() result, or -1 when nothing became readable.
+inline int SSLSocketStream::read(char *ptr, size_t size) {
+  const bool buffered = SSL_pending(ssl_) > 0;
+  if (!buffered &&
+      detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) <= 0) {
+    return -1;
+  }
+  return SSL_read(ssl_, ptr, static_cast<int>(size));
+}
+
+// Writes `size` bytes from `ptr` through the SSL handle and returns the
+// SSL_write() result.
+inline int SSLSocketStream::write(const char *ptr, size_t size) {
+  const int byte_count = static_cast<int>(size);
+  return SSL_write(ssl_, ptr, byte_count);
+}
+
+// Writes a NUL-terminated string (without the terminator).
+inline int SSLSocketStream::write(const char *ptr) {
+  const size_t length = strlen(ptr);
+  return write(ptr, length);
+}
+
+// Writes the full contents of `s`.
+inline int SSLSocketStream::write(const std::string &s) {
+  const char *bytes = s.data();
+  return write(bytes, s.size());
+}
+
+// Returns whatever detail::get_remote_addr() reports for the underlying
+// socket.
+inline std::string SSLSocketStream::get_remote_addr() const {
+  std::string remote = detail::get_remote_addr(sock_);
+  return remote;
+}
+
+// SSL HTTP server implementation
+// Creates the server-side SSL context.
+//
+// The certificate chain and PEM private key are loaded from `cert_path` and
+// `private_key_path`; if either fails to load the context is freed and the
+// server reports !is_valid(). When a client CA file or directory is given,
+// client certificates are required and verified against it.
+inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
+                            const char *client_ca_cert_file_path,
+                            const char *client_ca_cert_dir_path) {
+  ctx_ = SSL_CTX_new(SSLv23_server_method());
+  if (!ctx_) { return; }
+
+  SSL_CTX_set_options(ctx_,
+                      SSL_OP_ALL | SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
+                          SSL_OP_NO_COMPRESSION |
+                          SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
+
+  const bool credentials_loaded =
+      SSL_CTX_use_certificate_chain_file(ctx_, cert_path) == 1 &&
+      SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) ==
+          1;
+
+  if (!credentials_loaded) {
+    SSL_CTX_free(ctx_);
+    ctx_ = nullptr;
+    return;
+  }
+
+  const bool wants_client_certs =
+      client_ca_cert_file_path != nullptr || client_ca_cert_dir_path != nullptr;
+  if (!wants_client_certs) { return; }
+
+  SSL_CTX_load_verify_locations(ctx_, client_ca_cert_file_path,
+                                client_ca_cert_dir_path);
+
+  SSL_CTX_set_verify(ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT,
+                     nullptr);
+}
+
+// Frees the SSL context if one was created.
+inline SSLServer::~SSLServer() {
+  if (ctx_ != nullptr) {
+    SSL_CTX_free(ctx_);
+  }
+}
+
+// True when the SSL context exists (the constructor clears it on failure).
+inline bool SSLServer::is_valid() const {
+  return ctx_ != nullptr;
+}
+
+// Serves requests arriving on `sock` over TLS (handshake via SSL_accept) and
+// closes the socket afterwards. Each parsed request gets its `ssl` member set
+// to the session's SSL handle.
+inline bool SSLServer::process_and_close_socket(socket_t sock) {
+  // The server needs no per-connection setup before the handshake.
+  auto no_setup = [](SSL * /*ssl*/) { return true; };
+
+  auto handle_request = [this](SSL *ssl, Stream &strm, bool last_connection,
+                               bool &connection_close) {
+    return process_request(strm, last_connection, connection_close,
+                           [&](Request &req) { req.ssl = ssl; });
+  };
+
+  return detail::process_and_close_socket_ssl(
+      false, sock, keep_alive_max_count_, read_timeout_sec_, read_timeout_usec_,
+      ctx_, ctx_mutex_, SSL_accept, no_setup, handle_request);
+}
+
+// SSL HTTP client implementation
+// Creates the client-side SSL context and pre-splits the host name into its
+// dot-separated components for later certificate host matching.
+//
+// When both `client_cert_path` and `client_key_path` are given, the PEM
+// client certificate and key are loaded; if either fails to load the context
+// is freed and the client reports !is_valid().
+inline SSLClient::SSLClient(const char *host, int port, time_t timeout_sec,
+                            const char *client_cert_path,
+                            const char *client_key_path)
+    : Client(host, port, timeout_sec) {
+  ctx_ = SSL_CTX_new(SSLv23_client_method());
+
+  detail::split(&host_[0], &host_[host_.size()], '.',
+                [&](const char *b, const char *e) {
+                  host_components_.emplace_back(std::string(b, e));
+                });
+
+  // SSL_CTX_new() may fail; never hand a null context to OpenSSL. A null
+  // ctx_ is already reported through is_valid().
+  if (ctx_ && client_cert_path && client_key_path) {
+    if (SSL_CTX_use_certificate_file(ctx_, client_cert_path,
+                                     SSL_FILETYPE_PEM) != 1 ||
+        SSL_CTX_use_PrivateKey_file(ctx_, client_key_path, SSL_FILETYPE_PEM) !=
+            1) {
+      SSL_CTX_free(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+}
+
+// Frees the SSL context if one exists.
+inline SSLClient::~SSLClient() {
+  if (ctx_ != nullptr) {
+    SSL_CTX_free(ctx_);
+  }
+}
+
+// True when the SSL context exists (the constructor clears it on failure).
+inline bool SSLClient::is_valid() const {
+  return ctx_ != nullptr;
+}
+
+// Records the CA certificate file and/or directory. A null argument leaves
+// the corresponding stored path unchanged.
+inline void SSLClient::set_ca_cert_path(const char *ca_cert_file_path,
+                                        const char *ca_cert_dir_path) {
+  if (ca_cert_file_path != nullptr) {
+    ca_cert_file_path_ = ca_cert_file_path;
+  }
+
+  if (ca_cert_dir_path != nullptr) {
+    ca_cert_dir_path_ = ca_cert_dir_path;
+  }
+}
+
+// Turns the post-handshake server certificate and host name checks on or off.
+inline void SSLClient::enable_server_certificate_verification(bool enabled) {
+  this->server_certificate_verification_ = enabled;
+}
+
+// Returns the stored verify_result_ value.
+inline long SSLClient::get_openssl_verify_result() const {
+  const long result = verify_result_;
+  return result;
+}
+
+// Exposes the underlying SSL context (may be null when !is_valid()).
+inline SSL_CTX *SSLClient::ssl_context() const noexcept {
+  return ctx_;
+}
+
+// Runs up to `request_count` requests (clamped to keep_alive_max_count_)
+// over a TLS session on `sock`, then closes the socket.
+//
+// Returns false without touching the socket when the client has no valid
+// SSL context; otherwise returns the result of
+// detail::process_and_close_socket_ssl().
+inline bool SSLClient::process_and_close_socket(
+    socket_t sock, size_t request_count,
+    std::function<bool(Stream &strm, bool last_connection,
+                       bool &connection_close)>
+        callback) {
+
+  request_count = std::min(request_count, keep_alive_max_count_);
+
+  return is_valid() &&
+         detail::process_and_close_socket_ssl(
+             true, sock, request_count, read_timeout_sec_, read_timeout_usec_,
+             ctx_, ctx_mutex_,
+             // Handshake step: configure verification, connect, then check
+             // the peer certificate when verification is enabled.
+             [&](SSL *ssl) {
+               // NOTE(review): these settings are applied to ctx_ after the
+               // helper has already created `ssl` from it - confirm they
+               // take effect for this connection. ca_cert_dir_path_ is not
+               // read here.
+               if (ca_cert_file_path_.empty()) {
+                 SSL_CTX_set_verify(ctx_, SSL_VERIFY_NONE, nullptr);
+               } else {
+                 if (!SSL_CTX_load_verify_locations(
+                         ctx_, ca_cert_file_path_.c_str(), nullptr)) {
+                   return false;
+                 }
+                 SSL_CTX_set_verify(ctx_, SSL_VERIFY_PEER, nullptr);
+               }
+
+               if (SSL_connect(ssl) != 1) { return false; }
+
+               if (server_certificate_verification_) {
+                 // Keep the result so callers can query it through
+                 // get_openssl_verify_result().
+                 verify_result_ = SSL_get_verify_result(ssl);
+
+                 if (verify_result_ != X509_V_OK) { return false; }
+
+                 auto server_cert = SSL_get_peer_certificate(ssl);
+
+                 if (server_cert == nullptr) { return false; }
+
+                 // The certificate must also match the host we connected to.
+                 if (!verify_host(server_cert)) {
+                   X509_free(server_cert);
+                   return false;
+                 }
+                 X509_free(server_cert);
+               }
+
+               return true;
+             },
+             // Setup step (runs before the handshake): send the host name
+             // via the TLS server-name extension.
+             [&](SSL *ssl) {
+               SSL_set_tlsext_host_name(ssl, host_.c_str());
+               return true;
+             },
+             // Per-request step: forward to the caller's callback; the SSL
+             // handle is not needed on the client side.
+             [&](SSL * /*ssl*/, Stream &strm, bool last_connection,
+                 bool &connection_close) {
+               return callback(strm, last_connection, connection_close);
+             });
+}
+
+// Always true for the SSL client.
+inline bool SSLClient::is_ssl() const {
+  return true;
+}
+
+// Checks that `server_cert` identifies the host this client connects to.
+// The subjectAltName entries are tried first; the subject Common Name is the
+// fallback.
+inline bool SSLClient::verify_host(X509 *server_cert) const {
+  /* Quote from RFC2818 section 3.1 "Server Identity"
+
+     If a subjectAltName extension of type dNSName is present, that MUST
+     be used as the identity. Otherwise, the (most specific) Common Name
+     field in the Subject field of the certificate MUST be used. Although
+     the use of the Common Name is existing practice, it is deprecated and
+     Certification Authorities are encouraged to use the dNSName instead.
+
+     Matching is performed using the matching rules specified by
+     [RFC2459].  If more than one identity of a given type is present in
+     the certificate (e.g., more than one dNSName name, a match in any one
+     of the set is considered acceptable.) Names may contain the wildcard
+     character * which is considered to match any single domain name
+     component or component fragment. E.g., *.a.com matches foo.a.com but
+     not bar.foo.a.com. f*.com matches foo.com but not bar.com.
+
+     In some cases, the URI is specified as an IP address rather than a
+     hostname. In this case, the iPAddress subjectAltName must be present
+     in the certificate and must exactly match the IP in the URI.
+
+  */
+  if (verify_host_with_subject_alt_name(server_cert)) { return true; }
+
+  return verify_host_with_common_name(server_cert);
+}
+
+// Returns true when one of the certificate's subjectAltName entries matches
+// host_.
+//
+// If host_ parses as an IPv6 or IPv4 literal, only iPAddress entries are
+// considered and their raw bytes must equal the parsed address exactly.
+// Otherwise only dNSName entries are considered and matched through
+// check_host_name(). Returns false when the certificate has no
+// subjectAltName extension.
+inline bool
+SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
+  auto type = GEN_DNS;
+
+  // Binary form of host_ when it is an IP literal. addr_len stays 0 for DNS
+  // names, and only the family that actually parsed is ever stored here, so
+  // no uninitialized address bytes can take part in a comparison.
+  unsigned char addr_bytes[sizeof(struct in6_addr)] = {};
+  size_t addr_len = 0;
+
+#ifndef __MINGW32__
+  struct in6_addr addr6;
+  struct in_addr addr;
+
+  if (inet_pton(AF_INET6, host_.c_str(), &addr6) == 1) {
+    type = GEN_IPADD;
+    addr_len = sizeof(struct in6_addr);
+    memcpy(addr_bytes, &addr6, addr_len);
+  } else if (inet_pton(AF_INET, host_.c_str(), &addr) == 1) {
+    type = GEN_IPADD;
+    addr_len = sizeof(struct in_addr);
+    memcpy(addr_bytes, &addr, addr_len);
+  }
+#endif
+
+  auto alt_names = static_cast<const struct stack_st_GENERAL_NAME *>(
+      X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr));
+
+  if (!alt_names) { return false; }
+
+  auto matched = false;
+  auto count = sk_GENERAL_NAME_num(alt_names);
+
+  for (auto i = 0; i < count && !matched; i++) {
+    auto val = sk_GENERAL_NAME_value(alt_names, i);
+    if (val->type != type) { continue; }
+
+    if (type == GEN_DNS) {
+      auto name = (const char *)ASN1_STRING_get0_data(val->d.ia5);
+      auto name_len = (size_t)ASN1_STRING_length(val->d.ia5);
+
+      // Reject names with an embedded NUL so that a certificate for
+      // "good.example\0.evil.example" cannot pass as "good.example".
+      if (name && memchr(name, '\0', name_len) == nullptr) {
+        matched = check_host_name(name, name_len);
+      }
+    } else {
+      // iPAddress is raw binary and routinely contains zero bytes (e.g.
+      // 127.0.0.1), so it must be compared by length, never as a C string.
+      auto ip = ASN1_STRING_get0_data(val->d.iPAddress);
+      auto ip_len = (size_t)ASN1_STRING_length(val->d.iPAddress);
+
+      matched =
+          ip && ip_len == addr_len && memcmp(addr_bytes, ip, addr_len) == 0;
+    }
+  }
+
+  GENERAL_NAMES_free((STACK_OF(GENERAL_NAME) *)alt_names);
+
+  return matched;
+}
+
+// Matches host_ against the certificate subject's Common Name.
+// Returns false when the certificate has no subject name or no CN entry.
+inline bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
+  const auto subject_name = X509_get_subject_name(server_cert);
+  if (subject_name == nullptr) { return false; }
+
+  char name[BUFSIZ];
+  const auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName,
+                                                  name, sizeof(name));
+  if (name_len == -1) { return false; }
+
+  return check_host_name(name, name_len);
+}
+
+// Returns true when host_ matches the certificate name `pattern`
+// (`pattern_len` bytes, not required to be NUL-terminated).
+//
+// A match is either an exact string match or a component-wise match in which
+// every dot-separated pattern component is equal to the host component, is
+// "*", or ends in '*' and its remaining prefix is a prefix of the host
+// component (e.g. "f*.com" matches "foo.com" but not "bar.com"). Pattern and
+// host must have the same number of components, so "*.a.com" does not match
+// "bar.foo.a.com".
+inline bool SSLClient::check_host_name(const char *pattern,
+                                       size_t pattern_len) const {
+  // Compare exactly pattern_len bytes rather than relying on a terminator.
+  if (host_.size() == pattern_len &&
+      host_.compare(0, std::string::npos, pattern, pattern_len) == 0) {
+    return true;
+  }
+
+  // Wildcard match
+  // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
+  std::vector<std::string> pattern_components;
+  detail::split(&pattern[0], &pattern[pattern_len], '.',
+                [&](const char *b, const char *e) {
+                  pattern_components.emplace_back(std::string(b, e));
+                });
+
+  if (host_components_.size() != pattern_components.size()) { return false; }
+
+  auto itr = pattern_components.begin();
+  for (const auto &h : host_components_) {
+    const auto &p = *itr;
+    if (p != h && p != "*") {
+      // Trailing-'*' fragment: the text before the '*' must be a prefix of
+      // the host component. Comparing it against the whole component would
+      // only ever accept hosts equal to the prefix itself.
+      auto partial_match = false;
+      if (!p.empty() && p[p.size() - 1] == '*') {
+        const auto prefix_len = p.size() - 1;
+        partial_match = h.compare(0, prefix_len, p, 0, prefix_len) == 0;
+      }
+      if (!partial_match) { return false; }
+    }
+    ++itr;
+  }
+
+  return true;
+}
+#endif
+
+} // namespace httplib
+
+#endif // CPPHTTPLIB_HTTPLIB_H
diff --git a/darknet-master/src/im2col.c b/darknet-master/src/im2col.c
new file mode 100644
index 0000000..4951f8f
--- /dev/null
+++ b/darknet-master/src/im2col.c
@@ -0,0 +1,93 @@
+#include "im2col.h"
+#include <stdio.h>
+// Reads one pixel of a channel-major (channel, row, col) image, where `row`
+// and `col` are given in padded coordinates. Positions that fall into the
+// padding return 0. `channels` is not used.
+float im2col_get_pixel(float *im, int height, int width, int channels,
+                        int row, int col, int channel, int pad)
+{
+    const int src_row = row - pad;
+    const int src_col = col - pad;
+    const int inside = src_row >= 0 && src_col >= 0 &&
+                       src_row < height && src_col < width;
+
+    if (!inside) return 0;
+    return im[src_col + width*(src_row + height*channel)];
+}
+
+//From Berkeley Vision's Caffe!
+//https://github.com/BVLC/caffe/blob/master/LICENSE
+// Unrolls the sliding ksize x ksize windows of a channel-major image into
+// `data_col`, one row per (channel, kernel row, kernel column) triple and one
+// column per output position. Window taps that fall into the padding are
+// written as 0.
+void im2col_cpu(float* data_im,
+     int channels,  int height,  int width,
+     int ksize,  int stride, int pad, float* data_col)
+{
+    const int height_col = (height + 2*pad - ksize) / stride + 1;
+    const int width_col = (width + 2*pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+    int c, h, w;
+
+    for (c = 0; c < channels_col; ++c) {
+        // Decompose the output row index into kernel offset and source channel.
+        const int w_offset = c % ksize;
+        const int h_offset = (c / ksize) % ksize;
+        const int c_im = c / ksize / ksize;
+
+        for (h = 0; h < height_col; ++h) {
+            const int src_row = h_offset + h * stride - pad;
+            const int row_base = (c * height_col + h) * width_col;
+
+            for (w = 0; w < width_col; ++w) {
+                const int src_col = w_offset + w * stride - pad;
+                float pixel = 0;
+
+                if (src_row >= 0 && src_col >= 0 &&
+                    src_row < height && src_col < width) {
+                    pixel = data_im[src_col + width*(src_row + height*c_im)];
+                }
+                data_col[row_base + w] = pixel;
+            }
+        }
+    }
+}
+
+
+// Returns non-zero when 0 <= a < b, using a single comparison.
+// Casting to unsigned turns a negative `a` into a value above 0x7fff...,
+// which is larger than any `b` the callers pass (b is signed and always
+// positive), so one unsigned comparison replaces the two signed ones.
+inline static int is_a_ge_zero_and_a_lt_b(int a, int b) {
+    const unsigned ua = (unsigned)a;
+    const unsigned ub = (unsigned)b;
+    return ua < ub;
+}
+
+// https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp
+// im2col with independent kernel size, padding, stride and dilation per axis.
+//
+// For every channel, kernel row and kernel column (in that nesting order) the
+// function emits output_h * output_w values into `data_col`, written
+// sequentially. A tap whose input row or column lies outside the image is
+// written as 0.
+void im2col_cpu_ext(const float* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_col)
+{
+    // Output extent, using the dilated kernel size dilation * (kernel - 1) + 1.
+    const int output_h = (height + 2 * pad_h -
+        (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    const int output_w = (width + 2 * pad_w -
+        (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+    const int channel_size = height * width;
+    int channel, kernel_row, kernel_col, output_rows, output_col;
+    // data_im advances by one channel plane per iteration.
+    for (channel = channels; channel--; data_im += channel_size) {
+        for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+            for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+                // First input row touched by this kernel row.
+                int input_row = -pad_h + kernel_row * dilation_h;
+                for (output_rows = output_h; output_rows; output_rows--) {
+                    if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
+                        // Whole input row is outside the image: emit a row of zeros.
+                        for (output_col = output_w; output_col; output_col--) {
+                            *(data_col++) = 0;
+                        }
+                    }
+                    else {
+                        int input_col = -pad_w + kernel_col * dilation_w;
+                        for (output_col = output_w; output_col; output_col--) {
+                            if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
+                                *(data_col++) = data_im[input_row * width + input_col];
+                            }
+                            else {
+                                *(data_col++) = 0;
+                            }
+                            input_col += stride_w;
+                        }
+                    }
+                    input_row += stride_h;
+                }
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/im2col.h b/darknet-master/src/im2col.h
new file mode 100644
index 0000000..65dd6ec
--- /dev/null
+++ b/darknet-master/src/im2col.h
@@ -0,0 +1,88 @@
+#ifndef IM2COL_H
+#define IM2COL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "darknet.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Unrolls square ksize x ksize windows of `data_im` into `data_col`;
+// padded positions are written as 0.
+void im2col_cpu(float* data_im,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_col);
+// Returns the pixel at padded coordinates (row, col) of `channel`, or 0 when
+// the position lies in the padding.
+float im2col_get_pixel(float* im, int height, int width, int channels,
+    int row, int col, int channel, int pad);
+
+// im2col variant with separate kernel size, padding, stride and dilation for
+// the vertical and horizontal axes.
+void im2col_cpu_ext(const float* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_col);
+
+#ifdef GPU
+
+void im2col_ongpu(float *im,
+         int channels, int height, int width,
+         int ksize, int stride, int pad,float *data_col);
+
+void im2col_gpu_ext(const float* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_col);
+
+void im2col_align_ongpu(float *im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float *data_col, int bit_align);
+
+void im2col_align_bin_ongpu(float *im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float *data_col, int bit_align);
+
+void float_to_bit_gpu(float *src, unsigned char *dst, size_t size);
+
+void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size);
+
+void transpose_uint32_gpu(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align);
+
+void transpose_uint32_gpu_2(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align);
+
+void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c);
+
+void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int c);
+
+void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c);
+
+void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
+
+// shared_memory + partial coalescing = GOOD
+void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu);
+
+// sequentially - BAD
+void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr);
+
+void convolve_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad);
+
+void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad,
+    int new_lda, float *mean_arr_gpu);
+
+//void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu);
+
+//void convolve_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad);
+
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/im2col_kernels.cu b/darknet-master/src/im2col_kernels.cu
new file mode 100644
index 0000000..ac7ccc8
--- /dev/null
+++ b/darknet-master/src/im2col_kernels.cu
@@ -0,0 +1,2287 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <stdint.h>
+
+#include "im2col.h"
+#include "dark_cuda.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+
+// Warp shuffle wrapper: returns the value of `val` held by lane `lane`.
+// Uses __shfl_sync with FULL_MASK when CUDART_VERSION >= 9000 and the
+// legacy __shfl otherwise.
+template<typename T1, typename T2>
+__device__ inline T1 __shfl_custom(T1 val, T2 lane) {
+#if CUDART_VERSION >= 9000
+    return __shfl_sync(FULL_MASK, val, lane);
+#else
+    return __shfl(val, lane);
+#endif
+}
+
+// Warp ballot wrapper: returns the mask produced by the ballot intrinsic
+// for the predicate `val`.
+// Uses __ballot_sync with FULL_MASK when CUDART_VERSION >= 9000 and the
+// legacy __ballot otherwise.
+template<typename T>
+__device__ inline uint32_t __ballot_custom(T val) {
+#if CUDART_VERSION >= 9000
+    return __ballot_sync(FULL_MASK, val);
+#else
+    return __ballot(val);
+#endif
+}
+
+
+// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
+// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
+
+// im2col kernel: each work item `index` in [0, n) corresponds to one
+// (input channel, output row, output column) triple and writes the
+// ksize * ksize window taps for that position into `data_col`. Taps outside
+// the image are written as 0.
+__global__ void im2col_gpu_kernel(const int n, const float* data_im,
+        const int height, const int width, const int ksize,
+        const int pad,
+        const int stride,
+        const int height_col, const int width_col,
+        float *data_col) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    // A thread handles further items in steps of the total launched thread count.
+    for(; index < n; index += blockDim.x*gridDim.x){
+        // Decompose the linear index into output column, output row and channel.
+        int w_out = index % width_col;
+        int h_index = index / width_col;
+        int h_out = h_index % height_col;
+        int channel_in = h_index / height_col;
+        int channel_out = channel_in * ksize * ksize;
+        // Top-left corner of the window in (unpadded) image coordinates;
+        // may be negative inside the padding.
+        int h_in = h_out * stride - pad;
+        int w_in = w_out * stride - pad;
+        float* data_col_ptr = data_col;
+        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+        const float* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;
+        for (int i = 0; i < ksize; ++i) {
+            for (int j = 0; j < ksize; ++j) {
+                int h = h_in + i;
+                int w = w_in + j;
+
+                // The image is only read when (h, w) is inside its bounds.
+                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                    data_im_ptr[i * width + j] : 0;
+
+                //data_im[(channel_in * height + h_in) * width + w_in + i * width + j];
+                //(*data_col_ptr) = data_im_ptr[ii * width + jj];
+
+                // Next tap goes to the next output row (one plane further).
+                data_col_ptr += height_col * width_col;
+            }
+        }
+    }
+}
+
+// Host launcher for im2col_gpu_kernel on the stream returned by
+// get_cuda_stream(). One work item is created per
+// (channel, output row, output column) triple.
+void im2col_ongpu(float *im,
+         int channels, int height, int width,
+         int ksize, int stride, int pad, float *data_col){
+    const int out_h = (height + 2 * pad - ksize) / stride + 1;
+    const int out_w = (width + 2 * pad - ksize) / stride + 1;
+    const int work_items = channels * out_h * out_w;
+
+    // Enough blocks of BLOCK threads to cover every work item.
+    im2col_gpu_kernel<<<(work_items+BLOCK-1)/BLOCK,
+        BLOCK, 0, get_cuda_stream()>>>(
+                work_items, im, height, width, ksize, pad,
+                stride, out_h,
+                out_w, data_col);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+/*
+__global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
+    const int height, const int width, const int ksize,
+    const int pad,
+    const int stride,
+    const int height_col, const int width_col,
+    float *data_col, const int bit_align)
+{
+    //__shared__ float tmp_s[1];
+
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+    for (; index < n; index += blockDim.x*gridDim.x) {
+        int w_out = index % width_col;
+        int h_index = index / width_col;
+        int h_out = h_index % height_col;
+        int channel_in = h_index / height_col;
+        int channel_out = channel_in * ksize * ksize;
+        int h_in = h_out * stride - pad;
+        int w_in = w_out * stride - pad;
+        float* data_col_ptr = data_col;
+        //data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+        data_col_ptr += channel_out * bit_align + h_out * width_col + w_out;
+        float* data_col_ptr_32 = data_col + (channel_out * bit_align + h_out * width_col + w_out)/32;
+        const float* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;
+        for (int i = 0; i < ksize; ++i) {
+            for (int j = 0; j < ksize; ++j) {
+                int h = h_in + i;
+                int w = w_in + j;
+
+                float val = (h >= 0 && w >= 0 && h < height && w < width) ?
+                    data_im_ptr[i * width + j] : 0;
+
+                *data_col_ptr = val;
+                //tmp_s[0] = val;
+
+                //(*data_col_ptr) = (h >= 0 && w >= 0 && h < height && w < width) ?
+                //    data_im_ptr[i * width + j] : 0;
+
+                //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0;
+                //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
+                //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask;
+                // use atomicOr() // *dst_ptr |= (mask << (col_index % 8));
+                //data_col_ptr_32 += bit_align / 32;
+
+                //data_col_ptr += height_col * width_col;
+                data_col_ptr += bit_align;
+            }
+        }
+    }
+}
+*/
+
+// float 32
+// im2col kernel with an aligned output row pitch.
+//
+// Each work item `index` in [0, n) corresponds to one
+// (input channel, output row, output column) triple. Its ksize * ksize window
+// taps are written to `data_col`, where output row r starts at element
+// r * bit_align (instead of r * height_col * width_col). Taps outside the
+// image are written as 0.
+__global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
+    const int height, const int width, const int ksize,
+    const int pad,
+    const int stride,
+    const int height_col, const int width_col,
+    float *data_col, const int bit_align)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+    // A thread handles further items in steps of the total launched thread count.
+    for (; index < n; index += blockDim.x*gridDim.x) {
+        // Decompose the linear index into output column, output row and channel.
+        const int w_out = index % width_col;
+        const int h_index = index / width_col;
+        const int h_out = h_index % height_col;
+        const int channel_in = h_index / height_col;
+        const int channel_out = channel_in * ksize * ksize;
+
+        // Top-left corner of the window; may be negative inside the padding.
+        const int h_in = h_out * stride - pad;
+        const int w_in = w_out * stride - pad;
+
+        const float* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;
+
+        // Position inside one output row; identical for every tap, so it is
+        // computed once per work item.
+        const int pre_out_index = index % (width_col*height_col);
+
+        for (int i = 0; i < ksize; ++i) {
+            for (int j = 0; j < ksize; ++j) {
+                const int h = h_in + i;
+                const int w = w_in + j;
+
+                // The image is only read when (h, w) is inside its bounds.
+                const float val = (h >= 0 && w >= 0 && h < height && w < width) ?
+                    data_im_ptr[i * width + j] : 0;
+
+                const int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index;
+                data_col[out_index] = val;
+            }
+        }
+    }
+}
+
+// Host launcher for im2col_align_gpu_kernel on the stream returned by
+// get_cuda_stream(). One work item is created per
+// (channel, output row, output column) triple; `bit_align` is forwarded as
+// the output row pitch.
+void im2col_align_ongpu(float *im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float *data_col, int bit_align) {
+    const int out_h = (height + 2 * pad - ksize) / stride + 1;
+    const int out_w = (width + 2 * pad - ksize) / stride + 1;
+    const int work_items = channels * out_h * out_w;
+
+    // Enough blocks of BLOCK threads to cover every work item.
+    im2col_align_gpu_kernel <<<(work_items + BLOCK - 1) / BLOCK,
+        BLOCK, 0, get_cuda_stream() >>>(
+            work_items, im, height, width, ksize, pad,
+            stride, out_h,
+            out_w, data_col, bit_align);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
+// --------------------------------
+
+
+
+// binary im2col - stride=1
+// Binarizing im2col kernel: writes one bit per output element (set when the
+// source value is > 0) into `data_col`, 32 bits at a time.
+//
+// Each thread index maps to one (kernel row i, kernel column j, input
+// channel) triple, i.e. one output row. The lanes of a warp then cooperate:
+// in turn t, every lane adopts lane t's triple (via shuffle) and handles one
+// of 32 consecutive output positions; a ballot packs the 32 predicates into
+// one word that lane 0 stores.
+//
+// `n` and `stride` are not used by the body (the index loop over `n` is
+// commented out).
+// NOTE(review): the position loop advances by a literal 32 while the lane
+// loop uses WARP_SIZE - presumably WARP_SIZE == 32; confirm.
+__global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
+    const int height, const int width, const int ksize, const int channels,
+    const int pad,
+    const int stride,
+    const int height_col, const int width_col,
+    float *data_col, const int bit_align)
+{
+    //__shared__ float tmp_s[1];
+    //__shared__ ulonglong4 tmp256_s[1];
+
+
+    //#define SHRED_VALS ((BLOCK / 169) * )
+    //__shared__ float dst_s[1024];
+    //__shared__ float dst_s[1024];
+    //__shared__ uint32_t bit_s[32];
+    //__shared__ uint8_t bit_s[128];
+
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+    //for (; index < n; index += blockDim.x*gridDim.x)
+    {
+        // Thread index -> (i, j, channel_in), with the channel varying fastest.
+        int c_index = index;
+        int channel_in = c_index % channels;
+
+        //int h_out = index % height_col;
+        //int c_index = index / height_col;
+        //int channel_in = c_index % channels;
+
+        int channel_out = channel_in * ksize * ksize;
+
+        int j_index = c_index / channels;
+        int j = j_index % ksize;
+        int i = j_index / ksize;
+
+        // Bit offset of this thread's output row (rows are bit_align bits apart).
+        int pre_out_index = (channel_out + i*ksize + j) * bit_align;
+        int j_pad = (j - pad);
+        int i_pad = (i - pad);
+
+        // Walk the output positions in chunks of 32.
+        for(int wh_index = 0; wh_index < (height_col*width_col); wh_index += 32)
+        //for (int h_out = 0; h_out < height_col; ++h_out)
+        {
+
+            // the end of padding
+            //if(0)
+            //for (int w_out = 0; w_out < (width_col); w_out += 32)
+            {
+                const int w_out = wh_index % width_col;
+                const int h_out = wh_index / width_col;
+
+                const int w = w_out + j_pad;
+                const int h = h_out + i_pad;
+
+                int pre_in_index = channel_in * height * width;
+                int pre_in_wh_index = h * width + w;
+
+                // Threads beyond the last kernel row publish an out-of-range
+                // position so that their turn is skipped by the bounds test below.
+                int send_wh_index = wh_index;
+                if (i >= ksize) send_wh_index = height_col*width_col;
+
+                #pragma unroll
+                for (int t = 0; t < WARP_SIZE; ++t)
+                {
+                    const int lane_id = threadIdx.x % WARP_SIZE;
+
+                    // Lane t's chunk start, plus this lane's offset inside the chunk.
+                    const int cur_wh_index = __shfl_custom(send_wh_index, t) + lane_id;
+
+                    // NOTE(review): the shuffles and the ballot below run inside
+                    // this per-lane condition while the wrappers pass FULL_MASK -
+                    // confirm all lanes are expected to reach them.
+                    if (cur_wh_index < (width_col*height_col))// && (cur_i_pad+pad) < ksize)
+                    {
+                        const int cur_pre_out_index = __shfl_custom(pre_out_index, t);
+
+                        const int cur_pre_in_index = __shfl_custom(pre_in_index, t);
+                        const int cur_pre_in_wh_index = __shfl_custom(pre_in_wh_index, t) + lane_id;
+
+                        // NOTE(review): w/h are recovered from a linear input
+                        // index, which presumably relies on the stride=1 layout
+                        // mentioned above the kernel - confirm behaviour at row
+                        // edges and in the padding.
+                        int w = cur_pre_in_wh_index % width;
+                        int h = cur_pre_in_wh_index / width;
+                        int in_index = cur_pre_in_index + cur_pre_in_wh_index;
+
+                        int out_index = cur_pre_out_index + cur_wh_index;
+
+                        float val = (w >= 0 && w < width && h >= 0 && h < height) ?
+                            data_im[in_index] : float();
+
+                        //data_col[out_index] = val;
+                        //tmp_s[0] = val;
+
+                        // One bit per lane: set when that lane's value is positive.
+                        uint32_t bit_mask = __ballot_custom(val > 0);
+                        if (lane_id == 0) {
+                            // out_index is a bit offset; store the 32-bit mask at
+                            // the corresponding byte (assumes that byte address is
+                            // suitably aligned for a uint32_t store - TODO confirm).
+                            uint8_t *bit8_ptr = &(((uint8_t *)data_col)[out_index / 8]);
+                            uint32_t *bit32_ptr = (uint32_t *)bit8_ptr;
+                            *bit32_ptr = bit_mask;
+                        }
+                    }
+
+
+                }
+
+            }// w_out
+
+        }
+    }
+}
+
+
+void im2col_align_bin_ongpu(float *im,
+    int channels, int height, int width,
+    int ksize, int stride, int pad, float *data_col, int bit_align) {
+    // Host-side launcher for im2col_align_bin_gpu_kernel on the current CUDA stream.
+    // One thread is requested per (channel, kernel row, kernel column) triple; the
+    // grid always gets one block more than the integer quotient.
+    const int threads_total = channels * ksize * ksize;
+    const int grid_size = threads_total / BLOCK + 1;
+
+    // Spatial extent of the convolution output.
+    const int out_h = (height + 2 * pad - ksize) / stride + 1;
+    const int out_w = (width + 2 * pad - ksize) / stride + 1;
+
+    im2col_align_bin_gpu_kernel <<<grid_size, BLOCK, 0, get_cuda_stream() >>>(
+        threads_total, im, height, width, ksize, channels, pad,
+        stride, out_h, out_w, data_col, bit_align);
+
+    // Surface launch-configuration errors immediately.
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+/*
+__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
+{
+    //const int size_aligned = size + (WARP_SIZE - size % WARP_SIZE);
+
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+    float src_val;
+
+    //for (; index < size_aligned; index += blockDim.x*gridDim.x)
+    {
+        //src_val = src[index];
+        if(index < size) src_val = src[index];
+        else src_val = 0;
+        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
+        unsigned int bit_mask = __ballot_custom(src_val > 0);
+        if (threadIdx.x % WARP_SIZE == 0) ((unsigned int*)dst)[index / 32] = bit_mask;
+    }
+}
+*/
+
+/*
+__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
+{
+    //const int size_aligned = size + (WARP_SIZE - size % WARP_SIZE);
+    __shared__ uint32_t tmp[WARP_SIZE];
+
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+    float src_val;
+    uint32_t *dst32_ptr = ((unsigned int*)dst);
+
+    //for (; index < size_aligned; index += blockDim.x*gridDim.x)
+    {
+        //src_val = src[index];
+        if (index < size) src_val = src[index];
+        else src_val = 0;
+        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
+        const int num_of_warps = blockDim.x / WARP_SIZE;
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+
+        uint32_t bit_mask = __ballot_custom(src_val > 0);
+
+        if (lane_id == 0) tmp[warp_id] = bit_mask;
+
+        __syncthreads();
+        if (warp_id == 0) {
+            if (lane_id < num_of_warps) {
+                dst32_ptr[index / 32 + lane_id] = tmp[lane_id];
+            }
+        }
+        __syncthreads();
+    }
+}
+*/
+
+// Converts `size` floats into a bit array: bit k of dst is set when src[k] > 0.
+//
+// Each block consumes 32 * blockDim.x consecutive floats in 32 passes of 1024
+// elements, keeps the resulting words in shared memory and then stores them so that
+// every thread writes exactly one 32-bit word.
+// The literal 1024 and the size of tmp tie this kernel to blockDim.x == 1024
+// (which is what float_to_bit_gpu launches).
+__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
+{
+    // One packed word per (pass, warp) pair: 32 passes x 32 warps.
+    __shared__ uint32_t tmp[WARP_SIZE*32];
+
+    // First float handled by this thread; later passes add i * 1024.
+    int index = 32*blockIdx.x*blockDim.x + threadIdx.x;
+    float src_val;
+    uint32_t *dst32_ptr = ((unsigned int*)dst);
+
+    int i;
+    for(i = 0; i < 32; ++i)
+    {
+        // Elements past the end are treated as 0, i.e. they yield cleared bits.
+        if ((index + i * 1024) < size) src_val = src[index + i*1024];
+        else src_val = 0;
+        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
+        //const int num_of_warps = blockDim.x / WARP_SIZE;
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+
+        // Gather the 32 predicates of the warp; lane 0 parks the word in shared memory.
+        uint32_t bit_mask = __ballot_custom(src_val > 0);
+        if (lane_id == 0) tmp[i * 32 + warp_id] = bit_mask;
+    }
+    // All 1024 words must be in place before they are read back by other threads.
+    __syncthreads();
+    // NOTE(review): this store has no bounds check, so the last block writes all of its
+    // words even when `size` ends earlier - confirm dst is padded accordingly.
+    dst32_ptr[blockIdx.x*blockDim.x + threadIdx.x] = tmp[threadIdx.x];
+}
+
+
+void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
+{
+    // Launches float_to_bit_gpu_kernel on the current CUDA stream.
+    // Blocks are fixed at 1024 threads and the grid is sized for 32 * 1024 floats per block.
+    const int grid_size = get_number_of_blocks(size, 32 * 1024);
+
+    float_to_bit_gpu_kernel<<<grid_size, 1024, 0, get_cuda_stream()>>>(src, dst, size);
+
+    // Report launch errors right away.
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+/*
+__device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
+    size_t dst_i = index / 8;
+    int dst_shift = index % 8;
+    dst[dst_i] &= ~(1 << dst_shift);
+}
+
+__device__ __host__ static inline void set_bit(unsigned char *const dst, size_t index) {
+    size_t dst_i = index / 8;
+    int dst_shift = index % 8;
+    dst[dst_i] |= 1 << dst_shift;
+    //dst[dst_i] |= 1 << (8 - dst_shift);
+}
+*/
+
+__device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
+    // Bit `index` lives in byte index / 8, at position index % 8 counted from the
+    // least significant bit. Returns 1 when that bit is set, 0 otherwise.
+    const unsigned char packed = src[index >> 3];
+    const unsigned int bit_pos = (unsigned int)(index & 7);
+    return (unsigned char)((packed >> bit_pos) & 1);
+}
+
+// Intel CPUs and nVidia CUDA GPU are little endian
+__device__ __host__ unsigned char reverse_byte(unsigned char a)
+{
+    // Mirrors the bit order of one byte (bit 0 <-> bit 7, bit 1 <-> bit 6, ...)
+    // by swapping nibbles, then bit pairs, then neighbouring bits.
+    a = (unsigned char)((a >> 4) | (a << 4));
+    a = (unsigned char)(((a & 0xCC) >> 2) | ((a & 0x33) << 2));
+    a = (unsigned char)(((a & 0xAA) >> 1) | ((a & 0x55) << 1));
+    return a;
+}
+
+__device__ __host__ unsigned char reverse_byte_2(unsigned char a)
+{
+    // Multiply-and-mask byte reversal: two multiplications spread copies of the
+    // byte, the masks pick the wanted bits, and the last multiplication gathers them.
+    // The result is narrowed to unsigned char on return.
+    const unsigned long picked_a = a * 0x0802LU & 0x22110LU;
+    const unsigned long picked_b = a * 0x8020LU & 0x88440LU;
+    const unsigned long gathered = (picked_a | picked_b) * 0x10101LU;
+    return gathered >> 16;
+}
+
+__device__ unsigned char reverse_byte_CUDA(unsigned char a)
+{
+    // __brev mirrors all 32 bits, which moves the reversed byte into the top
+    // 8 bits; shifting by 24 brings it back down.
+    return (unsigned char)(__brev(a) >> 24);
+}
+
+// Transposes one 8x8 bit tile and writes it with the row order and the bit order
+// inside each byte reversed.
+//   A - first byte of the source tile; consecutive rows are m bytes apart.
+//   B - first byte of the destination tile; consecutive rows are n bytes apart.
+__device__ void transpose8rS32_reversed_diagonale(unsigned char* A, unsigned char* B, int m, int n)
+{
+    unsigned x, y, t;
+
+    // Load the array and pack it into x and y.
+    // x holds rows 0..3 (row 0 in the top byte), y holds rows 4..7.
+    x = (A[0] << 24) | (A[m] << 16) | (A[2 * m] << 8) | A[3 * m];
+    y = (A[4 * m] << 24) | (A[5 * m] << 16) | (A[6 * m] << 8) | A[7 * m];
+
+    // Stage 1: exchange single bits between neighbouring rows.
+    t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
+    t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);
+
+    // Stage 2: exchange 2-bit groups between rows two apart.
+    t = (x ^ (x >> 14)) & 0x0000CCCC;  x = x ^ t ^ (t << 14);
+    t = (y ^ (y >> 14)) & 0x0000CCCC;  y = y ^ t ^ (t << 14);
+
+    // Stage 3: exchange nibbles between the upper half (x) and the lower half (y).
+    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
+    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
+    x = t;
+
+    // Store the rows bottom-up (top byte of x goes to row 7) with each byte bit-reversed.
+    B[7 * n] = reverse_byte_CUDA(x >> 24);  B[6 * n] = reverse_byte_CUDA(x >> 16);  B[5 * n] = reverse_byte_CUDA(x >> 8);  B[4 * n] = reverse_byte_CUDA(x);
+    B[3 * n] = reverse_byte_CUDA(y >> 24);  B[2 * n] = reverse_byte_CUDA(y >> 16);  B[1 * n] = reverse_byte_CUDA(y >> 8);  B[0 * n] = reverse_byte_CUDA(y);
+
+    // __device__ unsigned int __brev(unsigned int x)
+    // Reverse the bit order of a 32 bit unsigned integer.
+    // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html
+}
+
+
+// transpose 8x8 bit
+__global__ void transpose_bin_gpu_kernel(unsigned char *A, unsigned char *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    // One thread per 8x8 bit tile. The thread index, scaled by 8, is split into a
+    // row offset i (modulo n) and a column offset j (quotient times 8).
+    // lda / ldb are the row pitches of A / B in bits; block_size is not used.
+    const int thread_id = blockIdx.x*blockDim.x + threadIdx.x;
+    const int scaled_id = thread_id * 8;
+
+    const int i = scaled_id % n;
+    const int j = (scaled_id / n) * 8;
+
+    // Tiles that start beyond the last column have nothing to do. A tile that starts
+    // inside but is narrower than 8 columns is still processed as a full tile.
+    if (j >= m) return;
+
+    const int src_bit = i*lda + j;
+    const int dst_bit = j*ldb + i;
+    transpose8rS32_reversed_diagonale(A + src_bit / 8, B + dst_bit / 8, lda / 8, ldb / 8);
+}
+
+
+
+__device__ __host__ uint8_t reverse_8_bit(uint8_t a) {
+    // Byte reversal through the multiply-and-mask trick; the value is truncated
+    // to 8 bits by the return type.
+    const unsigned long lower_pick = a * 0x0802LU & 0x22110LU;
+    const unsigned long upper_pick = a * 0x8020LU & 0x88440LU;
+    return (lower_pick | upper_pick) * 0x10101LU >> 16;
+}
+
+__device__ uint32_t reverse_32_bit(uint32_t a)
+{
+    // Mirrors the bit order of a 32-bit word using the CUDA intrinsic __brev.
+    // (On ARM the equivalent instruction is rbit; a portable fallback would
+    // reverse the four bytes individually and swap their positions.)
+    const uint32_t mirrored = __brev(a);
+    return mirrored;
+}
+
+// Exchanges the bits of a0 selected by mask m with the bits of a1 located j positions higher.
+// Expands to three statements and relies on a variable `t` declared by the caller;
+// the arguments are not parenthesised, so pass simple expressions only.
+#define swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);
+
+// Transposes a 32x32 bit matrix in place (A[r] is row r as a 32-bit word) and then
+// reverses both the row order and the bit order inside every row.
+// The five passes below are the unrolled form of the commented-out generic loop:
+// block sizes 16, 8, 4, 2, 1 with the matching masks.
+__device__ void transpose32_optimized(uint32_t A[32]) {
+    int j, k;
+    unsigned m, t;
+
+    //m = 0x0000FFFF;
+    //for (j = 16; j != 0; j = j >> 1, m = m ^ (m << j)) {
+    //    for (k = 0; k < 32; k = (k + j + 1) & ~j) {
+    //        t = (A[k] ^ (A[k + j] >> j)) & m;
+    //        A[k] = A[k] ^ t;
+    //        A[k + j] = A[k + j] ^ (t << j);
+    //    }
+    //}
+
+    // In every pass k visits the rows whose bit j is clear, pairing row k with row k + j.
+    j = 16;
+    m = 0x0000FFFF;
+    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }
+
+    j = 8;
+    m = 0x00ff00ff;
+    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }
+
+    j = 4;
+    m = 0x0f0f0f0f;
+    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }
+
+    j = 2;
+    m = 0x33333333;
+    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }
+
+    j = 1;
+    m = 0x55555555;
+    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }
+
+    // reverse Y
+    // Row j trades places with row 31 - j, and each word is bit-reversed on the way.
+    for (j = 0; j < 16; ++j) {
+        uint32_t tmp = A[j];
+        A[j] = reverse_32_bit(A[31 - j]);
+        A[31 - j] = reverse_32_bit(tmp);
+    }
+}
+
+extern "C" {
+// Transposes one 32x32 bit tile through transpose32_optimized.
+//   A - first word of the source tile; rows are m words apart.
+//   B - first word of the destination tile; rows are n words apart.
+// The tile is staged in shared memory, 32 words per thread, so threadIdx.x must
+// stay below BLOCK_TRANSPOSE32.
+__device__ void transpose_32x32_bits_reversed_diagonale(uint32_t *A, uint32_t *B, int m, int n)
+{
+    __shared__ uint32_t A_shared[32 * BLOCK_TRANSPOSE32];
+    uint32_t *const tile = A_shared + 32 * threadIdx.x;
+
+    // Gather the 32 strided source rows into this thread's private slice.
+    #pragma unroll 32
+    for (int row = 0; row < 32; ++row) tile[row] = A[row * m];
+
+    transpose32_optimized(tile);
+
+    // Scatter the transposed rows to the destination.
+    #pragma unroll 32
+    for (int row = 0; row < 32; ++row) B[row * n] = tile[row];
+}
+}
+
+// transpose 32x32 bit
+__global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    // One thread per 32x32 bit tile. The thread index, scaled by 32, is split into
+    // a row offset i (modulo n) and a column offset j (quotient times 32).
+    // lda / ldb are the row pitches of A / B in bits; block_size is not used.
+    const int scaled_id = (blockIdx.x*blockDim.x + threadIdx.x) * 32;
+
+    const int i = scaled_id % n;
+    const int j = (scaled_id / n) * 32;
+
+    // Nothing to do for tiles that start past the last column.
+    if (j >= m) return;
+
+    const int src_bit = i*lda + j;
+    const int dst_bit = j*ldb + i;
+    transpose_32x32_bits_reversed_diagonale(A + src_bit / 32, B + dst_bit / 32, lda / 32, ldb / 32);
+}
+
+void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
+    const int lda, const int ldb, const int block_size)
+{
+    // Launches the 32x32-tile bit transpose on the current CUDA stream.
+    // Thread budget: one per 32x32 tile of the n x m bit matrix, plus one spare;
+    // the grid likewise gets one block more than the integer quotient.
+    const int tile_count = n*m / (32*32) + 1;
+    const int grid_size = tile_count / BLOCK_TRANSPOSE32 + 1;
+
+    // The kernel addresses both matrices as 32-bit words.
+    uint32_t *const src_words = (uint32_t *)A;
+    uint32_t *const dst_words = (uint32_t *)B;
+
+    transpose_bin_gpu_kernel_32 <<<grid_size, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >>>(
+        src_words, dst_words, n, m, lda, ldb, block_size);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+__global__ void transpose_uint32_kernel(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
+{
+    // Element-wise transpose of a src_h x src_w matrix of 32-bit words,
+    // one element per thread.
+    //   src_align - source row pitch in words.
+    //   dst_align - destination row pitch in bits (divided by 32 when indexing).
+    const int thread_id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    const int row = thread_id % src_h;   // varies fastest across threads
+    const int col = thread_id / src_h;
+
+    // Surplus threads of the last block fall outside the matrix.
+    if (col >= src_w) return;
+
+    dst[col*dst_align / 32 + row] = src[row*src_align + col];
+}
+
+void transpose_uint32_gpu(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
+{
+    // Launches transpose_uint32_kernel with one thread per source element;
+    // the grid gets one block more than the integer quotient.
+    const int element_count = src_w * src_h;
+    const int grid_size = element_count / BLOCK + 1;
+
+    transpose_uint32_kernel <<<grid_size, BLOCK, 0, get_cuda_stream() >>>(
+        src, dst, src_h, src_w, src_align, dst_align);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+//#define TRANS_LOOP 10
+
+// Tiled transpose of a src_h x src_w matrix of 32-bit words.
+// Each block handles one 32x32 tile: it loads the tile, swaps coordinates through
+// shared memory and stores the transposed tile. The index math (threadIdx.x % 32 and
+// threadIdx.x / 32 % 32) expects 1024 threads per block.
+//   src_align - source row pitch in words.
+//   dst_align - destination row pitch in bits (divided by 32 when indexing).
+__global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
+{
+    // Rows are 33 words apart instead of 32; per the note on repack_input_kernel_2
+    // this padding is meant to avoid shared-memory bank conflicts.
+    __shared__ uint32_t tmp[33 * 32];   // misaligned_array[32x32]
+    const int w_align = 33;
+    //const int shared_size = w_align * 32;
+
+    //l.bit_align - aligned (n) by 32
+    //new_ldb - aligned (k) by 256
+
+    // Width rounded up to the next multiple of 32; a width that is already a
+    // multiple of 32 still gains a full extra 32 columns.
+    const int src_w_align = src_w + (32 - src_w % 32);
+    //const int src_h_align = src_h + (32 - src_h % 32);
+
+    const int warps_in_width = src_w_align / 32;
+    //const int warps_in_height = src_h_align / 32;
+
+
+
+    // Position of this thread inside the tile.
+    const int local_x = threadIdx.x % 32;   // index % 32;
+    const int local_x_index = threadIdx.x / 32; // index / 32;
+    const int local_y = local_x_index % 32;
+
+//#pragma unroll TRANS_LOOP
+    //for (int i = 0; i < TRANS_LOOP; ++i)
+    {
+        // The block index selects the tile, row-major over the padded width.
+        const int global_index = blockIdx.x;// blockIdx.x*TRANS_LOOP + i;// local_x_index / 32;
+        const int global_x_index = global_index % warps_in_width;
+        const int global_y_index = global_index / warps_in_width;
+
+        const int global_x = global_x_index * 32 + local_x;
+        const int global_y = global_y_index * 32 + local_y;
+
+        // Threads that fall outside the matrix contribute 0.
+        uint32_t val = 0;
+        if (global_x < src_w && global_y < src_h) {
+            val = src[global_y * src_align + global_x];
+        }
+        //dst[global_x * dst_align / 32 + global_y] = val;
+        //tmp[local_y * 32 + local_x] = val;
+
+        // Write with swapped coordinates, wait for the whole tile, then read back
+        // row-wise: each thread ends up holding the element of its mirrored partner.
+        tmp[local_x * w_align + local_y] = val;
+        __syncthreads();
+        val = tmp[local_y * w_align + local_x];
+
+        // Destination coordinates: tile indices and roles of x / y are exchanged.
+        const int new_global_x = global_y_index * 32 + local_x;
+        const int new_global_y = global_x_index * 32 + local_y;
+
+        if (new_global_x < src_h && new_global_y < src_w) {
+            dst[new_global_y * (dst_align / 32) + new_global_x] = val;
+        }
+    }
+}
+
+#define TRANS_BLOCK 1024
+void transpose_uint32_gpu_2(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
+{
+    // Launches transpose_uint32_kernel_2 with one block of TRANS_BLOCK threads per
+    // 32x32 tile. Both extents are padded with the same formula the kernel uses
+    // (an extent that is already a multiple of 32 still grows by 32).
+    const int padded_w = src_w + (32 - src_w % 32);
+    const int padded_h = src_h + (32 - src_h % 32);
+    const int grid_size = (padded_w * padded_h) / TRANS_BLOCK;
+
+    transpose_uint32_kernel_2 <<<grid_size, TRANS_BLOCK, 0, get_cuda_stream() >>>(
+        src, dst, src_h, src_w, src_align, dst_align);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+__global__ void repack_input_kernel(float *input, float *re_packed_input, int w, int h, int c)
+{
+    // Re-orders a planar (channel-major) tensor so that, within each group of 32
+    // channels, the 32 channel values of one spatial position are contiguous.
+    // One thread copies one float.
+    const int thread_id = blockIdx.x*blockDim.x + threadIdx.x;
+    const int plane_size = w * h;
+
+    // Low 5 bits: channel inside the group. The rest, scaled by 32, is split into
+    // the group's first channel (modulo c) and the spatial position (quotient).
+    const int c_pack = thread_id % 32;
+    const int group_offset = (thread_id / 32) * 32;
+    const int chan = group_offset % c;
+    const int i = group_offset / c;
+
+    // Surplus threads of the last block have no element to copy.
+    if (i >= plane_size) return;
+
+    const float value = input[(chan + c_pack)*plane_size + i];
+    re_packed_input[chan*plane_size + i * 32 + c_pack] = value;
+}
+
+void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
+{
+    // Launches repack_input_kernel with one thread per input element;
+    // the grid gets one block more than the integer quotient.
+    const int element_count = w * h * c;
+    const int grid_size = element_count / BLOCK + 1;
+
+    repack_input_kernel <<<grid_size, BLOCK, 0, get_cuda_stream() >>>(input, re_packed_input, w, h, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+__global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
+{
+    // Interleaves channels in groups of 32: for every spatial position the 32
+    // channel values of a group are stored next to each other. One float per thread.
+    // (A 33x32 padded shared-memory tile was considered here to avoid bank
+    // conflicts, but this version works directly on global memory.)
+    const int items_per_channel = w * h;
+    const int global_id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    const int c_pack = global_id % 32;          // channel within the group
+    const int scaled = (global_id / 32) * 32;
+    const int chan = scaled % c;                // first channel of the group
+    const int i = scaled / c;                   // spatial position
+
+    if (i < items_per_channel)
+    {
+        const int src_offset = (chan + c_pack)*items_per_channel + i;
+        const int dst_offset = chan*items_per_channel + i * 32 + c_pack;
+        re_packed_input[dst_offset] = input[src_offset];
+    }
+}
+
+void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int c)
+{
+    // Launches repack_input_kernel_2 on the current CUDA stream, budgeting one
+    // thread per input float and always adding one block to the quotient.
+    const int total_floats = w * h * c;
+    const int grid_size = total_floats / BLOCK + 1;
+
+    repack_input_kernel_2 <<<grid_size, BLOCK, 0, get_cuda_stream() >>>(input, re_packed_input, w, h, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+// Packs the predicates (value > 0) of 32 consecutive channels into one 32-bit word
+// per spatial position: bit k of the word belongs to channel chan + k.
+//
+// Work split: each warp covers 32 consecutive spatial positions of one group of
+// 32 channels; each lane owns one position. The spatial extent of every group is
+// padded up to a multiple of WARP_SIZE when warps are assigned, so the launch must
+// provide items_per_channel_aligned threads per channel group.
+// The inner loop reads channels chan .. chan + 31 without checking them against c;
+// presumably c is expected to be a multiple of 32 - TODO confirm with the callers.
+__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    //__shared__ uint32_t tmp[32];
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    const int global_warp_id = index / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    const int items_per_channel = w * h;
+    // Always rounds up, adding a full WARP_SIZE when already aligned.
+    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);
+
+    // First spatial position of this warp and first channel of its group.
+    int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE));
+    int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE));
+
+    if (chan < c)
+    {
+        uint32_t result_bits = 0;
+
+        for (int c_pack = 0; c_pack < 32; ++c_pack)
+        {
+            // Lanes beyond the last position feed 0, i.e. a cleared bit.
+            float src = 0;
+            if ((i + lane_id) < items_per_channel) {
+                src = input[(chan + c_pack)*items_per_channel + (i + lane_id)];
+            }
+            uint32_t bit_mask = __ballot_custom(src > 0);
+
+            // Each lane keeps only the bit that belongs to its own position ...
+            uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1);
+
+            // ... and files it under the channel currently being visited.
+            result_bits |= (cur_bit << c_pack);
+        }
+        if ((i + lane_id) < items_per_channel) {
+            re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits;
+        }
+    }
+}
+
+// Launches repack_input_kernel_bin on the current CUDA stream.
+//   input               - planar float tensor of c channels with w * h elements each.
+//   re_packed_input_bin - output, one 32-bit word per spatial position and group of 32 channels.
+//
+// The kernel hands out warps per channel group after padding the spatial extent
+// (w * h) up to a multiple of WARP_SIZE, so it needs
+//     items_per_channel_aligned * (number of channel groups)
+// threads. The earlier estimate (w * h * c) / 32 + 1 ignored that padding, and for
+// some shapes (e.g. w * h = 169, c = 1024, BLOCK = 512) the grid was too small, so the
+// last channel groups were never written. Extra threads are harmless because the
+// kernel checks both the channel group and the spatial position before storing.
+void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    // Must mirror the padding formula used inside the kernel.
+    const int items_per_channel = w * h;
+    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);
+    // The kernel processes every group whose first channel is below c.
+    const int channel_groups = (c + 31) / 32;
+
+    int size = items_per_channel_aligned * channel_groups;
+    if (size < 1) size = 1;     // keep the launch configuration valid for degenerate shapes
+
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    //printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks / 32, block_size);
+    repack_input_kernel_bin <<<num_blocks, block_size, 0, get_cuda_stream() >>>(input, re_packed_input_bin, w, h, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+/*
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    //__shared__ uint32_t tmp[32];
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    //const int num_of_warps = blockDim.x / WARP_SIZE;
+    //const int warp_id = threadIdx.x / WARP_SIZE;
+    //const int lane_id = threadIdx.x % WARP_SIZE;
+
+    const int items_per_channel = w * h;
+
+    int c_pack = index % 32;
+    int chan_index = index / 32;
+    //int chan = (chan_index * 32) % c;
+    //int i = (chan_index * 32) / c;
+
+    int i = (chan_index) % items_per_channel;
+    int chan = ((chan_index ) / items_per_channel)*32;
+
+
+    //for (chan = 0; chan < c; chan += 32)
+    if(chan < c)
+    {
+        //for (i = 0; i < items_per_channel; ++i)
+        //if (i < items_per_channel)
+        {
+            //for (c_pack = 0; c_pack < 32; ++c_pack)
+            {
+                float src = input[(chan + c_pack)*items_per_channel + i];
+
+                uint32_t bit_mask = __ballot_custom(src > 0);
+                if (threadIdx.x % 32 == 0)
+                    re_packed_input_bin[chan*items_per_channel / 32 + i] = bit_mask;
+            }
+        }
+    }
+}
+
+void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    int size = w * h * c;
+    const int block_size = 256;// 128;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks/32, block_size);
+    repack_input_kernel_bin <<<num_blocks, block_size, 0, get_cuda_stream() >>>(input, re_packed_input_bin, w, h, c);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+*/
+
+
+
+// Sets the first `size` bytes of src to `val`, one byte per thread.
+// Previously the kernel stored a literal 0 and ignored `val`. The index is also
+// computed as size_t so the comparison with `size` no longer mixes signed and unsigned.
+__global__ void fill_int8_gpu_kernel(unsigned char *src, unsigned char val, size_t size) {
+    const size_t index = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
+    if(index < size) src[index] = val;
+}
+
+void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
+    // Launches fill_int8_gpu_kernel over `size` bytes on the current CUDA stream;
+    // the grid gets one block more than the integer quotient.
+    const int grid_size = (int)(size / BLOCK + 1);
+
+    fill_int8_gpu_kernel<<<grid_size, BLOCK, 0, get_cuda_stream()>>>(src, val, size);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+//typedef unsigned long long int uint64_t;
+//typedef unsigned int uint32_t;
+//typedef unsigned char uint8_t;
+//typedef char int8_t;
+/*
+__device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
+    return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+*/
+__device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
+    // XNOR of the lowest bits: 1 when bit 0 of a and b agree, 0 otherwise.
+    const uint8_t lowest_differs = (uint8_t)((a ^ b) & 1);
+    return (uint8_t)(lowest_differs ^ 1);
+}
+/*
+__device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
+    return ~(a^b);
+}
+
+__device__ __host__ static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
+    return ~(a^b);
+}
+
+__device__ __host__ static inline uint4 xnor_int128(uint4 a, uint4 b) {
+    uint4 res;
+    res.w = ~(a.w^b.w);
+    res.x = ~(a.x^b.x);
+    res.y = ~(a.y^b.y);
+    res.z = ~(a.z^b.z);
+    return res;
+}
+
+__device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong4 b) {
+    ulonglong4 res;
+    res.w = ~(a.w^b.w);
+    res.x = ~(a.x^b.x);
+    res.y = ~(a.y^b.y);
+    res.z = ~(a.z^b.z);
+    return res;
+}
+*/
+//-------
+/*
+__device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
+    return (a^b) & 0b1;
+}
+*/
+__device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
+    // Bitwise exclusive OR of two 32-bit words.
+    a ^= b;
+    return a;
+}
+
+__device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
+    // Bitwise exclusive OR of two 64-bit words.
+    const uint64_t differing_bits = a ^ b;
+    return differing_bits;
+}
+/*
+__device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
+    uint4 res;
+    res.w = (a.w^b.w);
+    res.x = (a.x^b.x);
+    res.y = (a.y^b.y);
+    res.z = (a.z^b.z);
+    return res;
+}
+*/
+__device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
+    // 256-bit exclusive OR, applied to each of the four 64-bit components.
+    // `a` is a by-value copy, so it doubles as the result.
+    a.x ^= b.x;
+    a.y ^= b.y;
+    a.z ^= b.z;
+    a.w ^= b.w;
+    return a;
+}
+
+/*
+__device__ static inline int popcnt_256(ulonglong4 a) {
+    return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
+}
+
+__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    //if (index == 0)
+    {
+        int i, j, k, h;
+
+        //#pragma omp parallel for
+        //for (i = 0; i < M; ++i)
+        i = index % M;
+        //if(i < M)
+        {   // l.n - filters [16 - 55 - 1024]
+            float mean_val = mean_arr[i];
+
+            //for (j = 0; j < N; ++j)
+            j = index / M;
+            if(j < N)
+            { // out_h*out_w - one channel output size [169 - 173056]
+                int count = 0;
+
+                for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                    uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));
+                    uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));
+                    uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+
+                    int tmp_count = __popcll(c_bit64);
+
+                    if (K - k < 64)  tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
+                    count += tmp_count;
+                    //binary_int64_printf(c_bit64);
+                    //printf(", count = %d \n\n", tmp_count);
+                }
+
+                C[i*ldc + j] = (2 * count - K) * mean_val;
+            }
+        }
+    }
+}
+*/
+
+
+/*
+// B (input) in the shared_memory
+__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+
+    __shared__ uint64_t B_s[4096];  // 32 KB // [ldb x N`] // max = 262 144 bits
+
+    int start_j = blockIdx.x*blockDim.x / M;
+    {
+        int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;
+
+        size_t shared_size = ldb * (end_j - start_j);
+
+        //float tmp_shared_size = ldb * (blockDim.x / M);
+        //int passes = (4096 * 64) / tmp_shared_size - 1;
+        //size_t shared_size = tmp_shared_size * passes;
+
+        int k;
+        for (int k = threadIdx.x * 256; k < shared_size; k += blockDim.x * 256) {
+            int x = start_j*ldb + k;
+            if (x < (N*ldb)) *((ulonglong4 *)(B_s + k / 8)) = *((ulonglong4 *)(B + x / 8));
+        }
+
+        ////if (j_cur < N && (index % M == 0 || threadIdx.x == 0)) {
+          ////  for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+            ////    *((uint64_t *)(B_s + (local_j*ldb + k) / 8)) = *((uint64_t *)(B + (j_cur*ldb + k) / 8));    // input
+            ////}
+        ////}
+    }
+    __syncthreads();
+
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+
+    //if (index == 0)
+    //for(int in_tmp = threadIdx.x; in_tmp < 1*blockDim.x; in_tmp += blockDim.x)
+    {
+        //int index = blockIdx.x*blockDim.x*1 + in_tmp;
+
+        int j_cur = index / M;
+        int local_j = j_cur - start_j;
+
+        int i, j, h;
+
+        //#pragma omp parallel for
+        //for (i = 0; i < M; ++i)
+        i = index % M;
+        //if(i < M)
+        {   // l.n - filters [16 - 55 - 1024]
+            // further improvements: for (l.n == 1024) iterate several (j)
+            float mean_val = mean_arr[i];
+
+            //for (j = 0; j < N; ++j)
+            j = index / M;
+            if (j < N)
+            { // out_h*out_w - one channel output size [169 - 173056]
+                const int bit_step = 256;
+                int count = 0;
+                int k = 0;
+                for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+                    ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
+                    //ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));
+                    ulonglong4 b_bit256 = *((ulonglong4 *)(B_s + (local_j*ldb + k) / 8));    // input
+                    ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
+
+                    count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
+                        __popcll(c_bit256.y) + __popcll(c_bit256.z);
+                }
+
+                int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                //C[i*ldc + j] += 2 * count*mean_val;
+                //C[i*ldc + j] += -2 * f1*mean_val;
+                //C[i*ldc + j] += - K*mean_val;
+
+                count = count - f1;    // remove extra bits (from empty space for align only)
+                C[i*ldc + j] = (2 * count - K) * mean_val;
+
+                //B_s[0] = (2 * count - K) * mean_val;
+            }
+        }
+    }
+}
+*/
+
+/*
+// A (weights) in the shared_memory
+__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    __shared__ uint64_t A_s[6144];  // 48 KB // [lda x M`]
+                                    //__shared__ uint8_t A_s[6144*8];  // 48 KB // [lda x M`]
+
+    int start_i = blockIdx.x*blockDim.x / N;
+    int end_i = (blockIdx.x*blockDim.x + blockDim.x) / N + 1;
+
+    size_t shared_size = lda * (end_i - start_i);
+
+    int i_cur = index / N;
+    int local_i = i_cur - start_i;
+
+    for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
+        int x = start_i*lda + k;
+        if (x < (M*lda)) *((uint64_t *)(A_s + k / 8)) = *((uint64_t *)(A + x / 8));
+    }
+
+    //if (i_cur < M && (index % N == 0 || threadIdx.x == 0)) {
+    //for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+    //(*(uint64_t *)(A_s + (local_i*lda + k) / 8)) = *((uint64_t *)(A + (i_cur*lda + k) / 8));    // weights
+    //  }
+    //}
+
+    __syncthreads();
+
+    int i, j, k, h;
+
+    j = index % N;
+    {    // out_h*out_w - one channel output size [169 - 173056]
+        i = index / N;
+        if (i < M)  // l.n - filters [16 - 55 - 1024]
+        {
+            float mean_val = mean_arr[i];
+            int count = 0;
+
+            for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
+                uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8));    // weights
+                uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));            // input
+                uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+
+                int tmp_count = __popcll(c_bit64);
+
+                if (K - k < 64)  tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
+                count += tmp_count;
+            }
+
+            C[i*ldc + j] = (2 * count - K) * mean_val;
+        }
+    }
+}
+*/
+
+// Sums `val` across the lanes of a warp with an XOR-shuffle butterfly and
+// returns the total in every participating lane (all-reduce, not a reduce
+// to lane 0). The exchange distance starts at WARP_SIZE / 2 and is halved
+// each round.
+// NOTE(review): assumes WARP_SIZE is a power of two and that all lanes named
+// by FULL_MASK execute this call together - confirm against the definitions.
+__inline__ __device__
+int warpAllReduceSum(int val) {
+    int lane_mask = WARP_SIZE / 2;
+    while (lane_mask > 0) {
+#if CUDART_VERSION >= 9000
+        // CUDA 9+ requires the explicitly synchronizing shuffle variant.
+        val += __shfl_xor_sync(FULL_MASK, val, lane_mask);
+#else
+        val += __shfl_xor(val, lane_mask);
+#endif
+        lane_mask /= 2;
+    }
+
+    return val;
+}
+
+// Tensor Cores binary (CC >= 7.3 && CUDA >= 10.0) - __CUDA_SUBBYTE_IMMA__
+#if CUDART_VERSION >= 10000
+#include <mma.h>
+
+#define WMMA_M 8
+#define WMMA_N 8
+#define WMMA_K 128
+#define WMMA_K32 (WMMA_K/32)
+
+#define WMMA_Nx2 (WMMA_N*2)
+
+// Tensor Cores are used for XOR-GEMM
+// Binary GEMM (XOR + popcount) over bit-packed operands; one warp produces one
+// 8 x 16 tile of C, handled as two 8 x 8 sub-tiles.
+//
+// For __CUDA_ARCH__ >= 730 the bit products are computed with WMMA b1 fragments
+// (bmma_sync); for lower architectures an equivalent warp-shuffle XOR/popcount
+// path is compiled instead. Either way the integer tile is staged in shared
+// memory (C_s) and then post-processed per element:
+//   padding correction -> (2*count - K) * mean + bias -> optional leaky
+//   activation -> store to C -> optional shortcut addition.
+//
+// Parameters:
+//   M, N, K   output rows, output columns, reduction length in bits
+//   A, lda    bit-packed left operand; row i starts at bit i*lda
+//   B, ldb    bit-packed right operand; column j starts at bit j*ldb
+//   C, ldc    float output; element (i, j) is written to C[i*ldc + j]
+//   mean_arr, bias_arr   per-output-row scale and bias
+//   leaky_activation     non-zero: negative results are multiplied by 0.1
+//   shortcut_in_gpu, shortcut_out_gpu
+//             when shortcut_out_gpu is non-NULL,
+//             shortcut_out_gpu[idx] = shortcut_in_gpu[idx] + result
+//
+// NOTE(review): tiles that would cross the matrix edge are shifted back
+// (j = N - 16, i = M - 8), so the kernel appears to require M >= 8 and
+// N >= 16 - confirm with callers.
+// NOTE(review): C_s provides room for 32 warps, i.e. blockDim.x <= 1024.
+__global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // Per-warp staging area for the integer tile: sub-tile 0 of warp w occupies
+    // [w*64, w*64 + 64), sub-tile 1 the same range shifted by 32*64 entries.
+    __shared__ int C_s[WMMA_N * WMMA_M * 32 * 2];    // 2 * 8 KB - Temporary result of GEMM WMMA for 32 warps
+
+    const int lane_id = threadIdx.x % 32;
+    const int warp_id = threadIdx.x / 32;
+    const int global_warp_id = index / 32;
+
+    // Number of columns rounded up to the 16-wide tile. This always adds at
+    // least one column (a full extra tile when N is already a multiple of 16);
+    // the surplus tile is clamped back onto the last valid one below.
+    const int N_aligned = N + WMMA_Nx2 - (N % WMMA_Nx2);
+
+    // The original author notes that caching A in shared memory was tried here
+    // and gave poor performance, so A is read directly from global memory.
+
+    int i, j, k;
+    // Column of the tile's first element.
+    j = global_warp_id % (N_aligned / WMMA_Nx2);
+    j = j * WMMA_Nx2;
+    {    // out_h*out_w - one channel output size [169 - 173056]
+        // Row of the tile's first element.
+        i = global_warp_id / (N_aligned / WMMA_Nx2);
+        i = i * WMMA_M;
+
+        k = 0;
+
+        // i depends only on global_warp_id, so this branch is uniform across the warp.
+        if (i < M)  // l.n - filters [16 - 55 - 1024]
+        {
+            // Shift edge tiles back inside the matrix; overlapping elements are
+            // simply recomputed with identical results.
+            if (j + WMMA_Nx2 > N) j = N - WMMA_Nx2;   // must be: j+15 < N
+            if (i + WMMA_M > M) i = M - WMMA_M;   // must be: i+7 < M
+
+#if __CUDA_ARCH__ >= 730
+            // Tensor Cores
+            using namespace nvcuda;
+
+            wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
+            wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
+            wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c1_frag, c2_frag;
+            wmma::fill_fragment(c1_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!
+            wmma::fill_fragment(c2_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!
+
+            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
+            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+            {
+                // lda, ldb and k are in bits; dividing by 8 yields byte offsets.
+                int64_t A_cur_index = (i*lda + k) / 8;
+                int64_t B1_cur_index = (j*ldb + k) / 8;
+                int64_t B2_cur_index = ((j + 8)*ldb + k) / 8;
+
+                // The same A fragment is reused for both 8-column sub-tiles of B.
+                wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);   // lda = M
+
+                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B1_cur_index), ldb);   // ldb = K
+                wmma::bmma_sync(c1_frag, a_frag, b_frag, c1_frag);    // XOR-GEMM
+
+                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B2_cur_index), ldb);   // ldb = K
+                wmma::bmma_sync(c2_frag, a_frag, b_frag, c2_frag);    // XOR-GEMM
+            }
+            // Stage both sub-tiles in this warp's slice of C_s.
+            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c1_frag, WMMA_N, wmma::mem_row_major);
+            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N + WMMA_M*WMMA_N*32], c2_frag, WMMA_N, wmma::mem_row_major);
+#else // __CUDA_ARCH__ >= 730
+
+            // Custom XOR-GEMM
+            // Lane layout: 8 rows x 4 words. k_d selects which 32-bit word of the
+            // 128-bit step the lane loads, i_d selects the row within the tile.
+            int k_d = lane_id % 4;
+            int i_d = lane_id / 4;
+
+            int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
+            for (int local_j = 0; local_j < 8*2; ++local_j) {
+                accum_c_val[local_j] = 0;
+            }
+
+            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
+            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+            {
+                // lda, ldb - are in bits
+                int j_d = lane_id / 4;   // column of B (within the sub-tile) loaded by this lane
+                uint32_t a_val = *(uint32_t *)(A + ((i + i_d)*lda + (k + k_d*32)) / 8); // wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);
+
+                for (int c_x = 0; c_x < 2; c_x++)
+                {
+                    uint32_t b_val = *(uint32_t *)(B + ((c_x * 8 + j + j_d)*ldb + (k + k_d * 32)) / 8); // wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);
+
+                    // wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);
+                    // Partial popcount of this lane's A word against the matching
+                    // word (same k_d) of each of the 8 B columns.
+                    int32_t c_val[8];  // 8 x 32 threads = 256
+                    #pragma unroll
+                    for (int local_j = 0; local_j < 8; ++local_j)
+                    {
+                        uint32_t b_val_cur = __shfl_custom(b_val, local_j * 4 + k_d);
+                        c_val[local_j] = __popc(xor_int32(a_val, b_val_cur));
+                    }
+
+                    // Add up the 4 word-partials belonging to the same row.
+                    #pragma unroll
+                    for (int local_j = 0; local_j < 8; ++local_j)
+                    {
+                        #pragma unroll
+                        for (int local_k = 0; local_k < 4; ++local_k) {
+                            accum_c_val[local_j + c_x*8] += __shfl_custom(c_val[local_j], i_d * 4 + local_k);
+                        }
+                    }
+                }
+            }
+
+            // One lane per row (k_d == 0) writes that row of both sub-tiles.
+            // wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
+            if (k_d == 0) {
+                for (int c_x = 0; c_x < 2; c_x++)
+                {
+                    for (int local_j = 0; local_j < 8; ++local_j)
+                    {
+                        C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + local_j + WMMA_M*WMMA_N*32 * c_x] = accum_c_val[local_j + c_x*8];
+                    }
+                }
+            }
+#endif // __CUDA_ARCH__ >= 730
+
+            // Lanes read C_s entries written by *other* lanes of the same warp
+            // below. Without a barrier that is a data race on architectures with
+            // independent thread scheduling (sm_70 / sm_72 take the non-WMMA path
+            // above). __syncwarp() orders the shared-memory writes before the
+            // reads; the enclosing branch is warp-uniform, so all lanes reach it.
+            __syncwarp();
+
+            // Post-process: each lane handles column (lane % 8) of rows
+            // (lane / 8) and (lane / 8) + 4, for both sub-tiles.
+            for(int c_x = 0; c_x < 2; c_x++)
+            {
+                int j_d = lane_id % WMMA_N;
+                {
+                    #pragma unroll
+                    for (int i_d = lane_id / WMMA_N; i_d < WMMA_M; i_d += WMMA_M / 2)
+                    {
+                        int count = C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + j_d + WMMA_M*WMMA_N*32*c_x];
+
+                        // K is processed in 128-bit steps; subtract the padding
+                        // bits of the final step from the count.
+                        const int bit_step = 128;
+                        int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                        count = count - f1;    // remove extra bits (from empty space for align only)
+
+                        count = (2 * count - K);
+
+                        float mean_val = mean_arr[i + i_d];
+                        float bias_val = bias_arr[i + i_d];
+                        float dst_val = count *mean_val + bias_val;
+                        if (leaky_activation)
+                            dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation
+
+                        size_t out_index = (i + i_d)*ldc + (c_x * 8 + j + j_d);
+                        C[out_index] = dst_val;
+
+                        if (shortcut_out_gpu) {
+                            shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+                        }
+                    }
+
+                }
+            }
+        }
+    }
+}
+#endif  // CUDART_VERSION >= 10000
+
+/*
+// Tensor Cores are used for XOR-GEMM
+__global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+{
+    // total 57%
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    __shared__ int C_s[8*8 * 32];    // Temporary result of GEMM WMMA
+
+    const int lane_id = threadIdx.x % 32;
+    const int warp_id = threadIdx.x / 32;
+    const int global_warp_id = index / 32;
+
+    const int N_aligned = N + WMMA_N - (N % WMMA_N);
+
+    int i, j, k, h;
+    // 47% = 29 + 10 + 8
+    j = global_warp_id % (N_aligned / WMMA_N);
+    j = j * WMMA_N;
+    {    // out_h*out_w - one channel output size [169 - 173056]
+        i = global_warp_id / (N_aligned / WMMA_N);
+        i = i * WMMA_M;
+
+        int count = 0;
+        k = 0;
+
+        if (i < M)  //if (i < M)  // l.n - filters [16 - 55 - 1024]
+        {
+            if (j + WMMA_N > N) j = N - WMMA_N;   // must be: j+7 < N
+            if (i + WMMA_M > M) i = M - WMMA_M;   // must be: i+7 < M
+
+#if __CUDA_ARCH__ >= 730
+            // Tensor Cores
+            using namespace nvcuda;
+
+            wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
+            wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
+            wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;
+            wmma::fill_fragment(c_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!
+
+            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
+            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+            {
+                int64_t A_cur_index = (i*lda + k) / 8;
+                //int64_t A_cur_index = (local_i*lda + k) / 8;
+                int64_t B_cur_index = (j*ldb + k) / 8;
+
+                // lda, ldb - are in bits
+                wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);   // lda = M
+                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);   // ldb = K
+
+                wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);    // XOR-GEMM
+            }
+            // C[i*ldc + j]
+            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
+#else // __CUDA_ARCH__ >= 730
+
+            // Custom XOR-GEMM
+            int k_d = lane_id % 4;
+            int i_d = lane_id / 4;
+            int j_d = lane_id / 4;
+
+            int32_t accum_c_val[8]; // wmma::fill_fragment(c_frag, 0);
+            for (int local_j = 0; local_j < 8; ++local_j) {
+                accum_c_val[local_j] = 0;
+            }
+
+            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
+            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+            {
+                int64_t A_cur_index = (i*lda + k) / 8;
+                //int64_t A_cur_index = (local_i*lda + k) / 8;
+                int64_t B_cur_index = (j*ldb + k) / 8;
+
+                // lda, ldb - are in bits
+                // 8*4 = 32
+                // 8*8 = 64
+                int k_d = lane_id % 4;
+                int i_d = lane_id / 4;
+                int j_d = lane_id / 4;
+                uint32_t a_val = *(uint32_t *)(A + ((i + i_d)*lda + (k + k_d*32)) / 8); // wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);
+                uint32_t b_val = *(uint32_t *)(B + ((j + j_d)*ldb + (k + k_d*32)) / 8); // wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);
+
+                // wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);
+                int32_t c_val[8];  // 8 x 32 threads = 256
+                #pragma unroll
+                for (int local_j = 0; local_j < 8; ++local_j)
+                {
+                    uint32_t b_val_cur = __shfl_custom(b_val, local_j *4 + k_d);
+                    c_val[local_j] = __popc(xor_int32(a_val, b_val_cur));
+                }
+
+                #pragma unroll
+                for (int local_j = 0; local_j < 8; ++local_j)
+                {
+                    #pragma unroll
+                    for (int local_k = 0; local_k < 4; ++local_k) {
+                        accum_c_val[local_j] += __shfl_custom(c_val[local_j], i_d * 4 + local_k);
+                    }
+                }
+            }
+
+            // only the first 8 threads (i) contain 8 good values each, in c_val[8] (j) = 8 x 8 =64
+            // wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
+            if (k_d == 0) {
+                for (int local_j = 0; local_j < 8; ++local_j)
+                {
+                    C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + local_j] = accum_c_val[local_j];
+                }
+            }
+#endif // __CUDA_ARCH__ >= 730
+
+            {
+                int i_d = lane_id % WMMA_M;
+                {
+
+                    for (int j_d = lane_id / WMMA_M; j_d < WMMA_N; j_d += WMMA_N / 2)
+                    {
+                        int count = C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + j_d];
+
+                        const int bit_step = 128;
+                        int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                        count = count - f1;    // remove extra bits (from empty space for align only)
+
+                        count = (2 * count - K);
+
+                        float mean_val = mean_arr[i + i_d];
+                        float bias_val = bias_arr[i + i_d];
+                        float dst_val = count *mean_val + bias_val;
+                        if (leaky_activation)
+                            dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation
+
+                        C[(i + i_d)*ldc + (j + j_d)] = dst_val;
+                    }
+
+                }
+            }
+        }
+    }
+}
+*/
+
+
+// Coalescing
+// A (weights) in the shared_memory - GOOD
+// XOR/popcount binary GEMM over bit-packed A and B with one thread per output
+// element C[i*ldc + j], where i = index / N and j = index % N.
+//
+// Stages visible in the code:
+//   1. The block cooperatively copies the rows of A it needs (start_i .. end_i)
+//      into the shared buffer A_s, 64 bits per thread per iteration.
+//   2. K is consumed in warp-cooperative chunks of 2048 bits, then 1024 bits:
+//      for each lane t, all 32 lanes load adjacent slices of lane t's A/B data,
+//      XOR + popcount them, and warpAllReduceSum() produces the total that
+//      lane t adds to its own count.
+//   3. Threads with i < M finish the remainder alone, 256 bits per iteration.
+//   4. count is corrected for padding, turned into (2*count - K)*mean + bias,
+//      optionally passed through a leaky activation, stored to C, and - when
+//      shortcut_out_gpu is non-NULL - added to shortcut_in_gpu.
+//
+// lda and ldb are bit strides: every offset built from them is divided by 8
+// before being used as a byte offset.
+// NOTE(review): A_s holds 6144*8/4 = 12288 bytes and the copy loop has no
+// capacity check; presumably callers guarantee lda*(end_i - start_i)/8 fits - confirm.
+// NOTE(review): the uint32_t/uint64_t/ulonglong4 accesses assume suitably
+// aligned A, B and A_s - confirm.
+__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
+{
+    // total 57%
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // Block-local copy of the rows of A used by this block (byte-addressed).
+    __shared__ uint8_t A_s[6144*8/4];
+    //__shared__ uint64_t A_s[6144];  // 48 KB // [lda x M`]
+    //__shared__ uint8_t A_s[6144*8];  // 48 KB // [lda x M`]
+
+    // First row of A touched by this block, and one past the last row
+    // (rounded up by one).
+    int start_i = blockIdx.x*blockDim.x / N;
+    int end_i = (blockIdx.x*blockDim.x + blockDim.x) / N + 1;
+
+    // Number of bits of A to stage in shared memory.
+    size_t shared_size = lda * (end_i - start_i);
+
+    int i_cur = index / N;
+    int local_i = i_cur - start_i;   // this thread's row relative to the staged rows
+    // ~10%
+    // Each thread copies 64 bits at a time; k and x are bit offsets. Reads past
+    // the end of A (x >= M*lda) are skipped.
+    for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
+        int x = start_i*lda + k;
+        if (x < (M*lda)) *((uint64_t *)(A_s + k / 8)) = *((uint64_t *)(A + x / 8));
+    }
+    __syncthreads();
+
+    int i, j, k; //, h;
+    // 47% = 29 + 10 + 8
+    j = index % N;
+    {    // out_h*out_w - one channel output size [169 - 173056]
+        i = index / N;
+        // The row guard is applied later (see `if (i < M)` below); presumably it
+        // is disabled here so that every lane reaches the warp shuffles in the
+        // cooperative loops.
+        //if (i < M)  // l.n - filters [16 - 55 - 1024]
+        {
+            int count = 0;
+            k = 0;
+
+#ifdef NOT_USED
+            // 32 thread X 256 bit = 8192 bit
+            for (; k < (K - 8192); k += 8192) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                ulonglong4 c_bit256;
+
+                //int64_t A_cur_index = (i*lda + k) / 8;
+                int64_t A_cur_index = (local_i*lda + k) / 8;
+                int64_t B_cur_index = (j*ldb + k) / 8;
+                if (i >= M) A_cur_index = 0;
+
+#pragma unroll
+                for (int t = 0; t < WARP_SIZE; ++t) {
+                    const int lane_id = threadIdx.x % WARP_SIZE;
+
+                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 32 * lane_id;
+                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 32 * lane_id;
+
+                    {
+                        //ulonglong4 a_bit256 = *((ulonglong4 *)(A + A_i));    // weights
+                        ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + A_i));    // weights
+                        ulonglong4 b_bit256 = *((ulonglong4 *)(B + B_i));    // input
+                        c_bit256 = xor_int256(a_bit256, b_bit256);
+                        int tmp_count = __popcll(c_bit256.w) + __popcll(c_bit256.x) +
+                            __popcll(c_bit256.y) + __popcll(c_bit256.z);
+
+                        int sum_count = warpAllReduceSum(tmp_count);
+                        if (lane_id == t) count += sum_count;
+                    }
+                }
+            }
+#endif
+
+
+//#ifdef NOT_USED
+            // 32 thread X 64 bit = 2048 bit // 29%
+            // Warp-cooperative stage: runs while more than 2048 bits remain.
+            for (; k < (K - 2048); k += 2048) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                uint64_t c_bit64;
+
+                //int64_t A_cur_index = (i*lda + k) / 8;
+                int64_t A_cur_index = (local_i*lda + k) / 8;
+                int64_t B_cur_index = (j*ldb + k) / 8;
+                // Out-of-range rows read from the start of A_s; their count is never stored.
+                if (i >= M) A_cur_index = 0;
+
+                #pragma unroll
+                for (int t = 0; t < WARP_SIZE; ++t) {
+                    const int lane_id = threadIdx.x % WARP_SIZE;
+
+                    // Every lane takes lane t's base offsets and reads its own
+                    // consecutive 8-byte slice of that chunk.
+                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 8 * lane_id;
+                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 8 * lane_id;
+
+                    {
+                        //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
+                        uint64_t a_bit64 = *((uint64_t *)(A_s + A_i));    // weights
+                        uint64_t b_bit64 = *((uint64_t *)(B + B_i));    // input
+                        c_bit64 = xor_int64(a_bit64, b_bit64);
+                        int tmp_count = __popcll(c_bit64);
+
+                        // The warp-wide total belongs to lane t's output element.
+                        int sum_count = warpAllReduceSum(tmp_count);
+                        if (lane_id == t) count += sum_count;
+                    }
+                }
+            }
+//#endif
+
+//#ifdef NOT_USED
+            // 32 thread X 32 bit = 1024 bit // 10%
+            // Same scheme with 4-byte slices: runs while more than 1024 bits remain.
+            for (; k < (K - 1024); k += 1024) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+
+                //int64_t A_cur_index = (i*lda + k) / 8;
+                int64_t A_cur_index = (local_i*lda + k) / 8;
+                int64_t B_cur_index = (j*ldb + k) / 8;
+                if (i >= M) A_cur_index = 0;
+
+                #pragma unroll
+                for (int t = 0; t < WARP_SIZE; ++t) {
+                    const int lane_id = threadIdx.x % WARP_SIZE;
+
+                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 4 * lane_id;
+                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 4 * lane_id;
+
+                    {
+                        //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
+                        uint32_t a_bit32 = *((uint32_t *)(A_s + A_i));    // weights
+                        uint32_t b_bit32 = *((uint32_t *)(B + B_i));    // input
+                        uint32_t c_bit32 = xor_int32(a_bit32, b_bit32);
+                        int tmp_count = __popc(c_bit32);
+
+                        int sum_count = warpAllReduceSum(tmp_count);
+                        if (lane_id == t) count += sum_count;
+                    }
+                }
+            }
+//#endif
+
+            // From here on no warp-level primitives are used, so threads outside
+            // the matrix can be excluded.
+            if (i < M)
+            {
+                float mean_val = mean_arr[i];
+                float bias_val = bias_arr[i];
+
+//#ifdef NOT_USED
+                // 8%
+                // Per-thread tail: the remaining bits, 256 at a time.
+                for (; k < K; k += 256) {   // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+                    //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
+                    ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8));    // weights
+                    ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));    // input
+                    ulonglong4 c_bit256 = xor_int256(a_bit256, b_bit256);
+
+                    count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
+                        __popcll(c_bit256.y) + __popcll(c_bit256.z);
+                }
+//#endif
+
+#ifdef NOT_USED
+                for (; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                    //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
+                    uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8));    // weights
+                    uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));            // input
+                    uint64_t c_bit64 = xor_int64(a_bit64, b_bit64);
+
+                    count += __popcll(c_bit64);
+                }
+#endif
+
+                // f1 = number of padding bits between K and the next multiple of 256.
+                const int bit_step = 256;
+                int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                count = count - f1;    // remove extra bits (from empty space for align only)
+                float dst_val = (2 * count - K) *mean_val + bias_val;
+                if(leaky_activation)
+                    dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation
+                size_t out_index = i*ldc + j;
+                C[out_index] = dst_val;
+
+                // Optional fused residual: out = in + result.
+                if (shortcut_out_gpu) {
+                    shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+                }
+            }
+        }
+    }
+}
+
+
+// further optimization - use WMMA GEMM for using Tensor Cores
+// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu
+// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte
+// nvcuda::wmma::col_major ->  cutlass::MatrixLayout::kColumnMajor (matrix is not transposed)
+
+// Matrix A    Matrix B    Accumulator    Matrix Size (m-n-k)
+// precision::b1    precision::b1    int    8x8x128
+
+// The only dimensions currently supported by WMMA for XNOR
+// const int WMMA_M = 8;
+// const int WMMA_N = 8;
+// const int WMMA_K = 128;
+
+
+// GOOD
+// Host-side launcher for the bit-packed binary GEMM with fused bias,
+// optional leaky activation and optional shortcut addition.
+//
+// When compiled against CUDA >= 10.0 the tensor kernel is always selected
+// (one warp per 8 x 16 output tile); otherwise the per-element kernel is
+// launched with one thread per element of C. All arguments are forwarded
+// to the chosen kernel unchanged, and the launch is checked with
+// cudaPeekAtLastError().
+void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
+{
+    // Geometry of the per-element kernel: one thread for each of the M*N outputs.
+    const int per_element_threads = M*N;
+    const int per_element_blocks = get_number_of_blocks(per_element_threads, BLOCK);
+
+#if CUDART_VERSION >= 10000
+    // Hard-wired selection; kept as a branch so a size-based condition can be
+    // substituted later.
+    const int use_tensor_kernel = 1;
+    if (use_tensor_kernel)
+    {
+        // Tile counts use the same "always round up past the size" formula
+        // as the kernel (N is padded to the next multiple of 16 even when it
+        // already is one); the two must stay in step.
+        const int row_tiles = (M + (8 - (M % 8))) / 8;
+        const int col_tiles = (N + (16 - (N % 16))) / 16;
+        const int tensor_threads = row_tiles*col_tiles*WARP_SIZE;
+        const int tensor_blocks = get_number_of_blocks(tensor_threads, BLOCK);
+
+        gemm_nn_custom_bin_mean_transposed_tensor_kernel <<<tensor_blocks, BLOCK, 0, get_cuda_stream() >>> (
+            M, N, K,
+            A, lda,
+            B, ldb,
+            C, ldc,
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
+    }
+    else
+#endif  //# CUDART_VERSION >= 10000
+    {
+        gemm_nn_custom_bin_mean_transposed_gpu_kernel <<<per_element_blocks, BLOCK, 0, get_cuda_stream() >>> (
+            M, N, K,
+            A, lda,
+            B, ldb,
+            C, ldc,
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
+    }
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+// --------------------------------
+
+/*
+void convolve_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
+{
+    int fil;
+    // filter index
+#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    for (fil = 0; fil < n; ++fil) {
+        int chan, y, x, f_y, f_x;
+        // channel index
+        for (chan = 0; chan < in_c; ++chan)
+            // input - y
+            for (y = 0; y < in_h; ++y)
+                // input - x
+                for (x = 0; x < in_w; ++x)
+                {
+                    int const output_index = fil*in_w*in_h + y*in_w + x;
+                    int const weights_pre_index = fil*in_c*size*size + chan*size*size;
+                    int const input_pre_index = chan*in_w*in_h;
+                    float sum = 0;
+
+                    // filter - y
+                    for (f_y = 0; f_y < size; ++f_y)
+                    {
+                        int input_y = y + f_y - pad;
+                        // filter - x
+                        for (f_x = 0; f_x < size; ++f_x)
+                        {
+                            int input_x = x + f_x - pad;
+                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
+
+                            int input_index = input_pre_index + input_y*in_w + input_x;
+                            int weights_index = weights_pre_index + f_y*size + f_x;
+
+                            sum += input[input_index] * weights[weights_index];
+                        }
+                    }
+                    // l.output[filters][width][height] +=
+                    //        state.input[channels][width][height] *
+                    //        l.weights[filters][channels][filter_width][filter_height];
+                    output[output_index] += sum;
+                }
+    }
+
+
+}
+// --------------------------------
+
+
+void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
+    int size, int pad, int new_lda, float *mean_arr_gpu)
+{
+    int fil;
+    // filter index
+#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    for (fil = 0; fil < n; ++fil) {
+        float mean_val = mean_arr_gpu[fil];
+        int chan, y, x, f_y, f_x;
+        // channel index
+        for (chan = 0; chan < in_c; ++chan)
+            // input - y
+            for (y = 0; y < in_h; ++y)
+                // input - x
+                for (x = 0; x < in_w; ++x)
+                {
+                    int const output_index = fil*in_w*in_h + y*in_w + x;
+                    int const weights_pre_index = fil*in_c*size*size + chan*size*size;
+                    int const input_pre_index = chan*in_w*in_h;
+                    int sum = 0;
+                    int good_val = 0;
+
+                    // filter - y
+                    for (f_y = 0; f_y < size; ++f_y)
+                    {
+                        int input_y = y + f_y - pad;
+                        // filter - x
+                        for (f_x = 0; f_x < size; ++f_x)
+                        {
+                            int input_x = x + f_x - pad;
+                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
+
+                            int input_index = input_pre_index + input_y*in_w + input_x;
+                            //int weights_index = weights_pre_index + f_y*size + f_x;
+                            //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
+                            int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
+
+                            //sum += input[input_index] * weights[weights_index];
+
+                            int8_t in_bit = get_bit((uint8_t *)input, input_index);
+                            int8_t w_bit = get_bit((uint8_t *)weights, weights_index);
+                            int res = xnor_bit1(in_bit, w_bit);
+                            sum += res;
+                            good_val++;
+                            //sum += (res > 0) ? 1 : -1;
+                            //in_bit = (in_bit > 0) ? 1 : -1;
+                            //w_bit = (w_bit > 0) ? 1 : -1;
+                            //int8_t res = in_bit*w_bit;
+                            //sum += res;
+                            //printf("\n i: %d x w: %d = res: %d \t sum: %d \t mean = %f \n", in_bit, w_bit, res, sum, mean_val);
+                        }
+                    }
+                    //printf("sum = %d, ", sum);
+                    sum = sum - (good_val - sum);
+                    //printf(" size = %d, sum = %d \n", size, sum);
+
+                    // l.output[filters][width][height] +=
+                    //        state.input[channels][width][height] *
+                    //        l.weights[filters][channels][filter_width][filter_height];
+                    output[output_index] += sum*mean_val;
+                }
+    }
+}
+*/
+// --------------------------------
+
+// Direct 2D convolution with unit stride, one thread per output element.
+//
+// The flat thread index is decomposed as (filter, y, x) over an output of
+// n x in_h x in_w, i.e. the output has the same spatial size as the input.
+// Taps that fall outside the input (after shifting by `pad`) are skipped,
+// which is equivalent to zero padding. The result overwrites output[...].
+//
+// input   : in_c planes of in_h x in_w floats
+// weights : n filters, each in_c x size x size floats
+// output  : n planes of in_h x in_w floats
+__global__ void convolve_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
+{
+    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // Flat index -> (filter, row, column).
+    const int out_x = tid % in_w;
+    const int row_and_filter = tid / in_w;
+    const int out_y = row_and_filter % in_h;
+    const int filter = row_and_filter / in_h;
+
+    // Surplus threads of the last block have nothing to compute.
+    if (filter >= n) return;
+
+    float acc = 0;
+
+    for (int c = 0; c < in_c; ++c)
+    {
+        int const weight_base = filter*in_c*size*size + c*size*size;
+        int const input_base = c*in_w*in_h;
+
+        for (int ky = 0; ky < size; ++ky)
+        {
+            const int src_y = out_y + ky - pad;
+            // The whole kernel row lies outside the image.
+            if (src_y < 0 || src_y >= in_h) continue;
+
+            for (int kx = 0; kx < size; ++kx)
+            {
+                const int src_x = out_x + kx - pad;
+                if (src_x < 0 || src_x >= in_w) continue;
+
+                const int src_index = input_base + src_y*in_w + src_x;
+                const int tap_index = weight_base + ky*size + kx;
+
+                acc += input[src_index] * weights[tap_index];
+            }
+        }
+    }
+
+    output[filter*in_w*in_h + out_y*in_w + out_x] = acc;
+}
+
+// Host-side launcher for convolve_gpu_kernel.
+//
+// Launches one thread per output element (in_w * in_h * n) in blocks of BLOCK
+// threads on the stream returned by get_cuda_stream(), then checks for a
+// launch error.
+void convolve_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
+{
+    const int total_outputs = in_w*in_h*n;    // width X height X filters
+    // "+ 1" covers the partial block; the kernel discards the surplus threads.
+    const int grid = total_outputs / BLOCK + 1;
+
+    convolve_gpu_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (input, weights, output, in_w, in_h, in_c, n, size, pad);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// --------------------------------
+
+/*
+__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
+    int size, int pad, int new_lda, float *mean_arr_gpu)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    int fil;
+    // filter index
+    //for (fil = 0; fil < n; ++fil)
+    int chan, y, x, f_y, f_x;
+    // channel index
+    //for (chan = 0; chan < in_c; ++chan)
+    // input - y
+    //for (y = 0; y < in_h; ++y)
+    // input - x
+    //for (x = 0; x < in_w; ++x)
+    x = index % in_w;
+    int index2 = index / in_w;
+    y = index2 % in_h;
+    fil = index2 / in_h;
+    if (fil < n)    // (1-6 for one BLOCK)
+    {
+                //float mean_val = mean_arr_gpu[fil];
+                int const output_index = fil*in_w*in_h + y*in_w + x;
+                int sum = 0;
+                int good_val = 0;
+
+                for (chan = 0; chan < in_c; ++chan)
+                {
+                    //int const weights_pre_index = fil*in_c*size*size + chan*size*size;
+                    int const weights_pre_index = fil*new_lda + chan*size*size;
+                    int const input_pre_index = chan*in_w*in_h;
+
+                    // filter - y
+                    for (f_y = 0; f_y < size; ++f_y)
+                    {
+                        int input_y = y + f_y - pad;
+                        // filter - x
+                        for (f_x = 0; f_x < size; ++f_x)
+                        {
+                            int input_x = x + f_x - pad;
+                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
+
+                            int input_index = input_pre_index + input_y*in_w + input_x;
+                            int weights_index = weights_pre_index + f_y*size + f_x;
+                            //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
+                            //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
+
+                            uint8_t in_bit = get_bit((uint8_t *)input, input_index);
+                            uint8_t w_bit = get_bit((uint8_t *)weights, weights_index);
+                            int res = xnor_bit1(in_bit, w_bit);
+                            sum += res;
+                            good_val++;
+
+                            //sum += input[input_index] *weights[weights_index];
+
+                        }
+                    }
+                    // l.output[filters][width][height] +=
+                    //        state.input[channels][width][height] *
+                    //        l.weights[filters][channels][filter_width][filter_height];
+                    //output[output_index] += sum;
+                }
+                sum = sum - (good_val - sum);
+                output[output_index] = sum * mean_arr_gpu[fil]; // atomicAdd for inter-BLOCK sum
+    }
+
+}
+*/
+
+// Binary (XNOR) convolution kernel working on bit-packed input and weights.
+//
+// Each thread owns one output element (filter, y, x) decoded from its flat
+// thread index, x varying fastest. `input` and `weights` are declared float*
+// but are only ever read as packed words/bits here (cast to uint32_t* for the
+// copies into shared memory, to uint8_t* for get_bit()).
+//
+//   in_w, in_h, in_c - input width, height and channel count
+//   n                - number of filters
+//   size, pad        - side of the square kernel and the tap offset
+//   new_lda          - distance, in bits, between consecutive filters in
+//                      `weights` (used both as a bit offset and, /32, as a
+//                      word offset)
+//   mean_arr_gpu     - per-filter factor multiplied into the result
+//
+// The per-element result is *added* to output[] with atomicAdd, so the buffer
+// presumably has to be initialised by the caller - TODO confirm.
+//
+// NOTE(review): both shared arrays have fixed compile-time sizes and nothing
+// in this kernel checks size, in_c, new_lda, in_w or in_h against them.
+__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
+    int size, int pad, int new_lda, float *mean_arr_gpu)
+{
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // index -> (fil, y, x)
+    int fil;
+    int chan, y, x, f_y, f_x;
+    x = index % in_w;
+    int index2 = index / in_w;
+    y = index2 % in_h;
+    fil = index2 / in_h;
+
+    // There is deliberately no early return for fil >= n: every thread of the
+    // block has to reach the __syncthreads() calls below and takes part in the
+    // cooperative copies. Surplus threads only skip the compute and the store.
+    int const output_index = fil*in_w*in_h + y*in_w + x;
+    int sum = 0;        // number of taps counted by xnor_bit1()
+    int good_val = 0;   // number of taps that were inside the input
+
+    // Range of filters touched by the threads of this block.
+    int min_index = blockIdx.x*blockDim.x;
+    int min_fil = (min_index / in_w) / in_h;
+    int max_index = (blockIdx.x+1)*blockDim.x - 1;
+    int max_fil = (max_index / in_w) / in_h;
+    // The grid is rounded up, so the last block can nominally reach past the
+    // last filter; never stage weights for filters that do not exist.
+    if (max_fil > n - 1) max_fil = n - 1;
+
+    // 3*3*1024*6 bits (+1 word), about 7 KB
+    __shared__ uint32_t weights_shared[3*3*1024*6/32 + 1];
+    const int weights_size = size*size*in_c / 32 + 1;   // words per filter
+
+    // Cooperative copy of this block's filters into shared memory.
+    for (int tmp_fil = min_fil; tmp_fil <= max_fil; tmp_fil++) {
+        for (int s = threadIdx.x; s < weights_size; s += blockDim.x) {
+            weights_shared[s + (tmp_fil - min_fil)*new_lda/32] = ((uint32_t *)weights)[tmp_fil*new_lda / 32 + s];
+        }
+    }
+    __syncthreads();
+
+    for (chan = 0; chan < in_c; ++chan)
+    {
+        int const input_pre_index = chan*in_w*in_h;     // first bit of this channel
+
+        // 416*416 bits (+1 word), about 21 KB: room for one packed channel
+        __shared__ uint32_t input_shared[416*416/32 + 1];
+        const int input_shared_size = in_w*in_h / 32 + 1;
+        // The copy starts at a word boundary; this is the channel's bit offset
+        // inside that first word.
+        const int add_input_index = input_pre_index % 32;
+        // input_shared is reused on every iteration: all threads must have
+        // finished reading the previous channel before it is overwritten.
+        __syncthreads();
+
+        for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) {
+            input_shared[s] = ((uint32_t *)input)[input_pre_index / 32 + s];
+        }
+        __syncthreads();
+
+        if (fil < n)
+        {
+            // filter - y
+            for (f_y = 0; f_y < size; ++f_y)
+            {
+                int input_y = y + f_y - pad;
+                // filter - x
+                for (f_x = 0; f_x < size; ++f_x)
+                {
+                    int input_x = x + f_x - pad;
+                    if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
+
+                    int weights_shared_index = (fil - min_fil)*new_lda + chan*size*size + f_y*size + f_x;
+                    uint8_t w_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
+
+                    int input_shared_index = input_y*in_w + input_x + add_input_index;
+                    uint8_t in_bit = get_bit((uint8_t *)input_shared, input_shared_index);
+
+                    int res = xnor_bit1(in_bit, w_bit);
+                    sum += res;
+                    good_val++;
+                }
+            }
+        }
+    }
+
+    // Only threads that map to a real filter may touch output[] and
+    // mean_arr_gpu[]; for the surplus threads both indices are out of range.
+    if (fil < n)
+    {
+        // sum - (good_val - sum): counted taps minus the remaining taps
+        // (presumably matches minus mismatches - depends on xnor_bit1()).
+        sum = sum - (good_val - sum);
+        atomicAdd(&output[output_index], sum * mean_arr_gpu[fil]);
+    }
+}
+
+// Host-side launcher for convolve_bin_gpu_kernel.
+//
+// Uses one thread per output element (in_w * in_h * n), BLOCK threads per
+// block, on the stream returned by get_cuda_stream(); all arguments are
+// forwarded unchanged. A launch error is reported through CHECK_CUDA.
+void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
+    int size, int pad, int new_lda, float *mean_arr_gpu)
+{
+    const int total_outputs = in_w*in_h*n;    // width X height X filters
+    // rounded up by one block so a partial last block is still launched
+    const int grid = total_outputs / BLOCK + 1;
+
+    convolve_bin_gpu_kernel <<<grid, BLOCK, 0, get_cuda_stream() >>> (input, weights, output, in_w, in_h, in_c, n, size, pad, new_lda, mean_arr_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+// --------------------------------
+
+// CUDA: threads per block used by the Caffe-derived launches below
+const int CAFFE_CUDA_NUM_THREADS = 512;
+
+// CUDA: number of blocks needed so that blocks * CAFFE_CUDA_NUM_THREADS >= N
+// (integer ceiling division).
+inline int CAFFE_GET_BLOCKS(const int N) {
+    const int threads = CAFFE_CUDA_NUM_THREADS;
+    const int rounded_up = N + threads - 1;
+    return rounded_up / threads;
+}
+
+// CUDA: grid stride looping
+// Expands to a `for` header that declares `int i`, starts it at the thread's
+// global index and advances it by the total number of launched threads
+// (blockDim.x * gridDim.x) until it reaches n. The statement or block that
+// follows the macro becomes the loop body, so a launch with fewer than n
+// threads still visits every index below n.
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
+//
+// im2col kernel: each loop index stands for one (channel, output row, output
+// column) triple. For it, the kernel walks the kernel_h x kernel_w window
+// (with stride, padding and dilation applied) and writes one value per tap to
+// data_col; taps whose source pixel lies outside the height x width image are
+// written as 0. Consecutive taps of the same window are height_col*width_col
+// elements apart in data_col.
+//
+//   n        - number of (channel, row, column) triples to process
+//   data_im  - source image, read as planes of height*width floats
+//   data_col - destination column buffer
+__global__ void im2col_gpu_kernel_ext(const int n, const float* data_im,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int height_col, const int width_col,
+    float* data_col) {
+    CUDA_KERNEL_LOOP(index, n) {
+        // index -> (c_im, h_col, w_col), w_col fastest
+        const int w_col = index % width_col;
+        const int row_major = index / width_col;
+        const int h_col = row_major % height_col;
+        const int c_im = row_major / height_col;
+
+        // first row of data_col that belongs to this input channel
+        const int c_col = c_im * kernel_h * kernel_w;
+
+        // top-left source pixel of the window; negative inside the padding
+        const int h_start = h_col * stride_h - pad_h;
+        const int w_start = w_col * stride_w - pad_w;
+
+        float* dst = data_col + ((c_col * height_col + h_col) * width_col + w_col);
+
+        for (int i = 0; i < kernel_h; ++i) {
+            const int h_im = h_start + i * dilation_h;
+            for (int j = 0; j < kernel_w; ++j) {
+                const int w_im = w_start + j * dilation_w;
+
+                float value = 0;
+                if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+                    value = data_im[(c_im * height + h_im) * width + w_im];
+                }
+                *dst = value;
+
+                // next tap of the same window
+                dst += height_col * width_col;
+            }
+        }
+    }
+}
+
+
+// Host-side launcher for im2col_gpu_kernel_ext.
+//
+// Derives the output grid size from the image size, padding, stride and the
+// dilated kernel extent, then launches one logical work item per
+// (channel, output row, output column).
+//
+// NOTE(review): unlike the convolve_* launchers in this file, this launch
+// passes no stream argument - confirm that is intended.
+void im2col_gpu_ext(const float* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    float* data_col)
+{
+    // Size of the kernel footprint once dilation is applied.
+    const int extent_h = dilation_h * (kernel_h - 1) + 1;
+    const int extent_w = dilation_w * (kernel_w - 1) + 1;
+
+    const int height_col = (height + 2 * pad_h - extent_h) / stride_h + 1;
+    const int width_col = (width + 2 * pad_w - extent_w) / stride_w + 1;
+
+    // Each work item copies one window of a single channel.
+    const int num_kernels = channels * height_col * width_col;
+
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    im2col_gpu_kernel_ext <<<CAFFE_GET_BLOCKS(num_kernels),
+        CAFFE_CUDA_NUM_THREADS >>>(
+            num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+            pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
+            width_col, data_col);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/image.c b/darknet-master/src/image.c
new file mode 100644
index 0000000..b238db3
--- /dev/null
+++ b/darknet-master/src/image.c
@@ -0,0 +1,1710 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "image.h"
+#include "utils.h"
+#include "blas.h"
+#include "dark_cuda.h"
+#include <stdio.h>
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#ifndef STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+#endif
+#ifndef STB_IMAGE_WRITE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include <stb_image_write.h>
+#endif
+
+float colors[6][3] = { {1,0,1}, {0,0,1},{0,1,1},{0,1,0},{1,1,0},{1,0,0} };
+
+// Returns component `c` of a colour picked from the six-entry `colors` table.
+// x/max is scaled to the range of table positions 0..5 and the two
+// neighbouring entries are blended linearly.
+float get_color(int c, int x, int max)
+{
+    const float position = ((float)x/max)*5;
+    const int lower = floor(position);
+    const int upper = ceil(position);
+    const float weight = position - lower;   // share taken from the upper entry
+    const float blended = (1-weight) * colors[lower][c] + weight*colors[upper][c];
+    return blended;
+}
+
+// Reads the value at column x, row y of channel c. Only the upper bounds are
+// asserted; callers must not pass negative coordinates.
+static float get_pixel(image m, int x, int y, int c)
+{
+    assert(x < m.w && y < m.h && c < m.c);
+    const int offset = c*m.h*m.w + y*m.w + x;
+    return m.data[offset];
+}
+// Bounds-tolerant read: returns 0 for any coordinate or channel outside the
+// image, otherwise the stored value.
+static float get_pixel_extend(image m, int x, int y, int c)
+{
+    const int inside_plane = (x >= 0 && x < m.w && y >= 0 && y < m.h);
+    const int valid_channel = (c >= 0 && c < m.c);
+    if (!inside_plane || !valid_channel) return 0;
+    return get_pixel(m, x, y, c);
+}
+// Bounds-tolerant write: stores val at (x, y, c) and silently ignores any
+// coordinate or channel outside the image.
+static void set_pixel(image m, int x, int y, int c, float val)
+{
+    const int in_bounds = (x >= 0 && y >= 0 && c >= 0 && x < m.w && y < m.h && c < m.c);
+    if (in_bounds) {
+        m.data[c*m.h*m.w + y*m.w + x] = val;
+    }
+}
+// Adds val to the value stored at (x, y, c). Only the upper bounds are
+// asserted; callers must not pass negative coordinates.
+static void add_pixel(image m, int x, int y, int c, float val)
+{
+    assert(x < m.w && y < m.h && c < m.c);
+    float *target = m.data + (c*m.h*m.w + y*m.w + x);
+    *target += val;
+}
+
+// Multiplies `source` into `dest` with its top-left corner placed at (dx, dy):
+// every destination value covered by the source becomes source * dest.
+// Source pixels that land outside `dest` are dropped.
+void composite_image(image source, image dest, int dx, int dy)
+{
+    int ch, row, col;
+    for (ch = 0; ch < source.c; ++ch) {
+        for (row = 0; row < source.h; ++row) {
+            const int dst_y = dy + row;
+            for (col = 0; col < source.w; ++col) {
+                const int dst_x = dx + col;
+                const float src_val = get_pixel(source, col, row, ch);
+                const float dst_val = get_pixel_extend(dest, dst_x, dst_y, ch);
+                set_pixel(dest, dst_x, dst_y, ch, src_val * dst_val);
+            }
+        }
+    }
+}
+
+// Returns a new image that is `a` surrounded by a frame `border` pixels wide
+// on every side; frame pixels are set to 1. The caller owns the result.
+image border_image(image a, int border)
+{
+    image framed = make_image(a.w + 2*border, a.h + 2*border, a.c);
+    int ch, row, col;
+    for (ch = 0; ch < framed.c; ++ch) {
+        for (row = 0; row < framed.h; ++row) {
+            const int src_y = row - border;
+            for (col = 0; col < framed.w; ++col) {
+                const int src_x = col - border;
+                const int in_frame = (src_x < 0 || src_x >= a.w || src_y < 0 || src_y >= a.h);
+                const float val = in_frame ? 1 : get_pixel_extend(a, src_x, src_y, ch);
+                set_pixel(framed, col, row, ch, val);
+            }
+        }
+    }
+    return framed;
+}
+
+// Places `b` to the right of `a`, separated by `dx` pixels (dx may be
+// negative to overlap), on a canvas pre-filled with 1. `a` is embedded at the
+// origin and `b` is multiplied in with composite_image(). When `a` has zero
+// width a copy of `b` is returned instead. The caller owns the result.
+image tile_images(image a, image b, int dx)
+{
+    if (a.w == 0) return copy_image(b);
+
+    const int out_w = a.w + b.w + dx;
+    const int out_h = (a.h > b.h) ? a.h : b.h;
+    const int out_c = (a.c > b.c) ? a.c : b.c;
+
+    image c = make_image(out_w, out_h, out_c);
+    fill_cpu(c.w*c.h*c.c, 1, c.data, 1);
+    embed_image(a, c, 0, 0);
+    composite_image(b, c, a.w + dx, 0);
+    return c;
+}
+
+// Renders `string` as an image by tiling glyph images side by side and adding
+// a border a quarter of the label height wide.
+//
+//   characters - glyph tables indexed as characters[size][character code]
+//   string     - NUL-terminated text to render
+//   size       - glyph size index, clamped to 0..7
+//
+// The glyph tables built by load_alphabet() hold 128 entries per size, so
+// bytes outside 0..127 (for example UTF-8 continuation bytes) are skipped
+// rather than used as an index: with a signed `char` they would be negative,
+// with an unsigned one they would be past the end of the table.
+// The caller owns the returned image.
+image get_label(image **characters, char *string, int size)
+{
+    if(size > 7) size = 7;
+    if(size < 0) size = 0;
+    image label = make_empty_image(0,0,0);
+    for(; *string; ++string){
+        const unsigned char code = (unsigned char)*string;
+        if(code >= 128) continue;   // no glyph slot for this byte
+        image l = characters[size][code];
+        // negative spacing pulls neighbouring glyphs together
+        image n = tile_images(label, l, -size - 1 + (size+1)/2);
+        free_image(label);
+        label = n;
+    }
+    image b = border_image(label, label.h*.25);
+    free_image(label);
+    return b;
+}
+
+// Variant of get_label(): `size` is first divided by 10 and then clamped to
+// the glyph size range 0..7, and the border is 5% of the label height.
+//
+//   characters - glyph tables indexed as characters[size][character code]
+//   string     - NUL-terminated text to render
+//   size       - ten times the wanted glyph size index
+//
+// The glyph tables built by load_alphabet() hold 128 entries per size, so
+// bytes outside 0..127 (for example UTF-8 continuation bytes) are skipped
+// rather than used as an index: with a signed `char` they would be negative,
+// with an unsigned one they would be past the end of the table.
+// The caller owns the returned image.
+image get_label_v3(image **characters, char *string, int size)
+{
+    size = size / 10;
+    if (size > 7) size = 7;
+    if (size < 0) size = 0;
+    image label = make_empty_image(0, 0, 0);
+    for (; *string; ++string) {
+        const unsigned char code = (unsigned char)*string;
+        if (code >= 128) continue;   // no glyph slot for this byte
+        image l = characters[size][code];
+        // negative spacing pulls neighbouring glyphs together
+        image n = tile_images(label, l, -size - 1 + (size + 1) / 2);
+        free_image(label);
+        label = n;
+    }
+    image b = border_image(label, label.h*.05);
+    free_image(label);
+    return b;
+}
+
+// Stamps `label` onto `a`, each channel scaled by rgb[channel].
+//
+// (r, c) is the anchor row/column. When there is room, the label is moved up
+// by its own height so that it ends at row r; otherwise it starts at r. The
+// part that would run past the right or bottom edge of `a` is clipped.
+void draw_label(image a, int r, int c, image label, const float *rgb)
+{
+    const int label_w = label.w;
+    const int label_h = label.h;
+    if (r >= label_h) r -= label_h;
+
+    int row, col, ch;
+    for (row = 0; row < label_h; ++row) {
+        const int dst_y = row + r;
+        if (dst_y >= a.h) break;
+        for (col = 0; col < label_w; ++col) {
+            const int dst_x = col + c;
+            if (dst_x >= a.w) break;
+            for (ch = 0; ch < label.c; ++ch) {
+                const float val = get_pixel(label, col, row, ch);
+                set_pixel(a, dst_x, dst_y, ch, rgb[ch] * val);
+            }
+        }
+    }
+}
+
+// Alpha-blends `label` onto `a`:
+//     dst = label * rgb[channel] * alpha + dst * (1 - alpha)
+//
+// (r, c) is the anchor row/column. When there is room, the label is moved up
+// by its own height so that it ends at row r; otherwise it starts at r. The
+// part that would fall outside `a` is clipped.
+//
+// Only channels present in both images are blended. Reading `a` goes through
+// get_pixel(), which does not tolerate a channel >= a.c, so a multi-channel
+// label drawn onto a single-channel image must not visit the extra channels.
+void draw_weighted_label(image a, int r, int c, image label, const float *rgb, const float alpha)
+{
+    int w = label.w;
+    int h = label.h;
+    if (r - h >= 0) r = r - h;
+
+    const int channels = (label.c < a.c) ? label.c : a.c;
+
+    int i, j, k;
+    for (j = 0; j < h && j + r < a.h; ++j) {
+        if (j + r < 0) continue;        // row above the image
+        for (i = 0; i < w && i + c < a.w; ++i) {
+            if (i + c < 0) continue;    // column left of the image
+            for (k = 0; k < channels; ++k) {
+                float val1 = get_pixel(label, i, j, k);
+                float val2 = get_pixel(a, i + c, j + r, k);
+                float val_dst = val1 * rgb[k] * alpha + val2 * (1 - alpha);
+                set_pixel(a, i + c, j + r, k, val_dst);
+            }
+        }
+    }
+}
+
+// Draws a one-pixel rectangle outline into channel 0 of `a`.
+// The corners are clamped to the image before drawing; every outline pixel is
+// set to `brightness`.
+void draw_box_bw(image a, int x1, int y1, int x2, int y2, float brightness)
+{
+    const int last_col = a.w - 1;
+    const int last_row = a.h - 1;
+    int i;
+
+    // clamp the horizontal extent
+    if (x1 < 0) x1 = 0;
+    if (x1 > last_col) x1 = last_col;
+    if (x2 < 0) x2 = 0;
+    if (x2 > last_col) x2 = last_col;
+
+    // clamp the vertical extent
+    if (y1 < 0) y1 = 0;
+    if (y1 > last_row) y1 = last_row;
+    if (y2 < 0) y2 = 0;
+    if (y2 > last_row) y2 = last_row;
+
+    // top and bottom edges
+    for (i = x1; i <= x2; ++i) {
+        a.data[y1*a.w + i] = brightness;
+        a.data[y2*a.w + i] = brightness;
+    }
+    // left and right edges
+    for (i = y1; i <= y2; ++i) {
+        a.data[i*a.w + x1] = brightness;
+        a.data[i*a.w + x2] = brightness;
+    }
+}
+
+// Draws a rectangle outline `w` pixels thick into a single-channel image by
+// nesting `w` one-pixel outlines, each inset by one more pixel.
+//
+// Successive rings alternate between `brightness` and its complement
+// (1 - brightness), starting with `brightness` on the outermost ring. The
+// shade is chosen from the ring index; choosing it from the loop-invariant
+// total width gave every ring the same shade, so nothing alternated.
+void draw_box_width_bw(image a, int x1, int y1, int x2, int y2, int w, float brightness)
+{
+    int i;
+    for (i = 0; i < w; ++i) {
+        float alternate_color = (i % 2) ? (1.0 - brightness) : (brightness);
+        draw_box_bw(a, x1 + i, y1 + i, x2 - i, y2 - i, alternate_color);
+    }
+}
+
+// Draws a one-pixel rectangle outline into the first three channels of `a`
+// (r into channel 0, g into channel 1, b into channel 2). The corners are
+// clamped to the image before drawing.
+void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b)
+{
+    const float channel_value[3] = { r, g, b };
+    const int plane = a.w*a.h;
+    int ch, i;
+
+    // clamp the horizontal extent
+    if(x1 < 0) x1 = 0;
+    if(x1 >= a.w) x1 = a.w-1;
+    if(x2 < 0) x2 = 0;
+    if(x2 >= a.w) x2 = a.w-1;
+
+    // clamp the vertical extent
+    if(y1 < 0) y1 = 0;
+    if(y1 >= a.h) y1 = a.h-1;
+    if(y2 < 0) y2 = 0;
+    if(y2 >= a.h) y2 = a.h-1;
+
+    for(ch = 0; ch < 3; ++ch){
+        const int base = ch*plane;
+        const float value = channel_value[ch];
+
+        // top and bottom edges
+        for(i = x1; i <= x2; ++i){
+            a.data[i + y1*a.w + base] = value;
+            a.data[i + y2*a.w + base] = value;
+        }
+        // left and right edges
+        for(i = y1; i <= y2; ++i){
+            a.data[x1 + i*a.w + base] = value;
+            a.data[x2 + i*a.w + base] = value;
+        }
+    }
+}
+
+// Draws a rectangle outline `w` pixels thick by nesting `w` one-pixel
+// outlines, each inset by one more pixel than the previous one.
+void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b)
+{
+    int inset = 0;
+    while (inset < w) {
+        draw_box(a, x1 + inset, y1 + inset, x2 - inset, y2 - inset, r, g, b);
+        ++inset;
+    }
+}
+
+// Draws `bbox` as an outline `w` pixels thick. The box is given by its centre
+// (x, y) and size (w, h) as fractions of the image; it is scaled by the image
+// width/height to get pixel corners.
+void draw_bbox(image a, box bbox, int w, float r, float g, float b)
+{
+    const int left  = (bbox.x-bbox.w/2)*a.w;
+    const int right = (bbox.x+bbox.w/2)*a.w;
+    const int top   = (bbox.y-bbox.h/2)*a.h;
+    const int bot   = (bbox.y+bbox.h/2)*a.h;
+
+    // draw_box_width() performs the same nested draw_box() calls.
+    draw_box_width(a, left, top, right, bot, w, r, g, b);
+}
+
+// Loads the glyph images used to render labels.
+//
+// Returns 8 tables (one per glyph size) of 128 images each. Entries 32..126
+// are loaded from "data/labels/<code>_<size>.png"; the remaining entries stay
+// zero-initialised. Release the result with free_alphabet().
+image **load_alphabet()
+{
+    const int nsize = 8;
+    image** alphabets = (image**)xcalloc(nsize, sizeof(image*));
+    int size_idx;
+    for (size_idx = 0; size_idx < nsize; ++size_idx) {
+        image* glyphs = (image*)xcalloc(128, sizeof(image));
+        int code;
+        for (code = 32; code < 127; ++code) {
+            char path[256];
+            sprintf(path, "data/labels/%d_%d.png", code, size_idx);
+            glyphs[code] = load_image_color(path, 0, 0);
+        }
+        alphabets[size_idx] = glyphs;
+    }
+    return alphabets;
+}
+
+// Releases glyph tables laid out as 8 tables of images: frees the images at
+// entries 32..126 of each table, then each table, then the outer array.
+void free_alphabet(image **alphabet)
+{
+    const int nsize = 8;
+    int size_idx;
+    for (size_idx = 0; size_idx < nsize; ++size_idx) {
+        image *glyphs = alphabet[size_idx];
+        int code;
+        for (code = 32; code < 127; ++code) {
+            free_image(glyphs[code]);
+        }
+        free(glyphs);
+    }
+    free(alphabet);
+}
+
+
+
+// Builds the list of detections that have at least one class whose
+// probability exceeds `thresh`, recording the most probable such class as
+// best_class. Classes whose name starts with "dont_show" are never selected.
+//
+// The returned array has room for dets_num entries; the number actually used
+// is stored in *selected_detections_num when that pointer is not NULL. The
+// caller frees the array.
+detection_with_class* get_actual_detections(detection *dets, int dets_num, float thresh, int* selected_detections_num, char **names)
+{
+    detection_with_class* picked = (detection_with_class*)xcalloc(dets_num, sizeof(detection_with_class));
+    int picked_count = 0;
+    int i, j;
+    for (i = 0; i < dets_num; ++i) {
+        int winner = -1;
+        float winner_prob = thresh;   // a class has to beat the threshold first
+        for (j = 0; j < dets[i].classes; ++j) {
+            const int hidden = (strncmp(names[j], "dont_show", 9) == 0);
+            if (hidden) continue;
+            if (dets[i].prob[j] > winner_prob) {
+                winner = j;
+                winner_prob = dets[i].prob[j];
+            }
+        }
+        if (winner < 0) continue;
+
+        picked[picked_count].det = dets[i];
+        picked[picked_count].best_class = winner;
+        ++picked_count;
+    }
+    if (selected_detections_num)
+        *selected_detections_num = picked_count;
+    return picked;
+}
+
+// qsort comparator for detection_with_class elements: orders by the left edge
+// of the box (bbox.x - bbox.w/2), smallest first.
+int compare_by_lefts(const void *a_ptr, const void *b_ptr) {
+    const detection_with_class* a = (detection_with_class*)a_ptr;
+    const detection_with_class* b = (detection_with_class*)b_ptr;
+    const float delta = (a->det.bbox.x - a->det.bbox.w/2) - (b->det.bbox.x - b->det.bbox.w/2);
+    // sign of delta: -1, 0 or 1
+    return (delta > 0) - (delta < 0);
+}
+
+// qsort comparator for detection_with_class elements: orders by the
+// probability of each element's best_class, lowest first.
+int compare_by_probs(const void *a_ptr, const void *b_ptr) {
+    const detection_with_class* a = (detection_with_class*)a_ptr;
+    const detection_with_class* b = (detection_with_class*)b_ptr;
+    float delta = a->det.prob[a->best_class] - b->det.prob[b->best_class];
+    // sign of delta: -1, 0 or 1
+    return (delta > 0) - (delta < 0);
+}
+
+// Prints one "name: NN%" console line for a detection class, followed either
+// by the box position and size in pixels (ext_output != 0) or by a newline.
+static void print_detection_line(const char *name, float prob, box bbox, image im, int ext_output)
+{
+    printf("%s: %.0f%%", name, prob * 100);
+    if (ext_output)
+        printf("\t(left_x: %4.0f   top_y: %4.0f   width: %4.0f   height: %4.0f)\n",
+            round((bbox.x - bbox.w / 2)*im.w),
+            round((bbox.y - bbox.h / 2)*im.h),
+            round(bbox.w*im.w), round(bbox.h*im.h));
+    else
+        printf("\n");
+}
+
+// Reports and draws the detections whose best class exceeds `thresh`.
+//
+//   im         - image drawn into (modified in place)
+//   dets, num  - detections to consider
+//   thresh     - minimum class probability
+//   names      - class names, indexed by class id
+//   alphabet   - glyph tables for text labels; NULL disables labels
+//   classes    - number of classes
+//   ext_output - non-zero to also print box coordinates
+//
+// Console output is ordered left to right. Drawing is ordered by ascending
+// probability, so more confident detections are drawn later. Single-channel
+// images get a grey outline, other images a per-class colour. A detection
+// with a mask additionally gets the thresholded mask embedded at its box.
+//
+// The label text is assembled with length-bounded calls; an over-long list of
+// class names is truncated instead of overrunning the buffer.
+void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
+{
+    int selected_detections_num;
+    detection_with_class* selected_detections = get_actual_detections(dets, num, thresh, &selected_detections_num, names);
+
+    // text output
+    qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_lefts);
+    int i;
+    for (i = 0; i < selected_detections_num; ++i) {
+        const int best_class = selected_detections[i].best_class;
+        const detection *det = &selected_detections[i].det;
+
+        print_detection_line(names[best_class], det->prob[best_class], det->bbox, im, ext_output);
+
+        // every other class above the threshold gets its own line
+        int j;
+        for (j = 0; j < classes; ++j) {
+            if (det->prob[j] > thresh && j != best_class) {
+                print_detection_line(names[j], det->prob[j], det->bbox, im, ext_output);
+            }
+        }
+    }
+
+    // image output
+    qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_probs);
+
+    // outline thickness depends only on the image height
+    int width = im.h * .002;
+    if (width < 1)
+        width = 1;
+
+    for (i = 0; i < selected_detections_num; ++i) {
+        const int best_class = selected_detections[i].best_class;
+        const detection *det = &selected_detections[i].det;
+
+        // spread class ids over the colour table
+        int offset = best_class * 123457 % classes;
+        float red = get_color(2, offset, classes);
+        float green = get_color(1, offset, classes);
+        float blue = get_color(0, offset, classes);
+        float rgb[3];
+
+        rgb[0] = red;
+        rgb[1] = green;
+        rgb[2] = blue;
+        box b = det->bbox;
+
+        int left = (b.x - b.w / 2.)*im.w;
+        int right = (b.x + b.w / 2.)*im.w;
+        int top = (b.y - b.h / 2.)*im.h;
+        int bot = (b.y + b.h / 2.)*im.h;
+
+        if (left < 0) left = 0;
+        if (right > im.w - 1) right = im.w - 1;
+        if (top < 0) top = 0;
+        if (bot > im.h - 1) bot = im.h - 1;
+
+        if (im.c == 1) {
+            draw_box_width_bw(im, left, top, right, bot, width, 0.8);    // 1 channel Black-White
+        }
+        else {
+            draw_box_width(im, left, top, right, bot, width, red, green, blue); // 3 channels RGB
+        }
+        if (alphabet) {
+            // "<best name>: <prob>" followed by ", <name>" for every other
+            // class above the threshold
+            char labelstr[4096] = { 0 };
+            snprintf(labelstr, sizeof(labelstr), "%s: %.2f", names[best_class], det->prob[best_class]);
+            int j;
+            for (j = 0; j < classes; ++j) {
+                if (det->prob[j] > thresh && j != best_class) {
+                    strncat(labelstr, ", ", sizeof(labelstr) - strlen(labelstr) - 1);
+                    strncat(labelstr, names[j], sizeof(labelstr) - strlen(labelstr) - 1);
+                }
+            }
+            image label = get_label_v3(alphabet, labelstr, (im.h*.02));
+            draw_weighted_label(im, top + width, left, label, rgb, 0.7);
+            free_image(label);
+        }
+        if (det->mask) {
+            // the mask is read as a 14x14 single-channel image
+            image mask = float_to_image(14, 14, 1, det->mask);
+            image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
+            image tmask = threshold_image(resized_mask, .5);
+            embed_image(tmask, im, left, top);
+            free_image(mask);
+            free_image(resized_mask);
+            free_image(tmask);
+        }
+    }
+    free(selected_detections);
+}
+
+// Draws and prints the detections whose most probable class exceeds `thresh`.
+//
+//   im           - image drawn into (modified in place)
+//   num          - number of boxes
+//   thresh       - minimum probability of the best class
+//   boxes, probs - per-detection box and per-class probabilities
+//   names        - class names, indexed by class id
+//   alphabet     - glyph tables for text labels; NULL disables labels
+//   classes      - number of classes
+//
+// For every accepted detection a "name: NN%" line is printed, a coloured
+// outline is drawn and, when `alphabet` is given, the class name is stamped
+// next to the box. The label image is released after drawing.
+void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
+{
+    int i;
+
+    for(i = 0; i < num; ++i){
+        int class_id = max_index(probs[i], classes);
+        float prob = probs[i][class_id];
+        if(!(prob > thresh)) continue;
+
+        // outline thickness depends only on the image height
+        int width = im.h * .012;
+
+        // spread class ids over the colour table
+        int offset = class_id*123457 % classes;
+        float red = get_color(2,offset,classes);
+        float green = get_color(1,offset,classes);
+        float blue = get_color(0,offset,classes);
+        float rgb[3];
+
+        rgb[0] = red;
+        rgb[1] = green;
+        rgb[2] = blue;
+        box b = boxes[i];
+
+        int left  = (b.x-b.w/2.)*im.w;
+        int right = (b.x+b.w/2.)*im.w;
+        int top   = (b.y-b.h/2.)*im.h;
+        int bot   = (b.y+b.h/2.)*im.h;
+
+        if(left < 0) left = 0;
+        if(right > im.w-1) right = im.w-1;
+        if(top < 0) top = 0;
+        if(bot > im.h-1) bot = im.h-1;
+        printf("%s: %.0f%%", names[class_id], prob * 100);
+        printf("\n");
+
+        draw_box_width(im, left, top, right, bot, width, red, green, blue);
+        if (alphabet) {
+            image label = get_label(alphabet, names[class_id], (im.h*.03)/10);
+            draw_label(im, top + width, left, label, rgb);
+            // get_label() returns a freshly allocated image
+            free_image(label);
+        }
+    }
+}
+
+// Transpose a square image in place, channel by channel: the element at
+// (row, col) is exchanged with the element at (col, row).
+// Asserts that im.w == im.h.
+void transpose_image(image im)
+{
+    assert(im.w == im.h);
+    int row, col, ch;
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < im.w-1; ++row){
+            // Only visit the part above the diagonal; each visit swaps a pair.
+            for(col = row + 1; col < im.w; ++col){
+                int upper = col + im.w*(row + im.h*ch);
+                int lower = row + im.w*(col + im.h*ch);
+                float tmp = im.data[upper];
+                im.data[upper] = im.data[lower];
+                im.data[lower] = tmp;
+            }
+        }
+    }
+}
+
+// Rotate a square image in place by `times` quarter turns (the function name
+// says clockwise). Asserts that im.w == im.h.
+// Each quarter turn walks one quadrant of every channel and cycles four
+// elements per visited (x, y) position.
+void rotate_image_cw(image im, int times)
+{
+    assert(im.w == im.h);
+    // 400 is a multiple of 4, so adding it leaves the turn count unchanged
+    // modulo 4 while keeping the operand of % non-negative for times >= -400.
+    times = (times + 400) % 4;
+    int i, x, y, c;
+    int n = im.w;
+    for(i = 0; i < times; ++i){
+        for(c = 0; c < im.c; ++c){
+            for(x = 0; x < n/2; ++x){
+                for(y = 0; y < (n-1)/2 + 1; ++y){
+                    // Four-way cycle: save the first element, shift the other
+                    // three one step along, then store the saved value last.
+                    float temp = im.data[y + im.w*(x + im.h*c)];
+                    im.data[y + im.w*(x + im.h*c)] = im.data[n-1-x + im.w*(y + im.h*c)];
+                    im.data[n-1-x + im.w*(y + im.h*c)] = im.data[n-1-y + im.w*(n-1-x + im.h*c)];
+                    im.data[n-1-y + im.w*(n-1-x + im.h*c)] = im.data[x + im.w*(n-1-y + im.h*c)];
+                    im.data[x + im.w*(n-1-y + im.h*c)] = temp;
+                }
+            }
+        }
+    }
+}
+
+// Mirror the image in place along the horizontal axis of each row: within
+// every row of every channel, column j is swapped with column (w - 1 - j).
+void flip_image(image a)
+{
+    int ch, row, col;
+    for(ch = 0; ch < a.c; ++ch){
+        for(row = 0; row < a.h; ++row){
+            int row_start = a.w*(row + a.h*(ch));
+            for(col = 0; col < a.w/2; ++col){
+                int near_idx = col + row_start;
+                int far_idx = (a.w - col - 1) + row_start;
+                float tmp = a.data[far_idx];
+                a.data[far_idx] = a.data[near_idx];
+                a.data[near_idx] = tmp;
+            }
+        }
+    }
+}
+
+// Build a one-channel image holding, for every pixel position, the square
+// root of the sum over channels of the squared difference between a and b.
+// The result has a's width and height; b is indexed with a's dimensions.
+// The returned image is allocated with make_image().
+image image_distance(image a, image b)
+{
+    int ch, px;
+    image dist = make_image(a.w, a.h, 1);
+    // Accumulate squared differences channel by channel.
+    for(ch = 0; ch < a.c; ++ch){
+        for(px = 0; px < a.h*a.w; ++px){
+            int src = ch*a.h*a.w+px;
+            dist.data[px] += pow(a.data[src]-b.data[src],2);
+        }
+    }
+    // Turn the accumulated sums into distances.
+    for(px = 0; px < a.h*a.w; ++px){
+        dist.data[px] = sqrt(dist.data[px]);
+    }
+    return dist;
+}
+
+// Copy every pixel of `source` into `dest`, shifted by (dx, dy): source pixel
+// (x, y, k) is written to dest pixel (dx + x, dy + y, k) through
+// get_pixel() / set_pixel().
+void embed_image(image source, image dest, int dx, int dy)
+{
+    int col, row, ch;
+    for(ch = 0; ch < source.c; ++ch){
+        for(row = 0; row < source.h; ++row){
+            int dest_row = dy+row;
+            for(col = 0; col < source.w; ++col){
+                set_pixel(dest, dx+col, dest_row, ch, get_pixel(source, col,row,ch));
+            }
+        }
+    }
+}
+
+// Stack the channels of `source` vertically into a new one-channel image,
+// leaving `border` rows between consecutive channels.
+// The result is source.w wide and (source.h + border) * source.c - border high.
+image collapse_image_layers(image source, int border)
+{
+    int stride = source.h+border;
+    image dest = make_image(source.w, stride*source.c - border, 1);
+    int ch;
+    for(ch = 0; ch < source.c; ++ch){
+        // Each channel is extracted into a temporary image and pasted in.
+        image layer = get_image_layer(source, ch);
+        embed_image(layer, dest, 0, ch*stride);
+        free_image(layer);
+    }
+    return dest;
+}
+
+// Clamp every value of the image to the range [0, 1] in place.
+void constrain_image(image im)
+{
+    int total = im.w*im.h*im.c;
+    int idx;
+    for(idx = 0; idx < total; ++idx){
+        float v = im.data[idx];
+        if(v < 0) v = 0;
+        if(v > 1) v = 1;
+        im.data[idx] = v;
+    }
+}
+
+// Rescale the whole image in place with (v - min) / (max - min), where min and
+// max are taken over all channels together. When max - min is smaller than
+// 1e-9 the values 0 and 1 are used instead, which leaves the data unchanged.
+void normalize_image(image p)
+{
+    int total = p.h*p.w*p.c;
+    int idx;
+    float lo = 9999999;
+    float hi = -999999;
+
+    // First pass: find the value range.
+    for(idx = 0; idx < total; ++idx){
+        float v = p.data[idx];
+        if(v < lo) lo = v;
+        if(v > hi) hi = v;
+    }
+    if(hi - lo < .000000001){
+        lo = 0;
+        hi = 1;
+    }
+    // Second pass: rescale.
+    for(idx = 0; idx < total; ++idx){
+        p.data[idx] = (p.data[idx] - lo)/(hi-lo);
+    }
+}
+
+// Per-channel variant of normalize_image(): every channel is rescaled in place
+// with its own (v - min) / (max - min). A channel whose range is smaller than
+// 1e-9 uses min = 0, max = 1, which leaves that channel unchanged.
+void normalize_image2(image p)
+{
+    float* lo = (float*)xcalloc(p.c, sizeof(float));
+    float* hi = (float*)xcalloc(p.c, sizeof(float));
+    int ch, px;
+
+    for(ch = 0; ch < p.c; ++ch){
+        int base = ch*p.h*p.w;
+
+        // Seed the range with the first value of the channel, then scan it.
+        lo[ch] = hi[ch] = p.data[base];
+        for(px = 0; px < p.h*p.w; ++px){
+            float v = p.data[px+base];
+            if(v < lo[ch]) lo[ch] = v;
+            if(v > hi[ch]) hi[ch] = v;
+        }
+        if(hi[ch] - lo[ch] < .000000001){
+            lo[ch] = 0;
+            hi[ch] = 1;
+        }
+        for(px = 0; px < p.w*p.h; ++px){
+            p.data[px+base] = (p.data[px+base] - lo[ch])/(hi[ch]-lo[ch]);
+        }
+    }
+    free(lo);
+    free(hi);
+}
+
+// Copy src's pixel data into dst's existing buffer. The byte count is taken
+// from src's dimensions only; dst.data must already be at least that large
+// (nothing is allocated or checked here).
+void copy_image_inplace(image src, image dst)
+{
+    memcpy(dst.data, src.data, src.h*src.w*src.c * sizeof(float));
+}
+
+// Return a deep copy of `p`: same w/h/c, with a newly allocated data buffer
+// holding a copy of p's values.
+image copy_image(image p)
+{
+    size_t count = p.h * p.w * p.c;
+    image dup = p;
+    dup.data = (float*)xcalloc(count, sizeof(float));
+    memcpy(dup.data, p.data, count*sizeof(float));
+    return dup;
+}
+
+// Swap channel 0 with channel 2 in place (the name suggests RGB <-> BGR).
+// Channel 1 is left untouched.
+void rgbgr_image(image im)
+{
+    int plane = im.w*im.h;
+    int px;
+    for(px = 0; px < plane; ++px){
+        float tmp = im.data[px];
+        im.data[px] = im.data[px+plane*2];
+        im.data[px+plane*2] = tmp;
+    }
+}
+
+// Display image `p` under the title `name`.
+// With OPENCV defined this forwards to show_image_cv(); otherwise nothing can
+// be displayed, so a notice is written to stderr and the image is saved via
+// save_image() using `name` as the file name stem.
+void show_image(image p, const char *name)
+{
+#ifdef OPENCV
+    show_image_cv(p, name);
+#else
+    fprintf(stderr, "Not compiled with OpenCV, saving to %s.jpg instead\n", name);
+    save_image(p, name);
+#endif  // OPENCV
+}
+
+// Write `im` to "<name>.png" using stbi_write_png().
+// The planar float data is converted to interleaved bytes with
+// (unsigned char)(255 * value) before writing.
+// On failure a message is printed to stderr; nothing is returned.
+void save_image_png(image im, const char *name)
+{
+    char buff[256];
+    // snprintf bounds the write: an over-long `name` is truncated instead of
+    // overflowing the stack buffer as the previous sprintf could.
+    snprintf(buff, sizeof(buff), "%s.png", name);
+    unsigned char* data = (unsigned char*)xcalloc(im.w * im.h * im.c, sizeof(unsigned char));
+    int i,k;
+    for(k = 0; k < im.c; ++k){
+        for(i = 0; i < im.w*im.h; ++i){
+            // planar (channel-major) -> interleaved (pixel-major)
+            data[i*im.c+k] = (unsigned char) (255*im.data[i + k*im.w*im.h]);
+        }
+    }
+    int success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    free(data);
+    if(!success) fprintf(stderr, "Failed to write image %s\n", buff);
+}
+
+// Write `im` to "<name>.<ext>" in the format selected by `f`.
+//   f       - PNG, BMP, TGA or JPG; any other value is treated as PNG
+//   quality - passed to stbi_write_jpg() only
+// The planar float data is converted to interleaved bytes with
+// (unsigned char)(255 * value) before writing.
+// On failure a message is printed to stderr; nothing is returned.
+void save_image_options(image im, const char *name, IMTYPE f, int quality)
+{
+    char buff[256];
+    const char *ext = "png";
+    if (f == BMP)      ext = "bmp";
+    else if (f == TGA) ext = "tga";
+    else if (f == JPG) ext = "jpg";
+    // snprintf bounds the write: an over-long `name` is truncated instead of
+    // overflowing the stack buffer as the previous sprintf could.
+    snprintf(buff, sizeof(buff), "%s.%s", name, ext);
+
+    unsigned char* data = (unsigned char*)xcalloc(im.w * im.h * im.c, sizeof(unsigned char));
+    int i, k;
+    for (k = 0; k < im.c; ++k) {
+        for (i = 0; i < im.w*im.h; ++i) {
+            // planar (channel-major) -> interleaved (pixel-major)
+            data[i*im.c + k] = (unsigned char)(255 * im.data[i + k*im.w*im.h]);
+        }
+    }
+
+    int success = 0;
+    if (f == BMP)      success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
+    else if (f == TGA) success = stbi_write_tga(buff, im.w, im.h, im.c, data);
+    else if (f == JPG) success = stbi_write_jpg(buff, im.w, im.h, im.c, data, quality);
+    // PNG and the fallback share one writer. Previously an unknown format
+    // got a ".png" file name but no writer was called, so it always
+    // reported a failure without writing anything.
+    else               success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    free(data);
+    if (!success) fprintf(stderr, "Failed to write image %s\n", buff);
+}
+
+// Save `im` as a JPG with quality 80 via save_image_options().
+void save_image(image im, const char *name)
+{
+    save_image_options(im, name, JPG, 80);
+}
+
+// Save `p` as a JPG with quality 80 via save_image_options(); this performs
+// the same call as save_image().
+void save_image_jpg(image p, const char *name)
+{
+    save_image_options(p, name, JPG, 80);
+}
+
+// Show every channel of `p` as its own image, titled "<name> - Layer <i>".
+void show_image_layers(image p, char *name)
+{
+    int i;
+    char buff[256];
+    for(i = 0; i < p.c; ++i){
+        // snprintf bounds the write: an over-long `name` is truncated instead
+        // of overflowing the stack buffer as the previous sprintf could.
+        snprintf(buff, sizeof(buff), "%s - Layer %d", name, i);
+        image layer = get_image_layer(p, i);
+        show_image(layer, buff);
+        free_image(layer);
+    }
+}
+
+// Show all channels of `p` stacked vertically in a single image, with a
+// one-row border between channels.
+void show_image_collapsed(image p, char *name)
+{
+    image stacked = collapse_image_layers(p, 1);
+    show_image(stacked, name);
+    free_image(stacked);
+}
+
+// Build an image header with the given dimensions and a null data pointer.
+// No pixel storage is allocated.
+image make_empty_image(int w, int h, int c)
+{
+    image header;
+    header.w = w;
+    header.h = h;
+    header.c = c;
+    header.data = 0;
+    return header;
+}
+
+// Allocate a w x h x c image; the data buffer of w*h*c floats comes from
+// xcalloc().
+image make_image(int w, int h, int c)
+{
+    image created = make_empty_image(w,h,c);
+    float *pixels = (float*)xcalloc(h * w * c, sizeof(float));
+    created.data = pixels;
+    return created;
+}
+
+// Allocate a w x h x c image and fill every value with
+// rand_normal() * .25 + .5.
+image make_random_image(int w, int h, int c)
+{
+    int total = w*h*c;
+    int idx;
+    image noise = make_empty_image(w,h,c);
+    noise.data = (float*)xcalloc(h * w * c, sizeof(float));
+    for(idx = 0; idx < total; ++idx){
+        noise.data[idx] = (rand_normal() * .25) + .5;
+    }
+    return noise;
+}
+
+// Copy `data` (w*h*c floats) into a newly allocated image, dividing every
+// value by the largest absolute value found in `data`, so the result lies in
+// [-1, 1]. If that maximum is not greater than zero the returned image is
+// left as make_image() produced it.
+image float_to_image_scaled(int w, int h, int c, float *data)
+{
+    image out = make_image(w, h, c);
+    // Must be a float: the previous `int abs_max` truncated the maximum, so
+    // any input with |x| < 1 everywhere ended up dividing by zero.
+    float abs_max = 0;
+    int i = 0;
+    for (i = 0; i < w*h*c; ++i) {
+        float mag = fabs(data[i]);
+        if (mag > abs_max) abs_max = mag;
+    }
+    // Skip the division when there is nothing to scale by.
+    if (abs_max > 0) {
+        for (i = 0; i < w*h*c; ++i) {
+            out.data[i] = data[i] / abs_max;
+        }
+    }
+    return out;
+}
+
+// Wrap an existing float buffer in an image header without copying it.
+// The returned image's data pointer is `data` itself.
+image float_to_image(int w, int h, int c, float *data)
+{
+    image view = make_empty_image(w,h,c);
+    view.data = data;
+    return view;
+}
+
+
+// Produce a new w x h image by sampling `im` through a combined rotation
+// (`rad`), scale (`s`), aspect stretch (`aspect`, applied to the x term) and
+// offset (`dx`, `dy`), all expressed around the centre of `im`.
+// Every output pixel is fetched with bilinear_interpolate().
+image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect)
+{
+    int col, row, ch;
+    float cx = im.w/2.;
+    float cy = im.h/2.;
+    image rot = make_image(w, h, im.c);
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < h; ++row){
+            for(col = 0; col < w; ++col){
+                // Output coordinates relative to the output centre, mapped
+                // back into source units.
+                double u = (col - w/2.)/s*aspect + dx/s*aspect;
+                double v = (row - h/2.)/s + dy/s;
+                float rx = cos(rad)*u - sin(rad)*v + cx;
+                float ry = sin(rad)*u + cos(rad)*v + cy;
+                set_pixel(rot, col, row, ch, bilinear_interpolate(im, rx, ry, ch));
+            }
+        }
+    }
+    return rot;
+}
+
+// Return a new image of the same size as `im`, in which every output pixel is
+// sampled (bilinearly) from `im` at the output position rotated by `rad`
+// radians about the image centre.
+image rotate_image(image im, float rad)
+{
+    int col, row, ch;
+    float cx = im.w/2.;
+    float cy = im.h/2.;
+    image rot = make_image(im.w, im.h, im.c);
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < im.h; ++row){
+            for(col = 0; col < im.w; ++col){
+                float src_x = cos(rad)*(col-cx) - sin(rad)*(row-cy) + cx;
+                float src_y = sin(rad)*(col-cx) + cos(rad)*(row-cy) + cy;
+                set_pixel(rot, col, row, ch, bilinear_interpolate(im, src_x, src_y, ch));
+            }
+        }
+    }
+    return rot;
+}
+
+// Add the constant `s` to every value of the image, in place.
+void translate_image(image m, float s)
+{
+    int total = m.h*m.w*m.c;
+    int idx;
+    for(idx = 0; idx < total; ++idx){
+        m.data[idx] += s;
+    }
+}
+
+// Multiply every value of the image by the constant `s`, in place.
+void scale_image(image m, float s)
+{
+    int total = m.h*m.w*m.c;
+    int idx;
+    for(idx = 0; idx < total; ++idx){
+        m.data[idx] *= s;
+    }
+}
+
+// Return a new w x h image taken from `im` starting at offset (dx, dy).
+// Source coordinates are passed through constrain_int() with the bounds
+// [0, im.h-1] / [0, im.w-1] before the pixel is read (presumably a clamp, so
+// positions outside `im` reuse edge pixels - confirm against constrain_int).
+// If the constrained coordinate is still outside `im`, 0 is stored.
+image crop_image(image im, int dx, int dy, int w, int h)
+{
+    image cropped = make_image(w, h, im.c);
+    int col, row, ch;
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < h; ++row){
+            int src_row = constrain_int(row + dy, 0, im.h-1);
+            for(col = 0; col < w; ++col){
+                int src_col = constrain_int(col + dx, 0, im.w-1);
+                float val = 0;
+                if (src_row >= 0 && src_row < im.h && src_col >= 0 && src_col < im.w) {
+                    val = get_pixel(im, src_col, src_row, ch);
+                }
+                set_pixel(cropped, col, row, ch, val);
+            }
+        }
+    }
+    return cropped;
+}
+
+// Recursive bisection over the vertical shift range [min, max]: at each step
+// `b` is cropped at shift mid and mid+1, both crops are compared to `a` with
+// dist_array(), and the search continues in the half with the smaller
+// distance. Returns the shift reached when min == max.
+int best_3d_shift_r(image a, image b, int min, int max)
+{
+    if(min == max) return min;
+
+    int count = a.w*a.h*a.c;
+    int mid = floor((min + max) / 2.);
+    image at_mid = crop_image(b, 0, mid, b.w, b.h);
+    image past_mid = crop_image(b, 0, mid+1, b.w, b.h);
+    float d_mid = dist_array(at_mid.data, a.data, count, 10);
+    float d_past = dist_array(past_mid.data, a.data, count, 10);
+    free_image(at_mid);
+    free_image(past_mid);
+
+    if(d_mid < d_past){
+        max = mid;
+    } else {
+        min = mid+1;
+    }
+    return best_3d_shift_r(a, b, min, max);
+}
+
+// Exhaustive search over vertical shifts min, min+2, ... <= max: `b` is
+// cropped at each shift and compared to `a` with dist_array(); the shift with
+// the smallest distance is returned (0 if no shift is tried).
+// Prints "<shift> <distance>" for every candidate.
+int best_3d_shift(image a, image b, int min, int max)
+{
+    int shift;
+    int winner = 0;
+    float lowest = FLT_MAX;
+    for(shift = min; shift <= max; shift += 2){
+        image shifted = crop_image(b, 0, shift, b.w, b.h);
+        float d = dist_array(shifted.data, a.data, a.w*a.h*a.c, 100);
+        if(d < lowest){
+            lowest = d;
+            winner = shift;
+        }
+        printf("%d %f\n", shift, d);
+        free_image(shifted);
+    }
+    return winner;
+}
+
+// Combine two images into one and save it.
+//   f1, f2 - paths of the two input images
+//   out    - output file name stem; "out" is used when it is NULL
+//   delta  - horizontal offset used when cropping the second image
+// The vertical shift between the images is found with best_3d_shift_r() over
+// +/- a.h/100. The second image is cropped at (delta, shift) to the size of
+// the first, its first channel is overwritten with the first channel of the
+// first image, and the result is saved.
+void composite_3d(char *f1, char *f2, char *out, int delta)
+{
+    if(!out) out = "out";
+    image a = load_image(f1, 0,0,0);
+    image b = load_image(f2, 0,0,0);
+    int shift = best_3d_shift_r(a, b, -a.h/100, a.h/100);
+
+    image c1 = crop_image(b, 10, shift, b.w, b.h);
+    float d1 = dist_array(c1.data, a.data, a.w*a.h*a.c, 100);
+    image c2 = crop_image(b, -10, shift, b.w, b.h);
+    float d2 = dist_array(c2.data, a.data, a.w*a.h*a.c, 100);
+    // The probe crops are only needed for the two distances above.
+    free_image(c1);
+    free_image(c2);
+
+    // The "&& 0" keeps the swap permanently disabled, as in the original.
+    if(d2 < d1 && 0){
+        image swap = a;
+        a = b;
+        b = swap;
+        shift = -shift;
+        printf("swapped, %d\n", shift);
+    }
+    else{
+        printf("%d\n", shift);
+    }
+
+    image c = crop_image(b, delta, shift, a.w, a.h);
+    int i;
+    // Replace the first channel of the crop with the first channel of `a`.
+    for(i = 0; i < c.w*c.h; ++i){
+        c.data[i] = a.data[i];
+    }
+#ifdef OPENCV
+    save_image_jpg(c, out);
+#else
+    save_image(c, out);
+#endif
+    // Previously none of these images were released.
+    free_image(c);
+    free_image(a);
+    free_image(b);
+}
+
+// Set every value of the image to the constant `s`.
+void fill_image(image m, float s)
+{
+    int total = m.h*m.w*m.c;
+    int idx;
+    for (idx = 0; idx < total; ++idx) {
+        m.data[idx] = s;
+    }
+}
+
+// Resize `im` so that it fits inside a w x h area while keeping its aspect
+// ratio, then paste it centred into the caller-supplied `boxed` image.
+// Pixels of `boxed` not covered by the resized image are not written.
+void letterbox_image_into(image im, int w, int h, image boxed)
+{
+    int fit_w;
+    int fit_h;
+    // The axis with the smaller scale factor is the limiting one.
+    if (((float)w / im.w) < ((float)h / im.h)) {
+        fit_w = w;
+        fit_h = (im.h * w) / im.w;
+    }
+    else {
+        fit_h = h;
+        fit_w = (im.w * h) / im.h;
+    }
+    image scaled = resize_image(im, fit_w, fit_h);
+    embed_image(scaled, boxed, (w - fit_w) / 2, (h - fit_h) / 2);
+    free_image(scaled);
+}
+
+// Return a new w x h image filled with .5 that contains `im` resized to fit
+// (aspect ratio preserved) and centred.
+image letterbox_image(image im, int w, int h)
+{
+    int fit_w;
+    int fit_h;
+    // The axis with the smaller scale factor is the limiting one.
+    if (((float)w / im.w) < ((float)h / im.h)) {
+        fit_w = w;
+        fit_h = (im.h * w) / im.w;
+    }
+    else {
+        fit_h = h;
+        fit_w = (im.w * h) / im.h;
+    }
+    image scaled = resize_image(im, fit_w, fit_h);
+    image canvas = make_image(w, h, im.c);
+    fill_image(canvas, .5);
+    embed_image(scaled, canvas, (w - fit_w) / 2, (h - fit_h) / 2);
+    free_image(scaled);
+    return canvas;
+}
+
+// Resize `im` so that its larger side becomes `max`, keeping the aspect ratio
+// (integer arithmetic).
+// NOTE: when the computed size equals the current size, `im` itself is
+// returned (sharing its data buffer); otherwise a new image is returned.
+image resize_max(image im, int max)
+{
+    int new_w;
+    int new_h;
+    if(im.w > im.h){
+        new_h = (im.h * max) / im.w;
+        new_w = max;
+    } else {
+        new_w = (im.w * max) / im.h;
+        new_h = max;
+    }
+    if(new_w == im.w && new_h == im.h) return im;
+    return resize_image(im, new_w, new_h);
+}
+
+// Resize `im` so that its smaller side becomes `min`, keeping the aspect ratio
+// (integer arithmetic).
+// NOTE: when the computed size equals the current size, `im` itself is
+// returned (sharing its data buffer); otherwise a new image is returned.
+image resize_min(image im, int min)
+{
+    int new_w;
+    int new_h;
+    if(im.w < im.h){
+        new_h = (im.h * min) / im.w;
+        new_w = min;
+    } else {
+        new_w = (im.w * min) / im.h;
+        new_h = min;
+    }
+    if(new_w == im.w && new_h == im.h) return im;
+    return resize_image(im, new_w, new_h);
+}
+
+// Crop a w x h region from `im` at an offset drawn with
+// rand_int(0, im.w - w) horizontally and rand_int(0, im.h - h) vertically.
+image random_crop_image(image im, int w, int h)
+{
+    int off_x = rand_int(0, im.w - w);
+    int off_y = rand_int(0, im.h - h);
+    return crop_image(im, off_x, off_y, w, h);
+}
+
+// Produce a randomly rotated, scaled, stretched and shifted size x size crop
+// of `im` via rotate_crop_image().
+//   angle     - rotation is drawn from [-angle, angle], converted from
+//               degrees to radians
+//   aspect    - passed through rand_scale() to obtain the aspect factor
+//   low, high - bounds handed to rand_int(); the drawn value divided by the
+//               shorter (aspect-adjusted) side gives the scale
+//   size      - side length of the square output
+image random_augment_image(image im, float angle, float aspect, int low, int high, int size)
+{
+    aspect = rand_scale(aspect);
+    int target = rand_int(low, high);
+    int shorter = (im.h < im.w*aspect) ? im.h : im.w*aspect;
+    float scale = (float)target / shorter;
+
+    float rad = rand_uniform(-angle, angle) * 2.0 * M_PI / 360.;
+
+    // Largest shift for which the crop window is derived from the scaled
+    // image size; negative slack is treated as no shift.
+    float slack_x = (im.w*scale/aspect - size) / 2.;
+    float slack_y = (im.h*scale - size) / 2.;
+    if(slack_x < 0) slack_x = 0;
+    if(slack_y < 0) slack_y = 0;
+    float dx = rand_uniform(-slack_x, slack_x);
+    float dy = rand_uniform(-slack_y, slack_y);
+
+    return rotate_crop_image(im, rad, scale, size, size, dx, dy, aspect);
+}
+
+// Return the largest of a, b and c.
+float three_way_max(float a, float b, float c)
+{
+    float larger = (a > b) ? a : b;
+    if (larger > c) return larger;
+    return c;
+}
+
+// Return the smallest of a, b and c.
+float three_way_min(float a, float b, float c)
+{
+    float smaller = (a < b) ? a : b;
+    if (smaller < c) return smaller;
+    return c;
+}
+
+// http://www.cs.rit.edu/~ncs/color/t_convert.html
+// Convert a 3-channel image in place from RGB to HSV.
+// After the call channel 0 holds hue scaled to [0, 1), channel 1 saturation
+// and channel 2 value. Asserts that im.c == 3.
+void rgb_to_hsv(image im)
+{
+    assert(im.c == 3);
+    int i, j;
+    float r, g, b;
+    float h, s, v;
+    for(j = 0; j < im.h; ++j){
+        for(i = 0; i < im.w; ++i){
+            r = get_pixel(im, i , j, 0);
+            g = get_pixel(im, i , j, 1);
+            b = get_pixel(im, i , j, 2);
+            float max = three_way_max(r,g,b);
+            float min = three_way_min(r,g,b);
+            float delta = max - min;
+            v = max;
+            if(max == 0 || delta == 0){
+                // Black or grey pixel: hue is undefined, use 0. Without the
+                // delta test a grey pixel computed 0/0 and stored NaN as hue.
+                s = 0;
+                h = 0;
+            }else{
+                s = delta/max;
+                if(r == max){
+                    h = (g - b) / delta;
+                } else if (g == max) {
+                    h = 2 + (b - r) / delta;
+                } else {
+                    h = 4 + (r - g) / delta;
+                }
+                if (h < 0) h += 6;
+                h = h/6.;
+            }
+            set_pixel(im, i, j, 0, h);
+            set_pixel(im, i, j, 1, s);
+            set_pixel(im, i, j, 2, v);
+        }
+    }
+}
+
+// Convert a 3-channel image in place from HSV back to RGB.
+// Channel 0 is read as hue (multiplied by 6 to select one of six sectors),
+// channel 1 as saturation and channel 2 as value. Asserts that im.c == 3.
+void hsv_to_rgb(image im)
+{
+    assert(im.c == 3);
+    int col, row;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float r, g, b;
+            float h = 6 * get_pixel(im, col , row, 0);
+            float s = get_pixel(im, col , row, 1);
+            float v = get_pixel(im, col , row, 2);
+            if (s == 0) {
+                // No saturation: the pixel is grey and hue is ignored.
+                r = g = b = v;
+            } else {
+                int sector = floor(h);
+                float f = h - sector;
+                float p = v*(1-s);
+                float q = v*(1-s*f);
+                float t = v*(1-s*(1-f));
+                switch(sector){
+                    case 0:  r = v; g = t; b = p; break;
+                    case 1:  r = q; g = v; b = p; break;
+                    case 2:  r = p; g = v; b = t; break;
+                    case 3:  r = p; g = q; b = v; break;
+                    case 4:  r = t; g = p; b = v; break;
+                    // Every other sector value ends up here.
+                    default: r = v; g = p; b = q; break;
+                }
+            }
+            set_pixel(im, col, row, 0, r);
+            set_pixel(im, col, row, 1, g);
+            set_pixel(im, col, row, 2, b);
+        }
+    }
+}
+
+// Return a new one-channel image that is a weighted sum of the three channels
+// of `im`, with weights 0.587, 0.299 and 0.114 for channels 0, 1 and 2.
+// Asserts that im.c == 3.
+// NOTE(review): the usual luma weights are 0.299 (R), 0.587 (G), 0.114 (B);
+// here the first two are in the opposite order - confirm whether that is
+// intended for the channel layout used by callers.
+image grayscale_image(image im)
+{
+    assert(im.c == 3);
+    int col, row, ch;
+    image gray = make_image(im.w, im.h, 1);
+    float weight[] = {0.587, 0.299, 0.114};
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            int px = col+im.w*row;
+            // Channels are accumulated in order 0, 1, 2.
+            for(ch = 0; ch < im.c; ++ch){
+                gray.data[px] += weight[ch]*get_pixel(im, col, row, ch);
+            }
+        }
+    }
+    return gray;
+}
+
+// Return a new image of the same shape as `im` in which every value is 1 when
+// the corresponding input value is greater than `thresh`, and 0 otherwise.
+image threshold_image(image im, float thresh)
+{
+    int total = im.w*im.h*im.c;
+    int idx;
+    image mask = make_image(im.w, im.h, im.c);
+    for(idx = 0; idx < total; ++idx){
+        if(im.data[idx]>thresh){
+            mask.data[idx] = 1;
+        } else {
+            mask.data[idx] = 0;
+        }
+    }
+    return mask;
+}
+
+// Return a new image computed as alpha * fore + (1 - alpha) * back.
+// Asserts that both inputs have identical width, height and channel count.
+image blend_image(image fore, image back, float alpha)
+{
+    assert(fore.w == back.w && fore.h == back.h && fore.c == back.c);
+    image mixed = make_image(fore.w, fore.h, fore.c);
+    int col, row, ch;
+    for(ch = 0; ch < fore.c; ++ch){
+        for(row = 0; row < fore.h; ++row){
+            for(col = 0; col < fore.w; ++col){
+                float top = get_pixel(fore, col, row, ch);
+                float bottom = get_pixel(back, col, row, ch);
+                float val = alpha * top +
+                    (1 - alpha)* bottom;
+                set_pixel(mixed, col, row, ch, val);
+            }
+        }
+    }
+    return mixed;
+}
+
+// Multiply every pixel of channel `c` by `v`, in place.
+void scale_image_channel(image im, int c, float v)
+{
+    int col, row;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float scaled = get_pixel(im, col, row, c)*v;
+            set_pixel(im, col, row, c, scaled);
+        }
+    }
+}
+
+// Add `v` to every pixel of channel `c`, in place.
+void translate_image_channel(image im, int c, float v)
+{
+    int col, row;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float shifted = get_pixel(im, col, row, c)+v;
+            set_pixel(im, col, row, c, shifted);
+        }
+    }
+}
+
+// Return a copy of `im` in which every value greater than .5 becomes 1 and
+// every other value becomes 0. The input is not modified.
+image binarize_image(image im)
+{
+    image bin = copy_image(im);
+    int total = im.w * im.h * im.c;
+    int idx;
+    for(idx = 0; idx < total; ++idx){
+        bin.data[idx] = (bin.data[idx] > .5) ? 1 : 0;
+    }
+    return bin;
+}
+
+// Scale the saturation of an RGB image in place by `sat`: convert to HSV,
+// multiply channel 1, convert back, then clamp to [0, 1].
+void saturate_image(image im, float sat)
+{
+    rgb_to_hsv(im);
+    scale_image_channel(im, 1, sat);
+    hsv_to_rgb(im);
+    constrain_image(im);
+}
+
+// Shift the hue of an RGB image in place by `hue`: convert to HSV, add `hue`
+// to channel 0 with a single wrap-around step (subtract 1 above 1, add 1
+// below 0), convert back, then clamp to [0, 1].
+void hue_image(image im, float hue)
+{
+    rgb_to_hsv(im);
+    int plane = im.w*im.h;
+    int px;
+    for(px = 0; px < plane; ++px){
+        float h = im.data[px] + hue;
+        if (h > 1) h -= 1;
+        if (h < 0) h += 1;
+        im.data[px] = h;
+    }
+    hsv_to_rgb(im);
+    constrain_image(im);
+}
+
+// Scale the brightness of an RGB image in place: convert to HSV, multiply
+// channel 2 (value) by `sat`, convert back, then clamp to [0, 1].
+// Note that the parameter is named `sat` although it scales the value channel.
+void exposure_image(image im, float sat)
+{
+    rgb_to_hsv(im);
+    scale_image_channel(im, 2, sat);
+    hsv_to_rgb(im);
+    constrain_image(im);
+}
+
+// Apply a colour distortion in place.
+// For images with at least 3 channels: convert to HSV, multiply saturation by
+// `sat` and value by `val`, add `hue` to the hue channel with a single
+// wrap-around step, and convert back to RGB.
+// For images with fewer channels only channel 0 is multiplied by `val`.
+// In both cases the result is clamped to [0, 1].
+void distort_image(image im, float hue, float sat, float val)
+{
+    if (im.c < 3)
+    {
+        scale_image_channel(im, 0, val);
+    }
+    else
+    {
+        rgb_to_hsv(im);
+        scale_image_channel(im, 1, sat);
+        scale_image_channel(im, 2, val);
+        int plane = im.w*im.h;
+        int px;
+        for(px = 0; px < plane; ++px){
+            float h = im.data[px] + hue;
+            if (h > 1) h -= 1;
+            if (h < 0) h += 1;
+            im.data[px] = h;
+        }
+        hsv_to_rgb(im);
+    }
+    constrain_image(im);
+}
+
+// Apply distort_image() with randomly drawn parameters: the hue shift comes
+// from rand_uniform_strong(-hue, hue), the saturation and exposure factors
+// from rand_scale() of the respective argument.
+void random_distort_image(image im, float hue, float saturation, float exposure)
+{
+    float dhue = rand_uniform_strong(-hue, hue);
+    float dsat = rand_scale(saturation);
+    float dexp = rand_scale(exposure);
+    distort_image(im, dhue, dsat, dexp);
+}
+
+// Scale saturation by `sat` and value by `exposure` in a single HSV round
+// trip, in place, then clamp the result to [0, 1].
+void saturate_exposure_image(image im, float sat, float exposure)
+{
+    rgb_to_hsv(im);
+    scale_image_channel(im, 1, sat);
+    scale_image_channel(im, 2, exposure);
+    hsv_to_rgb(im);
+    constrain_image(im);
+}
+
+// Sample channel `c` of `im` at the fractional position (x, y) by bilinear
+// interpolation of the four surrounding pixels, which are read with
+// get_pixel_extend().
+float bilinear_interpolate(image im, float x, float y, int c)
+{
+    int left = (int) floorf(x);
+    int top = (int) floorf(y);
+
+    // Fractional distance from the top-left neighbour.
+    float fx = x - left;
+    float fy = y - top;
+
+    float val = (1-fy) * (1-fx) * get_pixel_extend(im, left, top, c) +
+        fy     * (1-fx) * get_pixel_extend(im, left, top+1, c) +
+        (1-fy) *   fx   * get_pixel_extend(im, left+1, top, c) +
+        fy     *   fx   * get_pixel_extend(im, left+1, top+1, c);
+    return val;
+}
+
+// Quantize every value in place to a multiple of 1/255 (truncating
+// value * 255 to an int) and then add half a step (0.5/255).
+void quantize_image(image im)
+{
+    int total = im.c * im.w * im.h;
+    int idx;
+    for (idx = 0; idx < total; ++idx) {
+        im.data[idx] = (int)(im.data[idx] * 255) / 255. + (0.5/255);
+    }
+}
+
+// For every pixel, sum the values of all channels, zero every channel, and
+// store the sum in channel 0. Works in place.
+void make_image_red(image im)
+{
+    int row, col, ch;
+    for (row = 0; row < im.h; ++row) {
+        for (col = 0; col < im.w; ++col) {
+            float sum = 0;
+            for (ch = 0; ch < im.c; ++ch) {
+                sum += get_pixel(im, col, row, ch);
+                set_pixel(im, col, row, ch, 0);
+            }
+            set_pixel(im, col, row, 0, sum);
+        }
+    }
+}
+
+// Build a visualisation image from a delta buffer blended with the input.
+//   img_size           - number of floats processed in the loops below
+//                        (assumed to equal w*h*c - TODO confirm with callers)
+//   original_delta_cpu - delta values; MODIFIED IN PLACE by this function
+//   original_input_cpu - input values used for the final blend (read only)
+//   w, h, c            - dimensions used to view the delta buffer as an image
+//   alpha              - blend weight of the processed delta
+// Returns an image allocated by resize_image(); its data is a new buffer, not
+// original_delta_cpu.
+image make_attention_image(int img_size, float *original_delta_cpu, float *original_input_cpu, int w, int h, int c, float alpha)
+{
+    // View the delta buffer as an image without copying it, then collapse all
+    // channels into channel 0 (this writes into original_delta_cpu).
+    image attention_img;
+    attention_img.w = w;
+    attention_img.h = h;
+    attention_img.c = c;
+    attention_img.data = original_delta_cpu;
+    make_image_red(attention_img);
+
+    // Gather min, max and mean of the collapsed deltas.
+    int k;
+    float min_val = 999999, mean_val = 0, max_val = -999999;
+    for (k = 0; k < img_size; ++k) {
+        if (original_delta_cpu[k] < min_val) min_val = original_delta_cpu[k];
+        if (original_delta_cpu[k] > max_val) max_val = original_delta_cpu[k];
+        mean_val += original_delta_cpu[k];
+    }
+    mean_val = mean_val / img_size;
+    float range = max_val - min_val;
+
+    // Replace each delta by its distance from the mean, relative to the range
+    // and multiplied by 4.
+    // NOTE(review): range is 0 when all deltas are equal, which makes this a
+    // division by zero.
+    for (k = 0; k < img_size; ++k) {
+        float val = original_delta_cpu[k];
+        val = fabs(mean_val - val) / range;
+        original_delta_cpu[k] = val * 4;
+    }
+
+    // Shrink to a quarter of the size and enlarge again; attention_img now
+    // refers to the newly allocated result instead of original_delta_cpu.
+    image resized = resize_image(attention_img, w / 4, h / 4);
+    attention_img = resize_image(resized, w, h);
+    free_image(resized);
+    // Blend the processed delta with the original input.
+    for (k = 0; k < img_size; ++k) attention_img.data[k] = attention_img.data[k]*alpha + (1-alpha)*original_input_cpu[k];
+
+    //normalize_image(attention_img);
+    //show_image(attention_img, "delta");
+    return attention_img;
+}
+
+// Return a new w x h copy of `im`, resized with separable linear
+// interpolation: first every row is resampled to the new width (into `part`),
+// then every column of that intermediate is resampled to the new height.
+// When the requested size equals the current size a plain copy is returned.
+// The result is always a newly allocated image.
+image resize_image(image im, int w, int h)
+{
+    if (im.w == w && im.h == h) return copy_image(im);
+
+    image resized = make_image(w, h, im.c);
+    image part = make_image(w, im.h, im.c);
+    int r, c, k;
+    // Source step per destination pixel. A one-pixel target has no step; the
+    // guard avoids dividing by (1 - 1). Without it h == 1 produced an
+    // infinite scale, 0 * inf = NaN below, and an invalid row index.
+    float w_scale = (w > 1) ? (float)(im.w - 1) / (w - 1) : 0;
+    float h_scale = (h > 1) ? (float)(im.h - 1) / (h - 1) : 0;
+
+    // Pass 1: horizontal resample, im (im.w x im.h) -> part (w x im.h).
+    for(k = 0; k < im.c; ++k){
+        for(r = 0; r < im.h; ++r){
+            for(c = 0; c < w; ++c){
+                float val = 0;
+                if(c == w-1 || im.w == 1){
+                    // Last column (or single-column source): take the last
+                    // source pixel directly, there is no right neighbour.
+                    val = get_pixel(im, im.w-1, r, k);
+                } else {
+                    float sx = c*w_scale;
+                    int ix = (int) sx;
+                    float dx = sx - ix;
+                    val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k);
+                }
+                set_pixel(part, c, r, k, val);
+            }
+        }
+    }
+
+    // Pass 2: vertical resample, part (w x im.h) -> resized (w x h).
+    for(k = 0; k < im.c; ++k){
+        for(r = 0; r < h; ++r){
+            float sy = r*h_scale;
+            int iy = (int) sy;
+            float dy = sy - iy;
+            // Contribution of the upper source row.
+            for(c = 0; c < w; ++c){
+                float val = (1-dy) * get_pixel(part, c, iy, k);
+                set_pixel(resized, c, r, k, val);
+            }
+            // Last row (or single-row source): no lower neighbour to add.
+            if(r == h-1 || im.h == 1) continue;
+            // Contribution of the lower source row.
+            for(c = 0; c < w; ++c){
+                float val = dy * get_pixel(part, c, iy+1, k);
+                add_pixel(resized, c, r, k, val);
+            }
+        }
+    }
+
+    free_image(part);
+    return resized;
+}
+
+
+// Manual visual test: loads `filename` as a 3-channel image, prints its L2
+// norm, and shows the original, a grayscale version and four fixed colour
+// distortions. With OPENCV defined it then loops forever, showing a random
+// augmentation and a random distortion on every key press.
+// NOTE(review): im, gray and c1..c4 are never freed.
+void test_resize(char *filename)
+{
+    image im = load_image(filename, 0,0, 3);
+    float mag = mag_array(im.data, im.w*im.h*im.c);
+    printf("L2 Norm: %f\n", mag);
+    image gray = grayscale_image(im);
+
+    // Four copies, each distorted with a fixed (hue, saturation, value) set.
+    image c1 = copy_image(im);
+    image c2 = copy_image(im);
+    image c3 = copy_image(im);
+    image c4 = copy_image(im);
+    distort_image(c1, .1, 1.5, 1.5);
+    distort_image(c2, -.1, .66666, .66666);
+    distort_image(c3, .1, 1.5, .66666);
+    distort_image(c4, .1, .66666, 1.5);
+
+
+    show_image(im,   "Original");
+    show_image(gray, "Gray");
+    show_image(c1, "C1");
+    show_image(c2, "C2");
+    show_image(c3, "C3");
+    show_image(c4, "C4");
+
+#ifdef OPENCV
+    // Interactive part: this loop has no exit condition.
+    while(1){
+        image aug = random_augment_image(im, 0, .75, 320, 448, 320);
+        show_image(aug, "aug");
+        free_image(aug);
+
+
+        float exposure = 1.15;
+        float saturation = 1.15;
+        float hue = .05;
+
+        image c = copy_image(im);
+
+        float dexp = rand_scale(exposure);
+        float dsat = rand_scale(saturation);
+        float dhue = rand_uniform(-hue, hue);
+
+        distort_image(c, dhue, dsat, dexp);
+        show_image(c, "rand");
+        printf("%f %f %f\n", dhue, dsat, dexp);
+        free_image(c);
+        wait_until_press_key_cv();
+    }
+#endif
+}
+
+
+// Load an image file with stb_image and convert it to a planar float image
+// with values divided by 255.
+//   filename - path of the file to load
+//   channels - number of channels to request from stbi_load(); 0 keeps the
+//              channel count reported by the file
+// On failure the reason is printed to stderr, the file name is appended to
+// "bad.list" (best effort), and a 10x10x3 image from make_image() is returned
+// so that callers still receive a valid image.
+image load_image_stb(char *filename, int channels)
+{
+    int w, h, c;
+    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
+    if (!data) {
+        char shrinked_filename[1024];
+        if (strlen(filename) >= 1024) sprintf(shrinked_filename, "name is too long");
+        else sprintf(shrinked_filename, "%s", filename);
+        fprintf(stderr, "Cannot load image \"%s\"\nSTB Reason: %s\n", shrinked_filename, stbi_failure_reason());
+        FILE* fw = fopen("bad.list", "a");
+        // fopen() can fail (e.g. read-only working directory); previously the
+        // NULL handle was passed straight to fwrite()/fclose().
+        if (fw) {
+            fwrite(shrinked_filename, sizeof(char), strlen(shrinked_filename), fw);
+            char *new_line = "\n";
+            fwrite(new_line, sizeof(char), strlen(new_line), fw);
+            fclose(fw);
+        }
+        return make_image(10, 10, 3);
+    }
+    // When a channel count was requested, the decoded buffer has that many
+    // channels regardless of what the file reported in `c`.
+    if(channels) c = channels;
+    int i,j,k;
+    image im = make_image(w, h, c);
+    for(k = 0; k < c; ++k){
+        for(j = 0; j < h; ++j){
+            for(i = 0; i < w; ++i){
+                // interleaved bytes (pixel-major) -> planar floats (channel-major)
+                int dst_index = i + w*j + w*h*k;
+                int src_index = k + c*i + c*w*j;
+                im.data[dst_index] = (float)data[src_index]/255.;
+            }
+        }
+    }
+    free(data);
+    return im;
+}
+
+// Load an image with load_image_stb() and, when both `w` and `h` are non-zero
+// and differ from the loaded size, replace it with a resized copy.
+image load_image_stb_resize(char *filename, int w, int h, int c)
+{
+    image loaded = load_image_stb(filename, c);
+
+    int size_requested = (h && w);
+    int size_differs = (h != loaded.h || w != loaded.w);
+    if (size_requested && size_differs) {
+        image scaled = resize_image(loaded, w, h);
+        free_image(loaded);
+        loaded = scaled;
+    }
+    return loaded;
+}
+
+// Load an image with `c` requested channels, using load_image_cv() when
+// OPENCV is defined and load_image_stb() otherwise. When both `w` and `h` are
+// non-zero and differ from the loaded size, the image is replaced with a
+// resized copy.
+image load_image(char *filename, int w, int h, int c)
+{
+#ifdef OPENCV
+    image loaded = load_image_cv(filename, c);
+#else
+    image loaded = load_image_stb(filename, c);
+#endif  // OPENCV
+
+    int size_requested = (h && w);
+    int size_differs = (h != loaded.h || w != loaded.w);
+    if(size_requested && size_differs){
+        image scaled = resize_image(loaded, w, h);
+        free_image(loaded);
+        loaded = scaled;
+    }
+    return loaded;
+}
+
+// Load an image with 3 requested channels; `w` and `h` are forwarded to
+// load_image() unchanged.
+image load_image_color(char *filename, int w, int h)
+{
+    return load_image(filename, w, h, 3);
+}
+
+// Return a new one-channel image containing a copy of channel `l` of `m`.
+image get_image_layer(image m, int l)
+{
+    image layer = make_image(m.w, m.h, 1);
+    int plane = m.h*m.w;
+    int px;
+    for(px = 0; px < plane; ++px){
+        layer.data[px] = m.data[px+l*m.h*m.w];
+    }
+    return layer;
+}
+
+// Print the image values to stdout with two decimals, channel by channel.
+// At most 32 rows per channel and 32 values per row are printed.
+void print_image(image m)
+{
+    const int limit = 32;
+    int ch, row, col;
+    for(ch = 0; ch < m.c; ++ch){
+        for(row = 0; row < m.h && row < limit; ++row){
+            for(col = 0; col < m.w && col < limit; ++col){
+                printf("%.2lf, ", m.data[ch*m.h*m.w + row*m.w + col]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+// Tile `n` images into one new image, one image per row block with a one-pixel
+// border between blocks. Sizes are taken from ims[0].
+// 3-channel images are pasted as they are (the result has 3 channels); for any
+// other channel count the channels of each image are laid out side by side and
+// the result has a single channel.
+image collapse_images_vert(image *ims, int n)
+{
+    const int color = 1;
+    const int border = 1;
+    int cell_w = ims[0].w;
+    int cell_h = ims[0].h;
+    int w = cell_w;
+    int h = (cell_h + border) * n - border;
+    int c = ims[0].c;
+    int keep_color = (c == 3 && color);
+    if(!keep_color){
+        // One column block per channel.
+        w = (w+border)*c - border;
+        c = 1;
+    }
+
+    image filters = make_image(w, h, c);
+    int i,j;
+    for(i = 0; i < n; ++i){
+        int h_offset = i*(cell_h+border);
+        image copy = copy_image(ims[i]);
+        if(keep_color){
+            embed_image(copy, filters, 0, h_offset);
+        }
+        else{
+            for(j = 0; j < copy.c; ++j){
+                image layer = get_image_layer(copy, j);
+                embed_image(layer, filters, j*(cell_w+border), h_offset);
+                free_image(layer);
+            }
+        }
+        free_image(copy);
+    }
+    return filters;
+}
+
+// Tile `n` images into one new image, one image per column block with a
+// one-pixel border between blocks. Sizes are taken from ims[0].
+// 3-channel images are pasted as they are (the result has 3 channels); for any
+// other channel count the channels of each image are stacked vertically and
+// the result has a single channel.
+image collapse_images_horz(image *ims, int n)
+{
+    int color = 1;
+    int border = 1;
+    int h,w,c;
+    int cell_w = ims[0].w;
+    int cell_h = ims[0].h;
+    h = cell_h;
+    w = (cell_w + border) * n - border;
+    c = ims[0].c;
+    if(c != 3 || !color){
+        h = (h+border)*c - border;
+        c = 1;
+    }
+
+    image filters = make_image(w, h, c);
+    int i,j;
+    for(i = 0; i < n; ++i){
+        // Column blocks are cell_w wide, matching how `w` was sized above.
+        // The previous code stepped by the image height here, which is only
+        // correct for square images.
+        int w_offset = i*(cell_w+border);
+        image copy = copy_image(ims[i]);
+        //normalize_image(copy);
+        if(c == 3 && color){
+            embed_image(copy, filters, w_offset, 0);
+        }
+        else{
+            for(j = 0; j < copy.c; ++j){
+                int h_offset = j*(cell_h+border);
+                image layer = get_image_layer(copy, j);
+                embed_image(layer, filters, w_offset, h_offset);
+                free_image(layer);
+            }
+        }
+        free_image(copy);
+    }
+    return filters;
+}
+
+// Show a normalized copy of `im` (see normalize_image()); `im` itself is left
+// unmodified.
+void show_image_normalized(image im, const char *name)
+{
+    image scaled = copy_image(im);
+    normalize_image(scaled);
+    show_image(scaled, name);
+    free_image(scaled);
+}
+
+// Tile `n` images vertically, normalize the tiled image, then both save it
+// (using `window` as the file name stem) and show it (using `window` as the
+// title).
+void show_images(image *ims, int n, char *window)
+{
+    image tiled = collapse_images_vert(ims, n);
+    normalize_image(tiled);
+    save_image(tiled, window);
+    show_image(tiled, window);
+    free_image(tiled);
+}
+
+// Release the data buffer of `m`, if it has one. The image is passed by
+// value, so the caller's copy keeps its (now dangling) data pointer.
+void free_image(image m)
+{
+    if(!m.data) return;
+    free(m.data);
+}
+
+// Fast copy data from a contiguous byte array into the image.
+// `pdata` is read as interleaved unsigned bytes (all channels of a pixel next
+// to each other) and written to the planar float buffer of `im`, each byte
+// divided by 255. The buffer must hold im.w * im.h * im.c bytes.
+LIB_API void copy_image_from_bytes(image im, char *pdata)
+{
+    unsigned char *bytes = (unsigned char*)pdata;
+    int ch, row, col;
+    for (ch = 0; ch < im.c; ++ch) {
+        for (row = 0; row < im.h; ++row) {
+            for (col = 0; col < im.w; ++col) {
+                int planar = col + im.w * row + im.w * im.h*ch;
+                int packed = ch + im.c * col + im.c * im.w*row;
+                im.data[planar] = (float)bytes[packed] / 255.;
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/image.h b/darknet-master/src/image.h
new file mode 100644
index 0000000..65ccb7c
--- /dev/null
+++ b/darknet-master/src/image.h
@@ -0,0 +1,107 @@
+#ifndef IMAGE_H
+#define IMAGE_H
+#include "darknet.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+#include <string.h>
+#include <math.h>
+
+#include "image_opencv.h"
+
+#include "box.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+typedef struct {
+    int w;
+    int h;
+    int c;
+    float *data;
+} image;
+*/
+float get_color(int c, int x, int max);
+void flip_image(image a);
+void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b);
+void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
+void draw_bbox(image a, box bbox, int w, float r, float g, float b);
+void draw_label(image a, int r, int c, image label, const float *rgb);
+void draw_weighted_label(image a, int r, int c, image label, const float *rgb, const float alpha);
+void write_label(image a, int r, int c, image *characters, char *string, float *rgb);
+void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **labels, int classes);
+void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output);
+image image_distance(image a, image b);
+void scale_image(image m, float s);
+// image crop_image(image im, int dx, int dy, int w, int h);
+image random_crop_image(image im, int w, int h);
+image random_augment_image(image im, float angle, float aspect, int low, int high, int size);
+void random_distort_image(image im, float hue, float saturation, float exposure);
+//LIB_API image resize_image(image im, int w, int h);
+//LIB_API void copy_image_from_bytes(image im, char *pdata);
+void fill_image(image m, float s);
+void letterbox_image_into(image im, int w, int h, image boxed);
+//LIB_API image letterbox_image(image im, int w, int h);
+// image resize_min(image im, int min);
+image resize_max(image im, int max);
+void translate_image(image m, float s);
+void normalize_image(image p);
+image rotate_image(image m, float rad);
+void rotate_image_cw(image im, int times);
+void embed_image(image source, image dest, int dx, int dy);
+void saturate_image(image im, float sat);
+void exposure_image(image im, float sat);
+void distort_image(image im, float hue, float sat, float val);
+void saturate_exposure_image(image im, float sat, float exposure);
+void hsv_to_rgb(image im);
+//LIB_API void rgbgr_image(image im);
+void constrain_image(image im);
+void composite_3d(char *f1, char *f2, char *out, int delta);
+int best_3d_shift_r(image a, image b, int min, int max);
+
+image grayscale_image(image im);
+image threshold_image(image im, float thresh);
+
+image collapse_image_layers(image source, int border);
+image collapse_images_horz(image *ims, int n);
+image collapse_images_vert(image *ims, int n);
+
+void show_image(image p, const char *name);
+void show_image_normalized(image im, const char *name);
+void save_image_png(image im, const char *name);
+void save_image(image p, const char *name);
+void show_images(image *ims, int n, char *window);
+void show_image_layers(image p, char *name);
+void show_image_collapsed(image p, char *name);
+
+void print_image(image m);
+
+//LIB_API image make_image(int w, int h, int c);
+image make_random_image(int w, int h, int c);
+image make_empty_image(int w, int h, int c);
+image float_to_image_scaled(int w, int h, int c, float *data);
+image float_to_image(int w, int h, int c, float *data);
+image copy_image(image p);
+void copy_image_inplace(image src, image dst);
+image load_image(char *filename, int w, int h, int c);
+image load_image_stb_resize(char *filename, int w, int h, int c);
+//LIB_API image load_image_color(char *filename, int w, int h);
+image **load_alphabet();
+void free_alphabet(image **alphabet);
+
+//float get_pixel(image m, int x, int y, int c);
+//float get_pixel_extend(image m, int x, int y, int c);
+//void set_pixel(image m, int x, int y, int c, float val);
+//void add_pixel(image m, int x, int y, int c, float val);
+float bilinear_interpolate(image im, float x, float y, int c);
+
+image get_image_layer(image m, int l);
+
+//LIB_API void free_image(image m);
+void test_resize(char *filename);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/image_opencv.cpp b/darknet-master/src/image_opencv.cpp
new file mode 100644
index 0000000..22e6ca5
--- /dev/null
+++ b/darknet-master/src/image_opencv.cpp
@@ -0,0 +1,1568 @@
+#include "image_opencv.h"
+#include <iostream>
+
+#ifdef OPENCV
+#include "utils.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <fstream>
+#include <algorithm>
+#include <atomic>
+
+#include <opencv2/core/version.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/opencv.hpp>
+#include <opencv2/opencv_modules.hpp>
+
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/video/video.hpp>
+
+// includes for OpenCV >= 3.x
+#ifndef CV_VERSION_EPOCH
+#include <opencv2/core/types.hpp>
+#include <opencv2/videoio/videoio.hpp>
+#include <opencv2/imgcodecs/imgcodecs.hpp>
+#endif
+
+// OpenCV includes for OpenCV 2.x
+#ifdef CV_VERSION_EPOCH
+#include <opencv2/highgui/highgui_c.h>
+#include <opencv2/imgproc/imgproc_c.h>
+#include <opencv2/core/types_c.h>
+#include <opencv2/core/version.hpp>
+#endif
+
+//using namespace cv;
+
+using std::cerr;
+using std::endl;
+
+#ifdef DEBUG
+#define OCV_D "d"
+#else
+#define OCV_D
+#endif//DEBUG
+
+
+// OpenCV libraries
+#ifndef CV_VERSION_EPOCH
+#define OPENCV_VERSION CVAUX_STR(CV_VERSION_MAJOR)"" CVAUX_STR(CV_VERSION_MINOR)"" CVAUX_STR(CV_VERSION_REVISION) OCV_D
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "opencv_world" OPENCV_VERSION ".lib")
+#endif    // USE_CMAKE_LIBS
+#else   // CV_VERSION_EPOCH
+#define OPENCV_VERSION CVAUX_STR(CV_VERSION_EPOCH)"" CVAUX_STR(CV_VERSION_MAJOR)"" CVAUX_STR(CV_VERSION_MINOR) OCV_D
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
+#endif    // USE_CMAKE_LIBS
+#endif    // CV_VERSION_EPOCH
+
+#include "http_stream.h"
+
+#ifndef CV_RGB
+#define CV_RGB(r, g, b) cvScalar( (b), (g), (r), 0 )
+#endif
+
+#ifndef CV_FILLED
+#define CV_FILLED cv::FILLED
+#endif
+
+#ifndef CV_AA
+#define CV_AA cv::LINE_AA
+#endif
+
+extern "C" {
+
+    //struct mat_cv : cv::Mat {  };
+    //struct cap_cv : cv::VideoCapture { };
+    //struct write_cv : cv::VideoWriter {  };
+
+    //struct mat_cv : cv::Mat { int a[0]; };
+    //struct cap_cv : cv::VideoCapture { int a[0]; };
+    //struct write_cv : cv::VideoWriter { int a[0]; };
+
+// ====================================================================
+// cv::Mat
+// ====================================================================
+    image mat_to_image(cv::Mat mat);
+    cv::Mat image_to_mat(image img);
+//    image ipl_to_image(mat_cv* src);
+//    mat_cv *image_to_ipl(image img);
+//    cv::Mat ipl_to_mat(IplImage *ipl);
+//    IplImage *mat_to_ipl(cv::Mat mat);
+
+
+extern "C" mat_cv *load_image_mat_cv(const char *filename, int flag)   // cv::imread(filename, flag); returns a heap-allocated cv::Mat (as mat_cv*) or NULL on failure.
+{
+    cv::Mat *mat_ptr = NULL;
+    try {
+        cv::Mat mat = cv::imread(filename, flag);
+        if (mat.empty())
+        {
+            std::string shrinked_filename = filename;
+            if (shrinked_filename.length() > 1024) {   // cap very long names before logging them
+                shrinked_filename.resize(1024);
+                shrinked_filename = std::string("name is too long: ") + shrinked_filename;
+            }
+            cerr << "Cannot load image " << shrinked_filename << std::endl;
+            std::ofstream bad_list("bad.list", std::ios::out | std::ios::app);   // append the failing name to "bad.list"
+            bad_list << shrinked_filename << std::endl;
+            return NULL;
+        }
+        cv::Mat dst;
+        if (mat.channels() == 3) cv::cvtColor(mat, dst, cv::COLOR_RGB2BGR);   // swap the first and third channels
+        else if (mat.channels() == 4) cv::cvtColor(mat, dst, cv::COLOR_RGBA2BGRA);
+        else dst = mat;
+
+        mat_ptr = new cv::Mat(dst);   // the caller must delete it (load_image_mat() below does; see also release_mat())
+
+        return (mat_cv *)mat_ptr;
+    }
+    catch (...) {
+        cerr << "OpenCV exception: load_image_mat_cv \n";
+    }
+    if (mat_ptr) delete mat_ptr;   // only reached after an exception, since the try block ends in a return
+    return NULL;
+}
+// ----------------------------------------
+
+cv::Mat load_image_mat(char *filename, int channels)   // Load `filename` as a cv::Mat; returns an empty Mat when loading fails.
+{
+    int flag = cv::IMREAD_UNCHANGED;
+    if (channels == 0) flag = cv::IMREAD_COLOR;   // 0 is treated the same as 3
+    else if (channels == 1) flag = cv::IMREAD_GRAYSCALE;
+    else if (channels == 3) flag = cv::IMREAD_COLOR;
+    else {
+        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);   // warn only; IMREAD_UNCHANGED is still used
+    }
+    //flag |= IMREAD_IGNORE_ORIENTATION;    // un-comment it if you want
+
+    cv::Mat *mat_ptr = (cv::Mat *)load_image_mat_cv(filename, flag);
+
+    if (mat_ptr == NULL) {
+        return cv::Mat();
+    }
+    cv::Mat mat = *mat_ptr;   // copy the Mat header before the heap wrapper is deleted
+    delete mat_ptr;
+
+    return mat;
+}
+// ----------------------------------------
+
+extern "C" image load_image_cv(char *filename, int channels)   // Load `filename` through OpenCV and convert it to a darknet `image`.
+{
+    cv::Mat mat = load_image_mat(filename, channels);
+
+    if (mat.empty()) {
+        return make_image(10, 10, channels);   // load failed: return a 10x10 image from make_image() (note: `channels` may be 0 here)
+    }
+    return mat_to_image(mat);
+}
+// ----------------------------------------
+
+extern "C" image load_image_resize(char *filename, int w, int h, int c, image *im)   // Returns the w x h resized image; `*im` receives the image at its original size.
+{
+    image out;
+    try {
+        cv::Mat loaded_image = load_image_mat(filename, c);
+
+        // Run the step that can throw (cv::resize) first, so `*im` is not assigned and then overwritten (leaked) by the handler below.
+        cv::Mat resized(h, w, CV_8UC3);
+        cv::resize(loaded_image, resized, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
+        out = mat_to_image(resized);
+        *im = mat_to_image(loaded_image);
+    }
+    catch (...) {
+        cerr << " OpenCV exception: load_image_resize() can't load image " << filename << " \n";   // fix: removed the stray printf-style "%s"
+        out = make_image(w, h, c);   // fall back to images created by make_image() for both results
+        *im = make_image(w, h, c);
+    }
+    return out;
+}
+// ----------------------------------------
+
+extern "C" int get_width_mat(mat_cv *mat)   // Column count of the cv::Mat behind `mat`; logs and returns 0 when `mat` is NULL.
+{
+    if (mat == NULL) {
+        cerr << " Pointer is NULL in get_width_mat() \n";
+        return 0;
+    }
+    return ((cv::Mat *)mat)->cols;   // mat_cv* is treated as a cv::Mat* throughout this file
+}
+// ----------------------------------------
+
+extern "C" int get_height_mat(mat_cv *mat)   // Row count of the cv::Mat behind `mat`; logs and returns 0 when `mat` is NULL.
+{
+    if (mat == NULL) {
+        cerr << " Pointer is NULL in get_height_mat() \n";
+        return 0;
+    }
+    return ((cv::Mat *)mat)->rows;   // mat_cv* is treated as a cv::Mat* throughout this file
+}
+// ----------------------------------------
+
+extern "C" void release_mat(mat_cv **mat)   // Delete the cv::Mat that `*mat` points to and reset `*mat` to NULL.
+{
+    try {
+        cv::Mat **mat_ptr = (cv::Mat **)mat;   // NOTE(review): `mat` itself is not checked for NULL before being dereferenced
+        if (*mat_ptr) delete *mat_ptr;
+        *mat_ptr = NULL;                       // clear the caller's handle so it cannot be released twice
+    }
+    catch (...) {
+        cerr << "OpenCV exception: release_mat \n";
+    }
+}
+
+// ====================================================================
+// IplImage
+// ====================================================================
+/*
+extern "C" int get_width_cv(mat_cv *ipl_src)
+{
+    IplImage *ipl = (IplImage *)ipl_src;
+    return ipl->width;
+}
+// ----------------------------------------
+
+extern "C" int get_height_cv(mat_cv *ipl_src)
+{
+    IplImage *ipl = (IplImage *)ipl_src;
+    return ipl->height;
+}
+// ----------------------------------------
+
+extern "C" void release_ipl(mat_cv **ipl)
+{
+    IplImage **ipl_img = (IplImage **)ipl;
+    if (*ipl_img) cvReleaseImage(ipl_img);
+    *ipl_img = NULL;
+}
+// ----------------------------------------
+
+// ====================================================================
+// image-to-ipl, ipl-to-image, image_to_mat, mat_to_image
+// ====================================================================
+
+extern "C" mat_cv *image_to_ipl(image im)
+{
+    int x, y, c;
+    IplImage *disp = cvCreateImage(cvSize(im.w, im.h), IPL_DEPTH_8U, im.c);
+    int step = disp->widthStep;
+    for (y = 0; y < im.h; ++y) {
+        for (x = 0; x < im.w; ++x) {
+            for (c = 0; c < im.c; ++c) {
+                float val = im.data[c*im.h*im.w + y*im.w + x];
+                disp->imageData[y*step + x*im.c + c] = (unsigned char)(val * 255);
+            }
+        }
+    }
+    return (mat_cv *)disp;
+}
+// ----------------------------------------
+
+extern "C" image ipl_to_image(mat_cv* src_ptr)
+{
+    IplImage* src = (IplImage*)src_ptr;
+    int h = src->height;
+    int w = src->width;
+    int c = src->nChannels;
+    image im = make_image(w, h, c);
+    unsigned char *data = (unsigned char *)src->imageData;
+    int step = src->widthStep;
+    int i, j, k;
+
+    for (i = 0; i < h; ++i) {
+        for (k = 0; k < c; ++k) {
+            for (j = 0; j < w; ++j) {
+                im.data[k*w*h + i*w + j] = data[i*step + j*c + k] / 255.;
+            }
+        }
+    }
+    return im;
+}
+// ----------------------------------------
+
+cv::Mat ipl_to_mat(IplImage *ipl)
+{
+    Mat m = cvarrToMat(ipl, true);
+    return m;
+}
+// ----------------------------------------
+
+IplImage *mat_to_ipl(cv::Mat mat)
+{
+    IplImage *ipl = new IplImage;
+    *ipl = mat;
+    return ipl;
+}
+// ----------------------------------------
+*/
+
+extern "C" cv::Mat image_to_mat(image img)   // Build an 8-bit interleaved cv::Mat (img.h rows x img.w cols x img.c channels) from the planar float image.
+{
+    int channels = img.c;
+    int width = img.w;
+    int height = img.h;
+    cv::Mat mat = cv::Mat(height, width, CV_8UC(channels));
+    int step = mat.step;   // bytes per Mat row, used for row addressing below
+
+    for (int y = 0; y < img.h; ++y) {
+        for (int x = 0; x < img.w; ++x) {
+            for (int c = 0; c < img.c; ++c) {
+                float val = img.data[c*img.h*img.w + y*img.w + x];   // planar source: channel c, row y, column x
+                mat.data[y*step + x*img.c + c] = (unsigned char)(val * 255);   // no clamping here; presumably callers keep values in [0,1] — TODO confirm
+            }
+        }
+    }
+    return mat;
+}
+// ----------------------------------------
+
+extern "C" image mat_to_image(cv::Mat mat)   // Convert an interleaved cv::Mat into a new planar float `image`, dividing each byte by 255.
+{
+    int w = mat.cols;
+    int h = mat.rows;
+    int c = mat.channels();
+    image im = make_image(w, h, c);
+    unsigned char *data = (unsigned char *)mat.data;   // read as one byte per sample; assumes an 8-bit Mat — TODO confirm for all callers
+    int step = mat.step;                               // bytes per Mat row
+    for (int y = 0; y < h; ++y) {
+        for (int k = 0; k < c; ++k) {
+            for (int x = 0; x < w; ++x) {
+                //uint8_t val = mat.ptr<uint8_t>(y)[c * x + k];
+                //uint8_t val = mat.at<Vec3b>(y, x).val[k];
+                //im.data[k*w*h + y*w + x] = val / 255.0f;
+
+                im.data[k*w*h + y*w + x] = data[y*step + x*c + k] / 255.0f;   // destination is plane k, row y, column x
+            }
+        }
+    }
+    return im;
+}
+
+image mat_to_image_cv(mat_cv *mat)   // mat_to_image() for an opaque mat_cv handle.
+{
+    return mat_to_image(*(cv::Mat*)mat);   // NOTE(review): `mat` is dereferenced without a NULL check
+}
+
+// ====================================================================
+// Window
+// ====================================================================
+extern "C" void create_window_cv(char const* window_name, int full_screen, int width, int height)   // Create window `window_name`, move it to (0,0) and resize it to width x height.
+{
+    try {
+        int window_type = cv::WINDOW_NORMAL;
+#ifdef CV_VERSION_EPOCH // OpenCV 2.x
+        if (full_screen) window_type = CV_WINDOW_FULLSCREEN;
+#else
+        if (full_screen) window_type = cv::WINDOW_FULLSCREEN;
+#endif
+        cv::namedWindow(window_name, window_type);
+        cv::moveWindow(window_name, 0, 0);
+        cv::resizeWindow(window_name, width, height);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: create_window_cv \n";   // failures are logged and otherwise ignored
+    }
+}
+// ----------------------------------------
+
+extern "C" void resize_window_cv(char const* window_name, int width, int height)   // Resize window `window_name` to width x height; OpenCV errors are logged, not propagated.
+{
+    try {
+        cv::resizeWindow(window_name, width, height);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: resize_window_cv \n";   // fix: the message used to name create_window_cv (copy-paste)
+    }
+}
+// ----------------------------------------
+
+extern "C" void move_window_cv(char const* window_name, int x, int y)   // Move window `window_name` to (x, y); OpenCV errors are logged, not propagated.
+{
+    try {
+        cv::moveWindow(window_name, x, y);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: move_window_cv \n";   // fix: the message used to name create_window_cv (copy-paste)
+    }
+}
+// ----------------------------------------
+
+extern "C" void destroy_all_windows_cv()   // Call cv::destroyAllWindows(); OpenCV errors are logged, not propagated.
+{
+    try {
+        cv::destroyAllWindows();
+    }
+    catch (...) {
+        cerr << "OpenCV exception: destroy_all_windows_cv \n";
+    }
+}
+// ----------------------------------------
+
+extern "C" int wait_key_cv(int delay)   // Return cv::waitKey(delay), or -1 when OpenCV throws.
+{
+    try {
+        return cv::waitKey(delay);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: wait_key_cv \n";
+    }
+    return -1;   // only reached after an exception
+}
+// ----------------------------------------
+
+extern "C" int wait_until_press_key_cv()   // Shorthand for wait_key_cv(0).
+{
+    return wait_key_cv(0);   // per the OpenCV docs, a delay of 0 waits for a key press indefinitely
+}
+// ----------------------------------------
+
+extern "C" void make_window(char *name, int w, int h, int fullscreen)   // Create window `name`, either full-screen or resized to w x h.
+{
+    try {
+        cv::namedWindow(name, cv::WINDOW_NORMAL);
+        if (fullscreen) {
+#ifdef CV_VERSION_EPOCH // OpenCV 2.x
+            cv::setWindowProperty(name, cv::WND_PROP_FULLSCREEN, CV_WINDOW_FULLSCREEN);
+#else
+            cv::setWindowProperty(name, cv::WND_PROP_FULLSCREEN, cv::WINDOW_FULLSCREEN);
+#endif
+        }
+        else {
+            cv::resizeWindow(name, w, h);
+            if (strcmp(name, "Demo") == 0) cv::moveWindow(name, 0, 0);   // only the window named "Demo" is moved to the origin
+        }
+    }
+    catch (...) {
+        cerr << "OpenCV exception: make_window \n";   // failures are logged and otherwise ignored
+    }
+}
+// ----------------------------------------
+
+static float get_pixel(image m, int x, int y, int c)   // Read the sample at column x, row y, channel c of the planar image `m`.
+{
+    assert(x < m.w && y < m.h && c < m.c);   // upper bounds only; negative indices are not checked
+    return m.data[c*m.h*m.w + y*m.w + x];
+}
+// ----------------------------------------
+
+extern "C" void show_image_cv(image p, const char *name)   // Show a copy of `p` (after constrain_image) in window `name`; OpenCV errors are logged, not propagated.
+{
+    image copy = copy_image(p);   // taken outside the try block so it is released on every path
+    constrain_image(copy);
+    try {
+        cv::Mat mat = image_to_mat(copy);
+        if (mat.channels() == 3) cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+        else if (mat.channels() == 4) cv::cvtColor(mat, mat, cv::COLOR_RGBA2BGR);
+        cv::namedWindow(name, cv::WINDOW_NORMAL);
+        cv::imshow(name, mat);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: show_image_cv \n";
+    }
+    // Fix: the copy used to be freed inside the try block and leaked whenever OpenCV threw.
+    free_image(copy);
+}
+// ----------------------------------------
+
+/*
+extern "C" void show_image_cv_ipl(mat_cv *disp, const char *name)
+{
+    if (disp == NULL) return;
+    char buff[256];
+    sprintf(buff, "%s", name);
+    cv::namedWindow(buff, WINDOW_NORMAL);
+    cvShowImage(buff, disp);
+}
+// ----------------------------------------
+*/
+
+extern "C" void show_image_mat(mat_cv *mat_ptr, const char *name)   // Show the cv::Mat behind `mat_ptr` in window `name`; a NULL handle is ignored.
+{
+    try {
+        if (mat_ptr == NULL) return;
+        cv::Mat &mat = *(cv::Mat *)mat_ptr;
+        cv::namedWindow(name, cv::WINDOW_NORMAL);
+        cv::imshow(name, mat);   // shown as-is: unlike show_image_cv(), no channel reordering is applied
+    }
+    catch (...) {
+        cerr << "OpenCV exception: show_image_mat \n";
+    }
+}
+
+// ====================================================================
+// Video Writer
+// ====================================================================
+extern "C" write_cv *create_video_writer(char *out_filename, char c1, char c2, char c3, char c4, int fps, int width, int height, int is_color)   // Heap-allocate a cv::VideoWriter; c1..c4 form the FOURCC code. Returns NULL if an exception is thrown.
+{
+    try {
+    cv::VideoWriter * output_video_writer =
+#ifdef CV_VERSION_EPOCH
+        new cv::VideoWriter(out_filename, CV_FOURCC(c1, c2, c3, c4), fps, cv::Size(width, height), is_color);
+#else
+        new cv::VideoWriter(out_filename, cv::VideoWriter::fourcc(c1, c2, c3, c4), fps, cv::Size(width, height), is_color);
+#endif
+
+    return (write_cv *)output_video_writer;   // NOTE(review): isOpened() is not checked, so the writer may be returned unopened
+    }
+    catch (...) {
+        cerr << "OpenCV exception: create_video_writer \n";
+    }
+    return NULL;   // only reached after an exception
+}
+
+extern "C" void write_frame_cv(write_cv *output_video_writer, mat_cv *mat)   // Append the frame behind `mat` to the given video writer.
+{
+    try {
+        cv::VideoWriter *out = (cv::VideoWriter *)output_video_writer;
+        out->write(*(cv::Mat*)mat);   // NOTE(review): neither handle is checked for NULL before use
+    }
+    catch (...) {
+        cerr << "OpenCV exception: write_frame_cv \n";
+    }
+}
+
+extern "C" void release_video_writer(write_cv **output_video_writer)   // Close and delete the writer stored in `*output_video_writer`, then reset that handle to NULL.
+{
+    try {
+        if (output_video_writer && *output_video_writer) {   // guard both the slot and the writer it holds
+            std::cout << " closing...";
+            cv::VideoWriter *out = *(cv::VideoWriter **)output_video_writer;
+            out->release();
+            delete out;
+            *output_video_writer = NULL;   // fix: clear the caller's handle (the old code only nulled the local parameter)
+            std::cout << " closed!";
+        }
+        else {
+            cerr << "OpenCV exception: output_video_writer isn't created \n";
+        }
+    }
+    catch (...) {
+        cerr << "OpenCV exception: release_video_writer \n";
+    }
+}
+
+/*
+extern "C" void *open_video_stream(const char *f, int c, int w, int h, int fps)
+{
+    VideoCapture *cap;
+    if(f) cap = new VideoCapture(f);
+    else cap = new VideoCapture(c);
+    if(!cap->isOpened()) return 0;
+    if(w) cap->set(CV_CAP_PROP_FRAME_WIDTH, w);
+    if(h) cap->set(CV_CAP_PROP_FRAME_HEIGHT, w);
+    if(fps) cap->set(CV_CAP_PROP_FPS, w);
+    return (void *) cap;
+}
+
+
+extern "C" image get_image_from_stream(void *p)
+{
+    VideoCapture *cap = (VideoCapture *)p;
+    Mat m;
+    *cap >> m;
+    if(m.empty()) return make_empty_image(0,0,0);
+    return mat_to_image(m);
+}
+
+extern "C" int show_image_cv(image im, const char* name, int ms)
+{
+    Mat m = image_to_mat(im);
+    imshow(name, m);
+    int c = waitKey(ms);
+    if (c != -1) c = c%256;
+    return c;
+}
+*/
+
+
+// ====================================================================
+// Video Capture
+// ====================================================================
+
+extern "C" cap_cv* get_capture_video_stream(const char *path) {   // Heap-allocate a cv::VideoCapture for `path`; returns NULL only if construction throws.
+    cv::VideoCapture* cap = NULL;
+    try {
+        cap = new cv::VideoCapture(path);   // isOpened() is not checked here; callers must do that themselves
+    }
+    catch (...) {
+        cerr << " OpenCV exception: video-stream " << path << " can't be opened! \n";
+    }
+    return (cap_cv*)cap;
+}
+// ----------------------------------------
+
+extern "C" cap_cv* get_capture_webcam(int index)   // Heap-allocate a cv::VideoCapture for camera `index`; returns NULL only if construction throws.
+{
+    cv::VideoCapture* cap = NULL;
+    try {
+        cap = new cv::VideoCapture(index);   // isOpened() is not checked here; callers must do that themselves
+        //cap->set(CV_CAP_PROP_FRAME_WIDTH, 1280);
+        //cap->set(CV_CAP_PROP_FRAME_HEIGHT, 960);
+    }
+    catch (...) {
+        cerr << " OpenCV exception: Web-camera " << index << " can't be opened! \n";
+    }
+    return (cap_cv*)cap;
+}
+// ----------------------------------------
+
+extern "C" void release_capture(cap_cv* cap)   // Delete the cv::VideoCapture behind `cap`.
+{
+    try {
+        cv::VideoCapture *cpp_cap = (cv::VideoCapture *)cap;
+        delete cpp_cap;   // `cap` is passed by value, so the caller's pointer is left dangling
+    }
+    catch (...) {
+        cerr << " OpenCV exception: cv::VideoCapture " << cap << " can't be released! \n";
+    }
+}
+// ----------------------------------------
+
+extern "C" mat_cv* get_capture_frame_cv(cap_cv *cap) {   // Read the next frame into a new heap cv::Mat; the caller owns and must delete the result.
+    cv::Mat *mat = NULL;
+    try {
+        mat = new cv::Mat();   // allocated up front, so an empty Mat (not NULL) is returned when nothing could be read
+        if (cap) {
+            cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
+            if (cpp_cap.isOpened())
+            {
+                cpp_cap >> *mat;
+            }
+            else std::cout << " Video-stream stopped! \n";
+        }
+        else cerr << " cv::VideoCapture isn't created \n";
+    }
+    catch (...) {
+        std::cout << " OpenCV exception: Video-stream stoped! \n";
+    }
+    return (mat_cv *)mat;   // NULL only if the allocation above threw
+}
+// ----------------------------------------
+
+extern "C" int get_stream_fps_cpp_cv(cap_cv *cap)   // Return the capture's CAP_PROP_FPS value converted to int; 25 is kept only if OpenCV throws.
+{
+    int fps = 25;   // fallback used when the query throws
+    try {
+        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;   // NOTE(review): `cap` is not checked for NULL
+#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
+        fps = cpp_cap.get(cv::CAP_PROP_FPS);
+#else                        // OpenCV 2.x
+        fps = cpp_cap.get(CV_CAP_PROP_FPS);
+#endif
+    }
+    catch (...) {
+        cerr << " Can't get FPS of source videofile. For output video FPS = 25 by default. \n";
+    }
+    return fps;
+}
+// ----------------------------------------
+
+extern "C" double get_capture_property_cv(cap_cv *cap, int property_id)   // Return cv::VideoCapture::get(property_id), or 0 when OpenCV throws.
+{
+    try {
+        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;   // NOTE(review): `cap` is not checked for NULL
+        return cpp_cap.get(property_id);
+    }
+    catch (...) {
+        cerr << " OpenCV exception: Can't get property of source video-stream. \n";
+    }
+    return 0;
+}
+// ----------------------------------------
+
+extern "C" double get_capture_frame_count_cv(cap_cv *cap)   // Return the capture's CAP_PROP_FRAME_COUNT value, or 0 when OpenCV throws.
+{
+    try {
+        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;   // NOTE(review): `cap` is not checked for NULL
+#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
+        return cpp_cap.get(cv::CAP_PROP_FRAME_COUNT);
+#else                        // OpenCV 2.x
+        return cpp_cap.get(CV_CAP_PROP_FRAME_COUNT);
+#endif
+    }
+    catch (...) {
+        cerr << " OpenCV exception: Can't get CAP_PROP_FRAME_COUNT of source videofile. \n";
+    }
+    return 0;
+}
+// ----------------------------------------
+
+extern "C" int set_capture_property_cv(cap_cv *cap, int property_id, double value)   // Return the result of cv::VideoCapture::set(property_id, value) as int; 0 when OpenCV throws.
+{
+    try {
+        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;   // NOTE(review): `cap` is not checked for NULL
+        return cpp_cap.set(property_id, value);
+    }
+    catch (...) {
+        cerr << " Can't set property of source video-stream. \n";
+    }
+    return false;
+}
+// ----------------------------------------
+
+extern "C" int set_capture_position_frame_cv(cap_cv *cap, int index)   // Set CAP_PROP_POS_FRAMES to `index`; returns the set() result as int, 0 when OpenCV throws.
+{
+    try {
+        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;   // NOTE(review): `cap` is not checked for NULL
+#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
+        return cpp_cap.set(cv::CAP_PROP_POS_FRAMES, index);
+#else                        // OpenCV 2.x
+        return cpp_cap.set(CV_CAP_PROP_POS_FRAMES, index);
+#endif
+    }
+    catch (...) {
+        cerr << " Can't set CAP_PROP_POS_FRAMES of source videofile. \n";
+    }
+    return false;
+}
+// ----------------------------------------
+
+
+
+// ====================================================================
+// ... Video Capture
+// ====================================================================
+
+extern "C" image get_image_from_stream_cpp(cap_cv *cap)   // Grab one frame from `cap` and return it as a darknet `image` (after rgbgr_image()).
+{
+    cv::Mat *src = NULL;
+    static int once = 1;   // function-level static: the start-up loop below runs on the very first call only
+    if (once) {
+        once = 0;
+        do {   // keep grabbing until a frame with non-zero size and channels arrives (no retry limit)
+            if (src) delete src;
+            src = (cv::Mat*)get_capture_frame_cv(cap);
+            if (!src) return make_empty_image(0, 0, 0);
+        } while (src->cols < 1 || src->rows < 1 || src->channels() < 1);
+        printf("Video stream: %d x %d \n", src->cols, src->rows);
+    }
+    else
+        src = (cv::Mat*)get_capture_frame_cv(cap);
+
+    if (!src) return make_empty_image(0, 0, 0);
+    image im = mat_to_image(*src);
+    rgbgr_image(im);      // presumably swaps the red and blue planes — confirm in image.c
+    if (src) delete src;  // the frame was heap-allocated by get_capture_frame_cv()
+    return im;
+}
+// ----------------------------------------
+
+extern "C" int wait_for_stream(cap_cv *cap, cv::Mat* src, int dont_close)   // Returns 1 to continue, or 0 when `src` is NULL/empty and `dont_close` is 0.
+{
+    if (!src) {
+        if (dont_close) src = new cv::Mat(416, 416, CV_8UC(3)); // cvCreateImage(cvSize(416, 416), IPL_DEPTH_8U, 3);   NOTE(review): `src` is a by-value pointer, so the caller never sees this Mat
+        else return 0;
+    }
+    if (src->cols < 1 || src->rows < 1 || src->channels() < 1) {
+        if (dont_close) {
+            delete src;// cvReleaseImage(&src);   NOTE(review): this frees the Mat the caller still points to
+            int z = 0;
+            for (z = 0; z < 20; ++z) {   // grab and discard 20 frames
+                src = (cv::Mat*)get_capture_frame_cv(cap);
+                delete src;// cvReleaseImage(&src);
+            }
+            src = new cv::Mat(416, 416, CV_8UC(3)); // cvCreateImage(cvSize(416, 416), IPL_DEPTH_8U, 3);   NOTE(review): local only, never returned or deleted
+        }
+        else return 0;
+    }
+    return 1;
+}
+// ----------------------------------------
+
+extern "C" image get_image_from_stream_resize(cap_cv *cap, int w, int h, int c, mat_cv** in_img, int dont_close)   // Grab a frame, store it in `*in_img`, and return a w x h darknet `image` made from it.
+{
+    c = c ? c : 3;   // 0 channels means "use 3"
+    cv::Mat *src = NULL;
+
+    static int once = 1;   // function-level static: the start-up loop below runs on the very first call only
+    if (once) {
+        once = 0;
+        do {   // keep grabbing until a frame with non-zero size and channels arrives (no retry limit)
+            if (src) delete src;
+            src = (cv::Mat*)get_capture_frame_cv(cap);
+            if (!src) return make_empty_image(0, 0, 0);
+        } while (src->cols < 1 || src->rows < 1 || src->channels() < 1);
+        printf("Video stream: %d x %d \n", src->cols, src->rows);
+    }
+    else
+        src = (cv::Mat*)get_capture_frame_cv(cap);
+
+    if (!wait_for_stream(cap, src, dont_close)) return make_empty_image(0, 0, 0);   // NOTE(review): `src` is passed by value; wait_for_stream() may delete it when dont_close is set
+
+    *(cv::Mat **)in_img = src;   // the original-size frame is handed to the caller, who becomes its owner
+
+    cv::Mat new_img = cv::Mat(h, w, CV_8UC(c));
+    cv::resize(*src, new_img, new_img.size(), 0, 0, cv::INTER_LINEAR);
+    if (c>1) cv::cvtColor(new_img, new_img, cv::COLOR_RGB2BGR);   // swap the first and third channels of the resized copy only
+    image im = mat_to_image(new_img);
+
+    //show_image_cv(im, "im");
+    //show_image_mat(*in_img, "in_img");
+    return im;
+}
+// ----------------------------------------
+
+extern "C" image get_image_from_stream_letterbox(cap_cv *cap, int w, int h, int c, mat_cv** in_img, int dont_close)   // Grab a frame, store a copy in `*in_img`, and return the letterbox_image() result for w x h.
+{
+    c = c ? c : 3;   // 0 channels means "use 3"
+    cv::Mat *src = NULL;
+    static int once = 1;   // function-level static: the start-up loop below runs on the very first call only
+    if (once) {
+        once = 0;
+        do {   // keep grabbing until a frame with non-zero size and channels arrives (no retry limit)
+            if (src) delete src;
+            src = (cv::Mat*)get_capture_frame_cv(cap);
+            if (!src) return make_empty_image(0, 0, 0);
+        } while (src->cols < 1 || src->rows < 1 || src->channels() < 1);
+        printf("Video stream: %d x %d \n", src->cols, src->rows);
+    }
+    else
+        src = (cv::Mat*)get_capture_frame_cv(cap);
+
+    if (!wait_for_stream(cap, src, dont_close)) return make_empty_image(0, 0, 0);   // NOTE(review): `src` is passed by value (cv::Mat *); wait_for_stream() would need a cv::Mat ** for its replacement Mat to be visible here
+
+    *in_img = (mat_cv *)new cv::Mat(src->rows, src->cols, CV_8UC(c));   // new Mat with the same size as the source frame; owned by the caller
+    cv::resize(*src, **(cv::Mat**)in_img, (*(cv::Mat**)in_img)->size(), 0, 0, cv::INTER_LINEAR);   // target size equals source size
+
+    if (c>1) cv::cvtColor(*src, *src, cv::COLOR_RGB2BGR);   // channel swap is applied to `src` only, after `*in_img` was filled
+    image tmp = mat_to_image(*src);
+    image im = letterbox_image(tmp, w, h);
+    free_image(tmp);
+    release_mat((mat_cv **)&src);   // the source frame is no longer needed
+
+    //show_image_cv(im, "im");
+    //show_image_mat(*in_img, "in_img");
+    return im;
+}
+// ----------------------------------------
+
+extern "C" void consume_frame(cap_cv *cap){   // Grab one frame from `cap` and discard it.
+    cv::Mat *discarded = (cv::Mat *)get_capture_frame_cv(cap);
+    // The frame is heap-allocated by get_capture_frame_cv(), so it has to be deleted here.
+    if (discarded != NULL)
+        delete discarded;
+}
+// ----------------------------------------
+
+
+// ====================================================================
+// Image Saving
+// ====================================================================
+extern int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+extern int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+
+extern "C" void save_mat_png(cv::Mat img_src, const char *name)   // Write `img_src` to `name` as a 3-channel PNG via stbi_write_png(); nothing is written if no 3-channel image can be produced.
+{
+    cv::Mat img_rgb;   // always 3 channels, matching comp=3 below; assumes 8-bit samples — TODO confirm for all callers
+    if (!img_src.empty() && img_src.channels() != 2) cv::cvtColor(img_src, img_rgb, img_src.channels() == 1 ? cv::COLOR_GRAY2RGB : cv::COLOR_RGB2BGR);   // fix: grayscale input used to leave img_rgb empty
+    if (!img_rgb.empty()) stbi_write_png(name, img_rgb.cols, img_rgb.rows, 3, (char *)img_rgb.data, 0);   // fix: never hand a 0x0 image / NULL buffer to the writer
+}
+// ----------------------------------------
+
+extern "C" void save_mat_jpg(cv::Mat img_src, const char *name)   // Write `img_src` to `name` as a 3-channel JPEG (quality 80) via stbi_write_jpg(); nothing is written if no 3-channel image can be produced.
+{
+    cv::Mat img_rgb;   // always 3 channels, matching comp=3 below; assumes 8-bit samples — TODO confirm for all callers
+    if (!img_src.empty() && img_src.channels() != 2) cv::cvtColor(img_src, img_rgb, img_src.channels() == 1 ? cv::COLOR_GRAY2RGB : cv::COLOR_RGB2BGR);   // fix: grayscale input used to leave img_rgb empty
+    if (!img_rgb.empty()) stbi_write_jpg(name, img_rgb.cols, img_rgb.rows, 3, (char *)img_rgb.data, 80);   // fix: never hand a 0x0 image / NULL buffer to the writer
+}
+// ----------------------------------------
+
+
+extern "C" void save_cv_png(mat_cv *img_src, const char *name)   // save_mat_png() for an opaque mat_cv handle.
+{
+    cv::Mat* img = (cv::Mat* )img_src;   // NOTE(review): `img_src` is dereferenced below without a NULL check
+    save_mat_png(*img, name);
+}
+// ----------------------------------------
+
+extern "C" void save_cv_jpg(mat_cv *img_src, const char *name)   // save_mat_jpg() for an opaque mat_cv handle.
+{
+    cv::Mat* img = (cv::Mat*)img_src;   // NOTE(review): `img_src` is dereferenced below without a NULL check
+    save_mat_jpg(*img, name);
+}
+// ----------------------------------------
+
+
+// ====================================================================
+// Draw Detection
+// ====================================================================
+// Draw the detections onto an image and print them to stdout.
+//   mat        - opaque handle, cast to cv::Mat*; nothing is done when it is NULL
+//   dets, num  - array of detections and its length
+//   thresh     - a class is reported when its probability is above this value
+//   names      - class names; names starting with "dont_show" are never reported
+//   alphabet   - unused by this implementation (kept for interface compatibility)
+//   classes    - number of classes (length of names and of each dets[i].prob)
+//   ext_output - when non-zero, the pixel coordinates of each box are printed too
+// The label is built with bounded appends, so a long list of matching class names
+// is truncated instead of overrunning the label buffer.
+extern "C" void draw_detections_cv_v3(mat_cv* mat, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
+{
+    try {
+        cv::Mat *show_img = (cv::Mat*)mat;
+        int i, j;
+        if (!show_img) return;
+
+        for (i = 0; i < num; ++i) {
+            char labelstr[4096] = { 0 };
+            // bounded replacement for strcat(): never writes past the end of labelstr
+            auto append_label = [&labelstr](const char *text) {
+                strncat(labelstr, text, sizeof(labelstr) - strlen(labelstr) - 1);
+            };
+            int class_id = -1;
+            for (j = 0; j < classes; ++j) {
+                int show = strncmp(names[j], "dont_show", 9);
+                if (dets[i].prob[j] > thresh && show) {
+                    if (class_id < 0) {
+                        // first matching class: it also defines the box color
+                        append_label(names[j]);
+                        class_id = j;
+                        char buff[64];
+                        if (dets[i].track_id) {
+                            snprintf(buff, sizeof(buff), " (id: %d)", dets[i].track_id);
+                            append_label(buff);
+                        }
+                        // snprintf: an out-of-range probability can't overflow buff
+                        snprintf(buff, sizeof(buff), " (%2.0f%%)", dets[i].prob[j] * 100);
+                        append_label(buff);
+                        printf("%s: %.0f%% ", names[j], dets[i].prob[j] * 100);
+                        if (dets[i].track_id) printf("(track = %d, sim = %f) ", dets[i].track_id, dets[i].sim);
+                    }
+                    else {
+                        append_label(", ");
+                        append_label(names[j]);
+                        printf(", %s: %.0f%% ", names[j], dets[i].prob[j] * 100);
+                    }
+                }
+            }
+            if (class_id >= 0) {
+                // line thickness scales with the image height, at least 1 px
+                int width = std::max(1.0f, show_img->rows * .002f);
+
+                int offset = class_id * 123457 % classes;
+                float red = get_color(2, offset, classes);
+                float green = get_color(1, offset, classes);
+                float blue = get_color(0, offset, classes);
+
+                // sanitize the relative box: replace NaN/Inf and cap every value at 1
+                box b = dets[i].bbox;
+                if (std::isnan(b.w) || std::isinf(b.w)) b.w = 0.5;
+                if (std::isnan(b.h) || std::isinf(b.h)) b.h = 0.5;
+                if (std::isnan(b.x) || std::isinf(b.x)) b.x = 0.5;
+                if (std::isnan(b.y) || std::isinf(b.y)) b.y = 0.5;
+                b.w = (b.w < 1) ? b.w : 1;
+                b.h = (b.h < 1) ? b.h : 1;
+                b.x = (b.x < 1) ? b.x : 1;
+                b.y = (b.y < 1) ? b.y : 1;
+
+                // relative (center, size) -> pixel corners
+                int left = (b.x - b.w / 2.)*show_img->cols;
+                int right = (b.x + b.w / 2.)*show_img->cols;
+                int top = (b.y - b.h / 2.)*show_img->rows;
+                int bot = (b.y + b.h / 2.)*show_img->rows;
+
+                if (left < 0) left = 0;
+                if (right > show_img->cols - 1) right = show_img->cols - 1;
+                if (top < 0) top = 0;
+                if (bot > show_img->rows - 1) bot = show_img->rows - 1;
+
+                float const font_size = show_img->rows / 1000.F;
+                cv::Size const text_size = cv::getTextSize(labelstr, cv::FONT_HERSHEY_COMPLEX_SMALL, font_size, 1, 0);
+                cv::Point pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2;
+                pt1.x = left;
+                pt1.y = top;
+                pt2.x = right;
+                pt2.y = bot;
+                pt_text.x = left;
+                pt_text.y = top - 4;// 12;
+                pt_text_bg1.x = left;
+                pt_text_bg1.y = top - (3 + 18 * font_size);
+                pt_text_bg2.x = right;
+                // the label background is at least as wide as the label text
+                if ((right - left) < text_size.width) pt_text_bg2.x = left + text_size.width;
+                pt_text_bg2.y = top;
+                cv::Scalar color;
+                color.val[0] = red * 256;
+                color.val[1] = green * 256;
+                color.val[2] = blue * 256;
+
+                cv::rectangle(*show_img, pt1, pt2, color, width, 8, 0);
+                if (ext_output)
+                    printf("\t(left_x: %4.0f   top_y: %4.0f   width: %4.0f   height: %4.0f)\n",
+                    (float)left, (float)top, b.w*show_img->cols, b.h*show_img->rows);
+                else
+                    printf("\n");
+
+                cv::rectangle(*show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0);
+                cv::rectangle(*show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0);    // filled
+                cv::Scalar black_color = CV_RGB(0, 0, 0);
+                cv::putText(*show_img, labelstr, pt_text, cv::FONT_HERSHEY_COMPLEX_SMALL, font_size, black_color, 2 * font_size, CV_AA);
+                // cv::FONT_HERSHEY_COMPLEX_SMALL, cv::FONT_HERSHEY_SIMPLEX
+            }
+        }
+        if (ext_output) {
+            fflush(stdout);
+        }
+    }
+    catch (...) {
+        cerr << "OpenCV exception: draw_detections_cv_v3() \n";
+    }
+}
+// ----------------------------------------
+
+// ====================================================================
+// Draw Loss & Accuracy chart
+// ====================================================================
+// Create the training chart image and optionally show it in a window.
+//   windows_name    - title of the window used when dont_show is 0
+//   max_img_loss    - loss value that corresponds to the top of the chart
+//   max_batches     - iteration count that corresponds to the right edge of the chart
+//   number_of_lines - number of grid lines per axis (every 10th one is labelled)
+//   img_size        - width and height of the square chart image in pixels
+//   dont_show       - when non-zero no window is created
+//   chart_path      - optional image file to continue from; when it is NULL, empty
+//                     or can't be read, a new chart is drawn
+// Returns a heap-allocated cv::Mat as an opaque mat_cv* (ownership passes to the caller).
+extern "C" mat_cv* draw_train_chart(char *windows_name, float max_img_loss, int max_batches, int number_of_lines, int img_size, int dont_show, char* chart_path)
+{
+    int img_offset = 60;
+    int draw_size = img_size - img_offset;
+    cv::Mat *img_ptr = new cv::Mat(img_size, img_size, CV_8UC3, CV_RGB(255, 255, 255));
+    cv::Mat &img = *img_ptr;
+    cv::Point pt1, pt2, pt_text;
+
+    try {
+        // load chart from file
+        bool chart_loaded = false;
+        if (chart_path != NULL && chart_path[0] != '\0') {
+            cv::Mat loaded_chart = cv::imread(chart_path);
+            if (!loaded_chart.empty()) {
+                img = loaded_chart;
+                chart_loaded = true;
+            }
+            else {
+                // keep the white canvas instead of replacing it with an empty image
+                cerr << " Can't load chart image: " << chart_path << " - a new chart is drawn \n";
+            }
+        }
+
+        if (!chart_loaded) {
+            // draw new chart
+            char char_buff[100];
+            int i;
+            // avoids a division by zero in the label computations below
+            if (number_of_lines < 1) number_of_lines = 1;
+            // grid lines along the loss axis
+            pt1.x = img_offset; pt2.x = img_size, pt_text.x = 30;
+            for (i = 1; i <= number_of_lines; ++i) {
+                pt1.y = pt2.y = (float)i * draw_size / number_of_lines;
+                cv::line(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
+                if (i % 10 == 0) {
+                    sprintf(char_buff, "%2.1f", max_img_loss*(number_of_lines - i) / number_of_lines);
+                    pt_text.y = pt1.y + 3;
+
+                    cv::putText(img, char_buff, pt_text, cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 0), 1, CV_AA);
+                    cv::line(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
+                }
+            }
+            // grid lines along the iteration axis
+            pt1.y = draw_size; pt2.y = 0, pt_text.y = draw_size + 15;
+            for (i = 0; i <= number_of_lines; ++i) {
+                pt1.x = pt2.x = img_offset + (float)i * draw_size / number_of_lines;
+                cv::line(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
+                if (i % 10 == 0) {
+                    sprintf(char_buff, "%d", max_batches * i / number_of_lines);
+                    pt_text.x = pt1.x - 20;
+                    cv::putText(img, char_buff, pt_text, cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 0), 1, CV_AA);
+                    cv::line(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
+                }
+            }
+
+            cv::putText(img, "Loss", cv::Point(10, 60), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 255), 1, CV_AA);
+            cv::putText(img, "Iteration number", cv::Point(draw_size / 2, img_size - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 0), 1, CV_AA);
+            char max_batches_buff[100];
+            sprintf(max_batches_buff, "in cfg max_batches=%d", max_batches);
+            cv::putText(img, max_batches_buff, cv::Point(draw_size - 195, img_size - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 0), 1, CV_AA);
+            cv::putText(img, "Press 's' to save : chart.png", cv::Point(5, img_size - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 0), 1, CV_AA);
+        }
+
+        if (!dont_show) {
+            printf(" If error occurs - run training with flag: -dont_show \n");
+            cv::namedWindow(windows_name, cv::WINDOW_NORMAL);
+            cv::moveWindow(windows_name, 0, 0);
+            cv::resizeWindow(windows_name, img_size, img_size);
+            cv::imshow(windows_name, img);
+            cv::waitKey(20);
+        }
+    }
+    catch (...) {
+        cerr << "OpenCV exception: draw_train_chart() \n";
+    }
+    return (mat_cv*)img_ptr;
+}
+// ----------------------------------------
+
+// Add the current training state to the chart created by draw_train_chart().
+//   windows_name   - window title; also used as the name of a second saved PNG
+//   img_src        - opaque handle of the chart image (cast to cv::Mat*)
+//   img_size       - width/height of the chart in pixels
+//   avg_loss       - loss to plot, scaled by max_img_loss
+//   current_batch  - current iteration, scaled by max_batches
+//   precision      - accuracy in [0, 1], drawn only when draw_precision is non-zero
+//   accuracy_name  - caption of the accuracy curve (drawn once)
+//   contr_acc      - contrastive accuracy, drawn only when it is >= 0
+//   dont_show      - when non-zero the window is not updated and no key is read
+//   mjpeg_port     - when > 0 the chart is also sent through send_mjpeg()
+//   time_remaining - estimated remaining training time in hours (text only)
+// The chart is saved on key 's', on the last iteration and every 100 iterations.
+// All text is formatted with snprintf so extreme values can't overrun the buffer.
+extern "C" void draw_train_loss(char *windows_name, mat_cv* img_src, int img_size, float avg_loss, float max_img_loss, int current_batch, int max_batches,
+    float precision, int draw_precision, char *accuracy_name, float contr_acc, int dont_show, int mjpeg_port, double time_remaining)
+{
+    try {
+        cv::Mat &img = *(cv::Mat*)img_src;
+        int img_offset = 60;
+        int draw_size = img_size - img_offset;
+        char char_buff[256];
+        cv::Point pt1, pt2;
+        // loss point
+        pt1.x = img_offset + draw_size * (float)current_batch / max_batches;
+        pt1.y = draw_size * (1 - avg_loss / max_img_loss);
+        if (pt1.y < 0) pt1.y = 1;    // loss above the chart range is pinned to the top
+        cv::circle(img, pt1, 1, CV_RGB(0, 0, 255), CV_FILLED, 8, 0);
+
+        // contrastive accuracy
+        if (contr_acc >= 0) {
+            static float old_contr_acc = 0;
+
+            if (current_batch > 0) {
+                cv::line(img,
+                    cv::Point(img_offset + draw_size * (float)(current_batch - 1) / max_batches, draw_size * (1 - old_contr_acc)),
+                    cv::Point(img_offset + draw_size * (float)current_batch / max_batches, draw_size * (1 - contr_acc)),
+                    CV_RGB(0, 150, 70), 1, 8, 0);
+            }
+            old_contr_acc = contr_acc;
+
+            snprintf(char_buff, sizeof(char_buff), "C:%2.1f%% ", contr_acc * 100);
+            // thick white text first erases the previous value
+            cv::putText(img, char_buff, cv::Point(1, 45), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 255, 255), 5, CV_AA);
+            cv::putText(img, char_buff, cv::Point(1, 45), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 150, 70), 1, CV_AA);
+        }
+
+        // precision
+        if (draw_precision) {
+            static float old_precision = 0;
+            static float max_precision = 0;
+            static int iteration_old = 0;
+            static int text_iteration_old = 0;
+            if (iteration_old == 0)
+                cv::putText(img, accuracy_name, cv::Point(10, 12), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 0, 0), 1, CV_AA);
+
+            if (iteration_old != 0){
+                    cv::line(img,
+                        cv::Point(img_offset + draw_size * (float)iteration_old / max_batches, draw_size * (1 - old_precision)),
+                        cv::Point(img_offset + draw_size * (float)current_batch / max_batches, draw_size * (1 - precision)),
+                        CV_RGB(255, 0, 0), 1, 8, 0);
+            }
+
+            snprintf(char_buff, sizeof(char_buff), "%2.1f%% ", precision * 100);
+            cv::putText(img, char_buff, cv::Point(10, 28), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 255, 255), 5, CV_AA);
+            cv::putText(img, char_buff, cv::Point(10, 28), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(200, 0, 0), 1, CV_AA);
+
+            // annotate the curve on a jump > 0.1, on a new maximum, or every max_batches / 10 iterations
+            if ((std::fabs(old_precision - precision) > 0.1)  || (max_precision < precision) || (current_batch - text_iteration_old) >= max_batches / 10) {
+                text_iteration_old = current_batch;
+                max_precision = std::max(max_precision, precision);
+                snprintf(char_buff, sizeof(char_buff), "%2.0f%% ", precision * 100);
+                cv::putText(img, char_buff, cv::Point(pt1.x - 30, draw_size * (1 - precision) + 15), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 255, 255), 5, CV_AA);
+                cv::putText(img, char_buff, cv::Point(pt1.x - 30, draw_size * (1 - precision) + 15), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(200, 0, 0), 1, CV_AA);
+            }
+            old_precision = precision;
+            iteration_old = current_batch;
+        }
+        // status line: a diverged loss or a huge time estimate is truncated, not written out of bounds
+        snprintf(char_buff, sizeof(char_buff), "current avg loss = %2.4f    iteration = %d    approx. time left = %2.2f hours", avg_loss, current_batch, time_remaining);
+        pt1.x = 15, pt1.y = draw_size + 18;
+        pt2.x = pt1.x + 800, pt2.y = pt1.y + 20;
+        cv::rectangle(img, pt1, pt2, CV_RGB(255, 255, 255), CV_FILLED, 8, 0);
+        pt1.y += 15;
+        cv::putText(img, char_buff, pt1, cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(0, 0, 100), 1, CV_AA);
+
+        int k = 0;
+        if (!dont_show) {
+            cv::imshow(windows_name, img);
+            k = cv::waitKey(20);
+        }
+        static int old_batch = 0;
+        if (k == 's' || current_batch == (max_batches - 1) || (current_batch / 100 > old_batch / 100)) {
+            old_batch = current_batch;
+            save_mat_png(img, "chart.png");
+            save_mat_png(img, windows_name);
+            cv::putText(img, "- Saved", cv::Point(260, img_size - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 0, 0), 1, CV_AA);
+        }
+        else
+            // white text hides the "- Saved" marker again
+            cv::putText(img, "- Saved", cv::Point(260, img_size - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.7, CV_RGB(255, 255, 255), 1, CV_AA);
+
+        if (mjpeg_port > 0) send_mjpeg((mat_cv *)&img, mjpeg_port, 500000, 70);
+    }
+    catch (...) {
+        cerr << "OpenCV exception: draw_train_loss() \n";
+    }
+}
+// ----------------------------------------
+
+
+// ====================================================================
+// Data augmentation
+// ====================================================================
+
+// Crop, resize and randomly distort one training image.
+//   mat                          - opaque handle of the source image (cast to cv::Mat*)
+//   w, h                         - size of the returned image
+//   pleft, ptop, swidth, sheight - crop rectangle in source pixels; parts that fall outside
+//                                  the source are filled with the mean color of the source
+//   flip                         - when non-zero the image is mirrored around the y-axis
+//   dhue, dsat, dexp             - hue offset and saturation / exposure multipliers;
+//                                  (0, 1, 1) skips the color step
+//   gaussian_noise               - when non-zero, noise strength (clamped to [0, 127])
+//   blur                         - 0: none, 1: blur everything except the truth boxes,
+//                                  otherwise: blur the whole image with a kernel of about this size
+//   num_boxes, truth_size, truth - truth boxes (stride truth_size floats), read only when blur == 1
+// Returns the augmented image. If a std::exception is thrown, the error is reported and the
+// unaugmented source (not resized to w x h) is converted and returned instead.
+extern "C" image image_data_augmentation(mat_cv* mat, int w, int h,
+    int pleft, int ptop, int swidth, int sheight, int flip,
+    float dhue, float dsat, float dexp,
+    int gaussian_noise, int blur, int num_boxes, int truth_size, float *truth)
+{
+    image out;
+    try {
+        cv::Mat img = *(cv::Mat *)mat;
+
+        // crop
+        cv::Rect src_rect(pleft, ptop, swidth, sheight);
+        cv::Rect img_rect(cv::Point2i(0, 0), img.size());
+        // part of the requested crop that actually lies inside the source image
+        cv::Rect new_src_rect = src_rect & img_rect;
+
+        // where that part lands inside the crop canvas (shifted when pleft / ptop are negative)
+        cv::Rect dst_rect(cv::Point2i(std::max<int>(0, -pleft), std::max<int>(0, -ptop)), new_src_rect.size());
+        cv::Mat sized;
+
+        // crop equals the full image: only a resize is needed
+        if (src_rect.x == 0 && src_rect.y == 0 && src_rect.size() == img.size()) {
+            cv::resize(img, sized, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
+        }
+        else {
+            cv::Mat cropped(src_rect.size(), img.type());
+            //cropped.setTo(cv::Scalar::all(0));
+            // background for the area outside the source: mean color of the source
+            cropped.setTo(cv::mean(img));
+
+            img(new_src_rect).copyTo(cropped(dst_rect));
+
+            // resize
+            cv::resize(cropped, sized, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
+        }
+
+        // flip
+        if (flip) {
+            cv::Mat cropped;
+            cv::flip(sized, cropped, 1);    // 0 - x-axis, 1 - y-axis, -1 - both axes (x & y)
+            sized = cropped.clone();
+        }
+
+        // HSV augmentation
+        // cv::COLOR_BGR2HSV, cv::COLOR_RGB2HSV, cv::COLOR_HSV2BGR, cv::COLOR_HSV2RGB
+        if (dsat != 1 || dexp != 1 || dhue != 0) {
+            if (img.channels() >= 3)
+            {
+                cv::Mat hsv_src;
+                cvtColor(sized, hsv_src, cv::COLOR_RGB2HSV);    // RGB to HSV
+
+                std::vector<cv::Mat> hsv;
+                cv::split(hsv_src, hsv);
+
+                // hsv[0] = hue, hsv[1] = saturation, hsv[2] = value
+                hsv[1] *= dsat;
+                hsv[2] *= dexp;
+                hsv[0] += 179 * dhue;
+
+                cv::merge(hsv, hsv_src);
+
+                cvtColor(hsv_src, sized, cv::COLOR_HSV2RGB);    // HSV to RGB (the same as previous)
+            }
+            else
+            {
+                // fewer than 3 channels: only the exposure multiplier is applied
+                sized *= dexp;
+            }
+        }
+
+        //std::stringstream window_name;
+        //window_name << "augmentation - " << ipl;
+        //cv::imshow(window_name.str(), sized);
+        //cv::waitKey(0);
+
+        if (blur) {
+            cv::Mat dst(sized.size(), sized.type());
+            if (blur == 1) {
+                cv::GaussianBlur(sized, dst, cv::Size(17, 17), 0);
+                //cv::bilateralFilter(sized, dst, 17, 75, 75);
+            }
+            else {
+                // round the requested size to an odd kernel size
+                int ksize = (blur / 2) * 2 + 1;
+                cv::Size kernel_size = cv::Size(ksize, ksize);
+                cv::GaussianBlur(sized, dst, kernel_size, 0);
+                //cv::medianBlur(sized, dst, ksize);
+                //cv::bilateralFilter(sized, dst, ksize, 75, 75);
+
+                // sharpen
+                //cv::Mat img_tmp;
+                //cv::GaussianBlur(dst, img_tmp, cv::Size(), 3);
+                //cv::addWeighted(dst, 1.5, img_tmp, -0.5, 0, img_tmp);
+                //dst = img_tmp;
+            }
+            //std::cout << " blur num_boxes = " << num_boxes << std::endl;
+
+            // blur == 1: copy the sharp pixels of every truth box back over the blurred image
+            if (blur == 1) {
+                cv::Rect img_rect(0, 0, sized.cols, sized.rows);
+                int t;
+                for (t = 0; t < num_boxes; ++t) {
+                    box b = float_to_box_stride(truth + t*truth_size, 1);
+                    // a box with x == 0 ends the list
+                    if (!b.x) break;
+                    int left = (b.x - b.w / 2.)*sized.cols;
+                    int width = b.w*sized.cols;
+                    int top = (b.y - b.h / 2.)*sized.rows;
+                    int height = b.h*sized.rows;
+                    cv::Rect roi(left, top, width, height);
+                    // clip the box to the image
+                    roi = roi & img_rect;
+
+                    sized(roi).copyTo(dst(roi));
+                }
+            }
+            dst.copyTo(sized);
+        }
+
+        if (gaussian_noise) {
+            cv::Mat noise = cv::Mat(sized.size(), sized.type());
+            gaussian_noise = std::min(gaussian_noise, 127);
+            gaussian_noise = std::max(gaussian_noise, 0);
+            cv::randn(noise, 0, gaussian_noise);  // mean and standard deviation
+            cv::Mat sized_norm = sized + noise;
+            //cv::normalize(sized_norm, sized_norm, 0.0, 255.0, cv::NORM_MINMAX, sized.type());
+            //cv::imshow("source", sized);
+            //cv::imshow("gaussian noise", sized_norm);
+            //cv::waitKey(0);
+            sized = sized_norm;
+        }
+
+        //char txt[100];
+        //sprintf(txt, "blur = %d", blur);
+        //cv::putText(sized, txt, cv::Point(100, 100), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.7, CV_RGB(255, 0, 0), 1, CV_AA);
+
+        // Mat -> image
+        out = mat_to_image(sized);
+    }
+    catch (const std::exception& e) {
+        cerr << "OpenCV can't augment image: " << w << " x " << h << " \n" << e.what() << " \n";
+        // fallback: convert the unmodified source image
+        out = mat_to_image(*(cv::Mat*)mat);
+    }
+    return out;
+}
+
+// Weighted in-place blend: new_img = alpha * new_img + beta * old_img.
+// Both float buffers are wrapped in cv::Mat headers without copying, so the
+// result is written straight into new_img.data.
+extern "C" void blend_images_cv(image new_img, float alpha, image old_img, float beta)
+{
+    const cv::Size new_size(new_img.w, new_img.h);
+    const cv::Size old_size(old_img.w, old_img.h);
+
+    cv::Mat blended(new_size, CV_32FC(new_img.c), new_img.data);
+    cv::Mat previous(old_size, CV_32FC(old_img.c), old_img.data);
+
+    cv::addWeighted(blended, alpha, previous, beta, 0.0, blended);
+}
+
+// Gaussian blurring of an image with a ksize x ksize kernel.
+//   src_img - image to blur (left untouched)
+//   ksize   - requested kernel size; cv::GaussianBlur only accepts positive odd
+//             sizes, so even values are rounded up to the next odd value and
+//             values below 1 become 1
+// Returns a newly created image holding the blurred result.
+extern "C" image blur_image(image src_img, int ksize)
+{
+    cv::Mat src = image_to_mat(src_img);
+    cv::Mat dst;
+    // an even or non-positive size would make cv::GaussianBlur throw
+    if (ksize < 1) ksize = 1;
+    ksize |= 1;
+    cv::Size kernel_size = cv::Size(ksize, ksize);
+    cv::GaussianBlur(src, dst, kernel_size, 0);
+    //cv::bilateralFilter(src, dst, ksize, 75, 75);
+    image dst_img = mat_to_image(dst);
+    return dst_img;
+}
+
+// ====================================================================
+// Draw object - adversarial attack dnn
+// ====================================================================
+
+// Selection state shared between callback_mouse_click() (writer) and cv_draw_object() (reader).
+// NOTE(review): presumably atomic because the mouse callback may run on another thread - confirm.
+std::atomic<int> x_start, y_start;          // position of the left-button press
+std::atomic<int> x_end, y_end;              // latest pointer position, clamped to >= 0
+std::atomic<int> x_size, y_size;            // |pointer - start| per axis
+std::atomic<bool> draw_select, selected;    // button currently held / selection completed
+
+// Mouse handler of the marking window.
+// A left-button press starts a selection, pointer movement keeps the end point
+// and size up to date, and a left-button release completes the selection.
+void callback_mouse_click(int event, int x, int y, int flags, void* user_data)
+{
+    switch (event) {
+    case cv::EVENT_LBUTTONDOWN:
+        draw_select = true;
+        selected = false;
+        x_start = x;
+        y_start = y;
+        break;
+
+    case cv::EVENT_LBUTTONUP:
+    case cv::EVENT_MOUSEMOVE:
+        // both events refresh the size and the (non-negative) end point
+        x_size = abs(x - x_start);
+        y_size = abs(y - y_start);
+        x_end = std::max(x, 0);
+        y_end = std::max(y, 0);
+        if (event == cv::EVENT_LBUTTONUP) {
+            // releasing the button finishes the selection
+            draw_select = false;
+            selected = true;
+        }
+        break;
+
+    default:
+        break;
+    }
+}
+
+// Let the user mark one box with the mouse and choose the attack parameters with trackbars.
+//   sized      - image shown in the "Marking image" window
+//   truth_cpu  - receives the marked box as 5 floats: relative center x, center y,
+//                width, height and the class id (written only when a box was selected)
+//   max_boxes, num_truth - unused by this implementation
+//   it_num_set - receives the value of the "iterations" trackbar
+//   lr_set     - receives 1 / 2^(value of the "learning_rate exp" trackbar)
+//   boxonly    - bound directly to the "box-only" trackbar
+//   classes    - number of classes (upper bound of the "class_id" trackbar is classes - 1)
+//   names      - optional class names shown for the chosen class id
+// The function returns once a box has been selected or ESC is pressed.
+extern "C" void cv_draw_object(image sized, float *truth_cpu, int max_boxes, int num_truth, int *it_num_set, float *lr_set, int *boxonly, int classes, char **names)
+{
+    cv::Mat frame = image_to_mat(sized);
+    if(frame.channels() == 3) cv::cvtColor(frame, frame, cv::COLOR_RGB2BGR);
+    cv::Mat frame_clone = frame.clone();
+
+    // the selection state is global: without this reset a second call would
+    // return immediately with the box selected during the previous call
+    draw_select = false;
+    selected = false;
+
+    std::string const window_name = "Marking image";
+    cv::namedWindow(window_name, cv::WINDOW_NORMAL);
+    cv::resizeWindow(window_name, 1280, 720);
+    cv::imshow(window_name, frame);
+    cv::moveWindow(window_name, 0, 0);
+    cv::setMouseCallback(window_name, callback_mouse_click);
+
+
+    int it_trackbar_value = 200;
+    std::string const it_trackbar_name = "iterations";
+    cv::createTrackbar(it_trackbar_name, window_name, &it_trackbar_value, 1000);
+
+    int lr_trackbar_value = 10;
+    std::string const lr_trackbar_name = "learning_rate exp";
+    cv::createTrackbar(lr_trackbar_name, window_name, &lr_trackbar_value, 20);
+
+    int cl_trackbar_value = 0;
+    std::string const cl_trackbar_name = "class_id";
+    cv::createTrackbar(cl_trackbar_name, window_name, &cl_trackbar_value, classes-1);
+
+    std::string const bo_trackbar_name = "box-only";
+    cv::createTrackbar(bo_trackbar_name, window_name, boxonly, 1);
+
+    while (!selected) {
+#ifndef CV_VERSION_EPOCH
+        int pressed_key = cv::waitKeyEx(20);    // OpenCV 3.x
+#else
+        int pressed_key = cv::waitKey(20);        // OpenCV 2.x
+#endif
+        if (pressed_key == 27 || pressed_key == 1048603) break;// break;  // ESC - save & exit
+
+        // redraw the overlay on a fresh copy of the frame
+        frame_clone = frame.clone();
+        std::string lr_value = "learning_rate = " + std::to_string(1.0 / pow(2, lr_trackbar_value));
+        cv::putText(frame_clone, lr_value, cv::Point2i(10, 20), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(10, 50, 10), 3);
+        cv::putText(frame_clone, lr_value, cv::Point2i(10, 20), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(20, 120, 60), 2);
+        cv::putText(frame_clone, lr_value, cv::Point2i(10, 20), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(50, 200, 100), 1);
+
+        if (names) {
+            std::string obj_name = names[cl_trackbar_value];
+            cv::putText(frame_clone, obj_name, cv::Point2i(10, 40), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(10, 50, 10), 3);
+            cv::putText(frame_clone, obj_name, cv::Point2i(10, 40), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(20, 120, 60), 2);
+            cv::putText(frame_clone, obj_name, cv::Point2i(10, 40), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(50, 200, 100), 1);
+        }
+
+        if (draw_select) {
+            int const xs = x_start, xe = x_end;
+            int const ys = y_start, ye = y_end;
+            cv::Rect selected_rect(
+                cv::Point2i(std::min(xs, xe), std::min(ys, ye)),
+                cv::Size(x_size, y_size));
+
+            rectangle(frame_clone, selected_rect, cv::Scalar(150, 200, 150));
+        }
+
+
+        cv::imshow(window_name, frame_clone);
+    }
+
+    if (selected) {
+        int const xs = x_start, xe = x_end;
+        int const ys = y_start, ye = y_end;
+        cv::Rect selected_rect(
+            cv::Point2i(std::min(xs, xe), std::min(ys, ye)),
+            cv::Size(x_size, y_size));
+
+        printf(" x_start = %d, y_start = %d, x_size = %d, y_size = %d \n",
+            x_start.load(), y_start.load(), x_size.load(), y_size.load());
+
+        rectangle(frame, selected_rect, cv::Scalar(150, 200, 150));
+        cv::imshow(window_name, frame);
+        cv::waitKey(100);
+
+        // absolute extents: dragging right-to-left or bottom-to-top must not
+        // produce a negative width / height
+        float const width = (float)abs(xe - xs);
+        float const height = (float)abs(ye - ys);
+
+        float const relative_center_x = (std::min(xs, xe) + width / 2) / frame.cols;
+        float const relative_center_y = (std::min(ys, ye) + height / 2) / frame.rows;
+        float const relative_width = width / frame.cols;
+        float const relative_height = height / frame.rows;
+
+        // only one box is marked, it is stored as the first truth entry
+        truth_cpu[0] = relative_center_x;
+        truth_cpu[1] = relative_center_y;
+        truth_cpu[2] = relative_width;
+        truth_cpu[3] = relative_height;
+        truth_cpu[4] = cl_trackbar_value;
+    }
+
+    *it_num_set = it_trackbar_value;
+    *lr_set = 1.0 / pow(2, lr_trackbar_value);
+}
+
+// ====================================================================
+// Show Anchors
+// ====================================================================
+// Visualize the result of the anchor clustering.
+//   number_of_boxes        - number of (width, height) pairs in rel_width_height_array
+//   num_of_clusters        - number of cluster centers in anchors_data.centers
+//   rel_width_height_array - interleaved width, height values of all boxes
+//   anchors_data           - cluster centers and the cluster assignment of every box
+//   width, height          - values the box sizes are divided by to fit the 700 x 700 canvas
+// Every box is drawn as a point colored by its cluster, every center as a white
+// rectangle anchored at the origin. The picture is saved as "cloud.png" and shown
+// until a key is pressed.
+extern "C" void show_acnhors(int number_of_boxes, int num_of_clusters, float *rel_width_height_array, model anchors_data, int width, int height)
+{
+    cv::Mat labels = cv::Mat(number_of_boxes, 1, CV_32SC1);
+    cv::Mat points = cv::Mat(number_of_boxes, 2, CV_32FC1);
+    cv::Mat centers = cv::Mat(num_of_clusters, 2, CV_32FC1);
+
+    for (int i = 0; i < number_of_boxes; ++i) {
+        points.at<float>(i, 0) = rel_width_height_array[i * 2];
+        points.at<float>(i, 1) = rel_width_height_array[i * 2 + 1];
+    }
+
+    for (int i = 0; i < num_of_clusters; ++i) {
+        centers.at<float>(i, 0) = anchors_data.centers.vals[i][0];
+        centers.at<float>(i, 1) = anchors_data.centers.vals[i][1];
+    }
+
+    for (int i = 0; i < number_of_boxes; ++i) {
+        labels.at<int>(i, 0) = anchors_data.assignments[i];
+    }
+
+    size_t img_size = 700;
+    // zero-initialized (black) canvas: cv::Mat(rows, cols, type) alone leaves the
+    // pixels uninitialized, which showed up as random background noise
+    cv::Mat img = cv::Mat::zeros((int)img_size, (int)img_size, CV_8UC3);
+
+    for (int i = 0; i < number_of_boxes; ++i) {
+        cv::Point pt;
+        pt.x = points.at<float>(i, 0) * img_size / width;
+        pt.y = points.at<float>(i, 1) * img_size / height;
+        int cluster_idx = labels.at<int>(i, 0);
+        // pseudo-random but stable color per cluster
+        int red_id = (cluster_idx * (uint64_t)123 + 55) % 255;
+        int green_id = (cluster_idx * (uint64_t)321 + 33) % 255;
+        int blue_id = (cluster_idx * (uint64_t)11 + 99) % 255;
+        cv::circle(img, pt, 1, CV_RGB(red_id, green_id, blue_id), CV_FILLED, 8, 0);
+        //if(pt.x > img_size || pt.y > img_size) printf("\n pt.x = %d, pt.y = %d \n", pt.x, pt.y);
+    }
+
+    for (int j = 0; j < num_of_clusters; ++j) {
+        cv::Point pt1, pt2;
+        pt1.x = pt1.y = 0;
+        pt2.x = centers.at<float>(j, 0) * img_size / width;
+        pt2.y = centers.at<float>(j, 1) * img_size / height;
+        cv::rectangle(img, pt1, pt2, CV_RGB(255, 255, 255), 1, 8, 0);
+    }
+    save_mat_png(img, "cloud.png");
+    cv::imshow("clusters", img);
+    cv::waitKey(0);
+    cv::destroyAllWindows();
+}
+
+// Print the version of the OpenCV library this file was compiled against to stderr.
+void show_opencv_info()
+{
+    std::cerr << " OpenCV version: " << CV_VERSION_MAJOR << "." << CV_VERSION_MINOR << ".";
+    // the revision string and the OCV_D suffix are adjacent string literals
+    std::cerr << CVAUX_STR(CV_VERSION_REVISION) OCV_D << std::endl;
+}
+
+
+
+}   // extern "C"
+#else  // OPENCV
+// Build without OpenCV: only report that OpenCV is not available.
+extern "C" void show_opencv_info()
+{
+    static const char *const no_opencv_msg = " OpenCV isn't used - data augmentation will be slow \n";
+    std::cerr << no_opencv_msg;
+}
+// Build without OpenCV: window helpers are no-ops, key queries report "no key" (0).
+extern "C" int wait_key_cv(int delay)
+{
+    (void)delay;
+    return 0;
+}
+extern "C" int wait_until_press_key_cv()
+{
+    return 0;
+}
+extern "C" void destroy_all_windows_cv()
+{
+}
+extern "C" void resize_window_cv(char const* window_name, int width, int height)
+{
+    (void)window_name;
+    (void)width;
+    (void)height;
+}
+#endif // OPENCV
diff --git a/darknet-master/src/image_opencv.h b/darknet-master/src/image_opencv.h
new file mode 100644
index 0000000..19d16e1
--- /dev/null
+++ b/darknet-master/src/image_opencv.h
@@ -0,0 +1,135 @@
+#ifndef IMAGE_OPENCV_H
+#define IMAGE_OPENCV_H
+
+#include "image.h"
+#include "matrix.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef OPENCV
+
+// Opaque handle types of the C API (e.g. a mat_cv* is cast to cv::Mat* inside image_opencv.cpp)
+typedef void* mat_cv;
+typedef void* cap_cv;
+typedef void* write_cv;
+
+//typedef struct mat_cv mat_cv;
+//typedef struct cap_cv cap_cv;
+//typedef struct write_cv write_cv;
+
+// cv::Mat
+mat_cv *load_image_mat_cv(const char *filename, int flag);
+image load_image_cv(char *filename, int channels);
+image load_image_resize(char *filename, int w, int h, int c, image *im);
+int get_width_mat(mat_cv *mat);
+int get_height_mat(mat_cv *mat);
+void release_mat(mat_cv **mat);
+
+// IplImage - to delete
+//int get_width_cv(mat_cv *ipl);
+//int get_height_cv(mat_cv *ipl);
+//void release_ipl(mat_cv **ipl);
+
+// image-to-ipl, ipl-to-image, image_to_mat, mat_to_image
+//mat_cv *image_to_ipl(image im);           // to delete
+//image ipl_to_image(mat_cv* src_ptr);    // to delete
+
+
+// mat_cv *image_to_ipl(image im)
+// image ipl_to_image(mat_cv* src_ptr)
+// cv::Mat ipl_to_mat(IplImage *ipl)
+// IplImage *mat_to_ipl(cv::Mat mat)
+// Mat image_to_mat(image img)
+// image mat_to_image(cv::Mat mat)
+image mat_to_image_cv(mat_cv *mat);
+
+// Window
+void create_window_cv(char const* window_name, int full_screen, int width, int height);
+void resize_window_cv(char const* window_name, int width, int height);
+void move_window_cv(char const* window_name, int x, int y);
+void destroy_all_windows_cv();
+int wait_key_cv(int delay);
+int wait_until_press_key_cv();
+void make_window(char *name, int w, int h, int fullscreen);
+void show_image_cv(image p, const char *name);
+//void show_image_cv_ipl(mat_cv *disp, const char *name);
+void show_image_mat(mat_cv *mat_ptr, const char *name);
+
+// Video Writer
+write_cv *create_video_writer(char *out_filename, char c1, char c2, char c3, char c4, int fps, int width, int height, int is_color);
+void write_frame_cv(write_cv *output_video_writer, mat_cv *mat);
+void release_video_writer(write_cv **output_video_writer);
+
+
+//void *open_video_stream(const char *f, int c, int w, int h, int fps);
+//image get_image_from_stream(void *p);
+//image load_image_cv(char *filename, int channels);
+//int show_image_cv(image im, const char* name, int ms);
+
+// Video Capture
+cap_cv* get_capture_video_stream(const char *path);
+cap_cv* get_capture_webcam(int index);
+void release_capture(cap_cv* cap);
+
+mat_cv* get_capture_frame_cv(cap_cv *cap);
+int get_stream_fps_cpp_cv(cap_cv *cap);
+double get_capture_property_cv(cap_cv *cap, int property_id);
+double get_capture_frame_count_cv(cap_cv *cap);
+int set_capture_property_cv(cap_cv *cap, int property_id, double value);
+int set_capture_position_frame_cv(cap_cv *cap, int index);
+
+// ... Video Capture
+image get_image_from_stream_cpp(cap_cv *cap);
+image get_image_from_stream_resize(cap_cv *cap, int w, int h, int c, mat_cv** in_img, int dont_close);
+image get_image_from_stream_letterbox(cap_cv *cap, int w, int h, int c, mat_cv** in_img, int dont_close);
+void consume_frame(cap_cv *cap);
+
+// Image Saving
+void save_cv_png(mat_cv *img, const char *name);
+void save_cv_jpg(mat_cv *img, const char *name);
+
+// Draw Detection
+void draw_detections_cv_v3(mat_cv* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output);
+
+// Draw Loss & Accuracy chart
+mat_cv* draw_train_chart(char *windows_name, float max_img_loss, int max_batches, int number_of_lines, int img_size, int dont_show, char* chart_path);
+void draw_train_loss(char *windows_name, mat_cv* img, int img_size, float avg_loss, float max_img_loss, int current_batch, int max_batches,
+    float precision, int draw_precision, char *accuracy_name, float contr_acc, int dont_show, int mjpeg_port, double time_remaining);
+
+// Data augmentation
+image image_data_augmentation(mat_cv* mat, int w, int h,
+    int pleft, int ptop, int swidth, int sheight, int flip,
+    float dhue, float dsat, float dexp,
+    int gaussian_noise, int blur, int num_boxes, int truth_size, float *truth);
+
+// blend two images with (alpha and beta)
+void blend_images_cv(image new_img, float alpha, image old_img, float beta);
+
+// Gaussian blurring with a ksize x ksize kernel (implemented with cv::GaussianBlur, not bilateralFilter)
+image blur_image(image src_img, int ksize);
+
+// draw objects for Adversarial attacks
+void cv_draw_object(image sized, float *truth_cpu, int max_boxes, int num_truth, int *it_num_set, float *lr_set, int *boxonly, int classes, char **names);
+
+// Show Anchors
+void show_acnhors(int number_of_boxes, int num_of_clusters, float *rel_width_height_array, model anchors_data, int width, int height);
+
+void show_opencv_info();
+
+#else   // OPENCV
+
+void show_opencv_info();
+int wait_key_cv(int delay);
+int wait_until_press_key_cv();
+void destroy_all_windows_cv();
+void resize_window_cv(char const* window_name, int width, int height);
+
+#endif  // OPENCV
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // IMAGE_OPENCV_H
diff --git a/darknet-master/src/layer.c b/darknet-master/src/layer.c
new file mode 100644
index 0000000..032a24e
--- /dev/null
+++ b/darknet-master/src/layer.c
@@ -0,0 +1,268 @@
+#include "layer.h"
+#include "dark_cuda.h"
+#include <stdlib.h>
+
+void free_sublayer(layer *l)
+{
+    /* Release a separately allocated sub-layer: its contents first, then the struct. */
+    if (l == NULL) return;
+    free_layer(*l);
+    free(l);
+}
+
+void free_layer(layer l)   // convenience wrapper around free_layer_custom()
+{
+    free_layer_custom(l, 0);    // keep_cudnn_desc = 0: the cuDNN descriptors are destroyed as well
+}
+
+void free_layer_custom(layer l, int keep_cudnn_desc)   // release the CPU/GPU buffers and sub-layers referenced by l; a non-zero keep_cudnn_desc leaves the cuDNN descriptors alive
+{
+    if (l.share_layer != NULL) return;    // don't free shared layers
+    if (l.antialiasing) {
+        free_sublayer(l.input_layer);
+    }
+    if (l.type == CONV_LSTM) {
+        if (l.peephole) {
+            free_sublayer(l.vf);
+            free_sublayer(l.vi);
+            free_sublayer(l.vo);
+        }
+        else {
+            free(l.vf);
+            free(l.vi);
+            free(l.vo);
+        }
+        free_sublayer(l.wf);
+        if (!l.bottleneck) {
+            free_sublayer(l.wi);
+            free_sublayer(l.wg);
+            free_sublayer(l.wo);
+        }
+        free_sublayer(l.uf);
+        free_sublayer(l.ui);
+        free_sublayer(l.ug);
+        free_sublayer(l.uo);
+    }
+    if (l.type == CRNN) {
+        free_sublayer(l.input_layer);
+        free_sublayer(l.self_layer);
+        free_sublayer(l.output_layer);
+        l.output = NULL;    // cleared so the generic free of output/delta below is skipped (presumably they alias sub-layer buffers)
+        l.delta = NULL;
+#ifdef GPU
+        l.output_gpu = NULL;
+        l.delta_gpu = NULL;
+#endif // GPU
+    }
+    if (l.type == DROPOUT) {
+        if (l.rand)           free(l.rand);
+#ifdef GPU
+        if (l.rand_gpu)              cuda_free(l.rand_gpu);
+        if (l.drop_blocks_scale)     cuda_free_host(l.drop_blocks_scale);
+        if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
+#endif
+        return;
+    }
+    if (l.mask)               free(l.mask);
+    if (l.classes_multipliers)free(l.classes_multipliers);
+    if (l.cweights)           free(l.cweights);
+    if (l.indexes)            free(l.indexes);
+    if (l.input_layers)       free(l.input_layers);
+    if (l.input_sizes)        free(l.input_sizes);
+    if (l.layers_output)      free(l.layers_output);
+    if (l.layers_delta)       free(l.layers_delta);
+    if (l.map)                free(l.map);
+    if (l.rand)               free(l.rand);
+    if (l.cost)               free(l.cost);
+    if (l.labels && !l.detection) free(l.labels);
+    if (l.class_ids && !l.detection) free(l.class_ids);
+    if (l.cos_sim)            free(l.cos_sim);
+    if (l.exp_cos_sim)        free(l.exp_cos_sim);
+    if (l.p_constrastive)     free(l.p_constrastive);
+    if (l.embedding_output)   free(l.embedding_output);
+    if (l.state)              free(l.state);
+    if (l.prev_state)         free(l.prev_state);
+    if (l.forgot_state)       free(l.forgot_state);
+    if (l.forgot_delta)       free(l.forgot_delta);
+    if (l.state_delta)        free(l.state_delta);
+    if (l.concat)             free(l.concat);
+    if (l.concat_delta)       free(l.concat_delta);
+    if (l.binary_weights)     free(l.binary_weights);
+    if (l.biases)             free(l.biases), l.biases = NULL;
+    if (l.bias_updates)       free(l.bias_updates), l.bias_updates = NULL;
+    if (l.scales)             free(l.scales), l.scales = NULL;
+    if (l.scale_updates)      free(l.scale_updates), l.scale_updates = NULL;
+    if (l.biases_ema)         free(l.biases_ema), l.biases_ema = NULL;
+    if (l.scales_ema)         free(l.scales_ema), l.scales_ema = NULL;
+    if (l.weights_ema)        free(l.weights_ema), l.weights_ema = NULL;   // reset the EMA pointer itself: clearing l.weights here would make the next line skip free(l.weights)
+    if (l.weights)            free(l.weights), l.weights = NULL;
+    if (l.weight_updates)     free(l.weight_updates), l.weight_updates = NULL;
+    if (l.align_bit_weights)  free(l.align_bit_weights);
+    if (l.mean_arr)           free(l.mean_arr);
+#ifdef GPU
+    if (l.delta && l.delta_pinned) {    // pinned host buffers go through cudaFreeHost; NULLing them skips the plain free() below
+        cudaFreeHost(l.delta);
+        l.delta = NULL;
+    }
+    if (l.output && l.output_pinned) {
+        cudaFreeHost(l.output);
+        l.output = NULL;
+    }
+#endif  // GPU
+    if (l.delta)              free(l.delta), l.delta = NULL;
+    if (l.output)             free(l.output), l.output = NULL;
+    if (l.activation_input)   free(l.activation_input), l.activation_input = NULL;
+    if (l.squared)            free(l.squared);
+    if (l.norms)              free(l.norms);
+    if (l.spatial_mean)       free(l.spatial_mean);
+    if (l.mean)               free(l.mean), l.mean = NULL;
+    if (l.variance)           free(l.variance), l.variance = NULL;
+    if (l.mean_delta)         free(l.mean_delta), l.mean_delta = NULL;
+    if (l.variance_delta)     free(l.variance_delta), l.variance_delta = NULL;
+    if (l.rolling_mean)       free(l.rolling_mean), l.rolling_mean = NULL;
+    if (l.rolling_variance)   free(l.rolling_variance), l.rolling_variance = NULL;
+    if (l.x)                  free(l.x);
+    if (l.x_norm)             free(l.x_norm);
+    if (l.m)                  free(l.m);
+    if (l.v)                  free(l.v);
+    if (l.z_cpu)              free(l.z_cpu);
+    if (l.r_cpu)              free(l.r_cpu);
+    if (l.binary_input)       free(l.binary_input);
+    if (l.bin_re_packed_input) free(l.bin_re_packed_input);
+    if (l.t_bit_input)        free(l.t_bit_input);
+    if (l.loss)               free(l.loss);
+
+    // CONV-LSTM
+    if (l.f_cpu)               free(l.f_cpu);
+    if (l.i_cpu)               free(l.i_cpu);
+    if (l.g_cpu)               free(l.g_cpu);
+    if (l.o_cpu)               free(l.o_cpu);
+    if (l.c_cpu)               free(l.c_cpu);
+    if (l.h_cpu)               free(l.h_cpu);
+    if (l.temp_cpu)            free(l.temp_cpu);
+    if (l.temp2_cpu)           free(l.temp2_cpu);
+    if (l.temp3_cpu)           free(l.temp3_cpu);
+    if (l.dc_cpu)              free(l.dc_cpu);
+    if (l.dh_cpu)              free(l.dh_cpu);
+    if (l.prev_state_cpu)      free(l.prev_state_cpu);
+    if (l.prev_cell_cpu)       free(l.prev_cell_cpu);
+    if (l.stored_c_cpu)        free(l.stored_c_cpu);
+    if (l.stored_h_cpu)        free(l.stored_h_cpu);
+    if (l.cell_cpu)            free(l.cell_cpu);
+
+#ifdef GPU
+    if (l.indexes_gpu)           cuda_free((float *)l.indexes_gpu);
+
+    if (l.contrast_p_gpu)          cuda_free((float *)l.contrast_p_gpu);
+    if (l.z_gpu)                   cuda_free(l.z_gpu);
+    if (l.r_gpu)                   cuda_free(l.r_gpu);
+    if (l.m_gpu)                   cuda_free(l.m_gpu);
+    if (l.v_gpu)                   cuda_free(l.v_gpu);
+    if (l.forgot_state_gpu)        cuda_free(l.forgot_state_gpu);
+    if (l.forgot_delta_gpu)        cuda_free(l.forgot_delta_gpu);
+    if (l.state_gpu)               cuda_free(l.state_gpu);
+    if (l.state_delta_gpu)         cuda_free(l.state_delta_gpu);
+    if (l.gate_gpu)                cuda_free(l.gate_gpu);
+    if (l.gate_delta_gpu)          cuda_free(l.gate_delta_gpu);
+    if (l.save_gpu)                cuda_free(l.save_gpu);
+    if (l.save_delta_gpu)          cuda_free(l.save_delta_gpu);
+    if (l.concat_gpu)              cuda_free(l.concat_gpu);
+    if (l.concat_delta_gpu)        cuda_free(l.concat_delta_gpu);
+    if (l.binary_input_gpu)        cuda_free(l.binary_input_gpu);
+    if (l.binary_weights_gpu)      cuda_free(l.binary_weights_gpu);
+    if (l.mean_gpu)                cuda_free(l.mean_gpu), l.mean_gpu = NULL;
+    if (l.variance_gpu)            cuda_free(l.variance_gpu), l.variance_gpu = NULL;
+    if (l.m_cbn_avg_gpu)           cuda_free(l.m_cbn_avg_gpu), l.m_cbn_avg_gpu = NULL;
+    if (l.v_cbn_avg_gpu)           cuda_free(l.v_cbn_avg_gpu), l.v_cbn_avg_gpu = NULL;
+    if (l.rolling_mean_gpu)        cuda_free(l.rolling_mean_gpu), l.rolling_mean_gpu = NULL;
+    if (l.rolling_variance_gpu)    cuda_free(l.rolling_variance_gpu), l.rolling_variance_gpu = NULL;
+    if (l.variance_delta_gpu)      cuda_free(l.variance_delta_gpu), l.variance_delta_gpu = NULL;
+    if (l.mean_delta_gpu)          cuda_free(l.mean_delta_gpu), l.mean_delta_gpu = NULL;
+    if (l.x_norm_gpu)              cuda_free(l.x_norm_gpu);
+
+    // assisted excitation
+    if (l.gt_gpu)                  cuda_free(l.gt_gpu);
+    if (l.a_avg_gpu)               cuda_free(l.a_avg_gpu);
+
+    if (l.align_bit_weights_gpu)   cuda_free((float *)l.align_bit_weights_gpu);
+    if (l.mean_arr_gpu)            cuda_free(l.mean_arr_gpu);
+    if (l.align_workspace_gpu)     cuda_free(l.align_workspace_gpu);
+    if (l.transposed_align_workspace_gpu) cuda_free(l.transposed_align_workspace_gpu);
+
+    if (l.weights_gpu)             cuda_free(l.weights_gpu), l.weights_gpu = NULL;
+    if (l.weight_updates_gpu)      cuda_free(l.weight_updates_gpu), l.weight_updates_gpu = NULL;
+    if (l.weight_deform_gpu)       cuda_free(l.weight_deform_gpu), l.weight_deform_gpu = NULL;
+    if (l.weights_gpu16)           cuda_free(l.weights_gpu16), l.weights_gpu16 = NULL;
+    if (l.weight_updates_gpu16)    cuda_free(l.weight_updates_gpu16), l.weight_updates_gpu16 = NULL;
+    if (l.biases_gpu)              cuda_free(l.biases_gpu), l.biases_gpu = NULL;
+    if (l.bias_updates_gpu)        cuda_free(l.bias_updates_gpu), l.bias_updates_gpu = NULL;
+    if (l.scales_gpu)              cuda_free(l.scales_gpu), l.scales_gpu = NULL;
+    if (l.scale_updates_gpu)       cuda_free(l.scale_updates_gpu), l.scale_updates_gpu = NULL;
+    if (l.input_antialiasing_gpu)  cuda_free(l.input_antialiasing_gpu), l.input_antialiasing_gpu = NULL;
+    if (l.optimized_memory < 2) {
+        if (l.x_gpu)                   cuda_free(l.x_gpu),  l.x_gpu = NULL;
+        if (l.output_gpu)              cuda_free(l.output_gpu), l.output_gpu = NULL;
+        if (l.output_avg_gpu)          cuda_free(l.output_avg_gpu), l.output_avg_gpu = NULL;
+        if (l.activation_input_gpu)    cuda_free(l.activation_input_gpu), l.activation_input_gpu = NULL;
+    }
+    if (l.delta_gpu && (l.optimized_memory < 1 || (l.keep_delta_gpu && l.optimized_memory < 3))) cuda_free(l.delta_gpu), l.delta_gpu = NULL;
+    if (l.cos_sim_gpu)             cuda_free(l.cos_sim_gpu);
+    if (l.rand_gpu)                cuda_free(l.rand_gpu);
+    if (l.squared_gpu)             cuda_free(l.squared_gpu);
+    if (l.norms_gpu)               cuda_free(l.norms_gpu);
+    if (l.input_sizes_gpu)         cuda_free((float*)l.input_sizes_gpu);
+    if (l.layers_output_gpu)       cuda_free((float*)l.layers_output_gpu);
+    if (l.layers_delta_gpu)        cuda_free((float*)l.layers_delta_gpu);
+
+    // CONV-LSTM
+    if (l.f_gpu)                   cuda_free(l.f_gpu);
+    if (l.i_gpu)                   cuda_free(l.i_gpu);
+    if (l.g_gpu)                   cuda_free(l.g_gpu);
+    if (l.o_gpu)                   cuda_free(l.o_gpu);
+    if (l.c_gpu)                   cuda_free(l.c_gpu);
+    if (l.h_gpu)                   cuda_free(l.h_gpu);
+    if (l.bottelneck_hi_gpu)       cuda_free(l.bottelneck_hi_gpu);
+    if (l.bottelneck_delta_gpu)    cuda_free(l.bottelneck_delta_gpu);
+    if (l.temp_gpu)                cuda_free(l.temp_gpu);
+    if (l.temp2_gpu)               cuda_free(l.temp2_gpu);
+    if (l.temp3_gpu)               cuda_free(l.temp3_gpu);
+    if (l.dc_gpu)                  cuda_free(l.dc_gpu);
+    if (l.dh_gpu)                  cuda_free(l.dh_gpu);
+    if (l.prev_state_gpu)          cuda_free(l.prev_state_gpu);
+    if (l.prev_cell_gpu)           cuda_free(l.prev_cell_gpu);
+    if (l.stored_c_gpu)            cuda_free(l.stored_c_gpu);
+    if (l.stored_h_gpu)            cuda_free(l.stored_h_gpu);
+    if (l.last_prev_state_gpu)     cuda_free(l.last_prev_state_gpu);
+    if (l.last_prev_cell_gpu)      cuda_free(l.last_prev_cell_gpu);
+    if (l.cell_gpu)                cuda_free(l.cell_gpu);
+#ifdef CUDNN   // shouldn't be used for -map
+    if (!keep_cudnn_desc) {
+        if (l.srcTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.srcTensorDesc));
+        if (l.dstTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.dstTensorDesc));
+        if (l.srcTensorDesc16) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.srcTensorDesc16));
+        if (l.dstTensorDesc16) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.dstTensorDesc16));
+        if (l.dsrcTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.dsrcTensorDesc));
+        if (l.ddstTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.ddstTensorDesc));
+        if (l.dsrcTensorDesc16) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.dsrcTensorDesc16));
+        if (l.ddstTensorDesc16) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.ddstTensorDesc16));
+        if (l.normTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.normTensorDesc));
+        if (l.normDstTensorDesc) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.normDstTensorDesc));
+        if (l.normDstTensorDescF16) CHECK_CUDNN(cudnnDestroyTensorDescriptor(l.normDstTensorDescF16));
+
+        if (l.weightDesc) CHECK_CUDNN(cudnnDestroyFilterDescriptor(l.weightDesc));
+        if (l.weightDesc16) CHECK_CUDNN(cudnnDestroyFilterDescriptor(l.weightDesc16));
+        if (l.dweightDesc) CHECK_CUDNN(cudnnDestroyFilterDescriptor(l.dweightDesc));
+        if (l.dweightDesc16) CHECK_CUDNN(cudnnDestroyFilterDescriptor(l.dweightDesc16));
+
+        if (l.convDesc) CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(l.convDesc));
+
+        if (l.poolingDesc) CHECK_CUDNN(cudnnDestroyPoolingDescriptor(l.poolingDesc));
+
+        //cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16;
+        //cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16;
+        //cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16;
+    }
+#endif  // CUDNN
+
+#endif  // GPU
+}
diff --git a/darknet-master/src/layer.h b/darknet-master/src/layer.h
new file mode 100644
index 0000000..5b3d228
--- /dev/null
+++ b/darknet-master/src/layer.h
@@ -0,0 +1,338 @@
+#ifndef BASE_LAYER_H
+#define BASE_LAYER_H
+
+#include "activations.h"
+#include "stddef.h"
+#include "tree.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//struct network_state;
+
+//struct layer;
+//typedef struct layer layer;
+
+//typedef enum {
+//    CONVOLUTIONAL,
+//    DECONVOLUTIONAL,
+//    CONNECTED,
+//    MAXPOOL,
+//    SOFTMAX,
+//    DETECTION,
+//    DROPOUT,
+//    CROP,
+//    ROUTE,
+//    COST,
+//    NORMALIZATION,
+//    AVGPOOL,
+//    LOCAL,
+//    SHORTCUT,
+//    ACTIVE,
+//    RNN,
+//    GRU,
+//    CRNN,
+//    BATCHNORM,
+//    NETWORK,
+//    XNOR,
+//    REGION,
+//    YOLO,
+//    REORG,
+//    UPSAMPLE,
+//    REORG_OLD,
+//    BLANK
+//} LAYER_TYPE;
+
+//typedef enum{
+//    SSE, MASKED, SMOOTH
+//} COST_TYPE;
+
+//typedef struct {
+//    int batch;
+//    float learning_rate;
+//    float momentum;
+//    float decay;
+//    int adam;
+//    float B1;
+//    float B2;
+//    float eps;
+//    int t;
+//} update_args;
+
+/*
+struct layer{
+    LAYER_TYPE type;
+    ACTIVATION activation;
+    COST_TYPE cost_type;
+    void (*forward)   (struct layer, struct network_state);
+    void (*backward)  (struct layer, struct network_state);
+    void (*update)    (struct layer, int, float, float, float);
+    void (*forward_gpu)   (struct layer, struct network_state);
+    void (*backward_gpu)  (struct layer, struct network_state);
+    void (*update_gpu)    (struct layer, int, float, float, float);
+    int batch_normalize;
+    int shortcut;
+    int batch;
+    int forced;
+    int flipped;
+    int inputs;
+    int outputs;
+    int truths;
+    int h,w,c;
+    int out_h, out_w, out_c;
+    int n;
+    int max_boxes;
+    int groups;
+    int size;
+    int side;
+    int stride;
+    int reverse;
+    int spatial;
+    int pad;
+    int sqrt;
+    int flip;
+    int index;
+    int binary;
+    int xnor;
+    int use_bin_output;
+    int steps;
+    int hidden;
+    float dot;
+    float angle;
+    float jitter;
+    float saturation;
+    float exposure;
+    float shift;
+    float ratio;
+    float learning_rate_scale;
+    int focal_loss;
+    int noloss;
+    int softmax;
+    int classes;
+    int coords;
+    int background;
+    int rescore;
+    int objectness;
+    int does_cost;
+    int joint;
+    int noadjust;
+    int reorg;
+    int log;
+    int tanh;
+    int *mask;
+    int total;
+    float bflops;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int t;
+    float *m;
+    float *v;
+    float * bias_m;
+    float * bias_v;
+    float * scale_m;
+    float * scale_v;
+
+    tree *softmax_tree;
+    int  *map;
+
+    float alpha;
+    float beta;
+    float kappa;
+
+    float coord_scale;
+    float object_scale;
+    float noobject_scale;
+    float mask_scale;
+    float class_scale;
+    int bias_match;
+    int random;
+    float ignore_thresh;
+    float truth_thresh;
+    float thresh;
+    float focus;
+    int classfix;
+    int absolute;
+
+    int onlyforward;
+    int stopbackward;
+    int dontload;
+    int dontloadscales;
+
+    float temperature;
+    float probability;
+    float scale;
+
+    int *indexes;
+    float *rand;
+    float *cost;
+    char  *cweights;
+    float *state;
+    float *prev_state;
+    float *forgot_state;
+    float *forgot_delta;
+    float *state_delta;
+
+    float *concat;
+    float *concat_delta;
+
+    float *binary_weights;
+
+    float *biases;
+    float *bias_updates;
+
+    float *scales;
+    float *scale_updates;
+
+    float *weights;
+    float *weight_updates;
+
+    char *align_bit_weights_gpu;
+    float *mean_arr_gpu;
+    float *align_workspace_gpu;
+    float *transposed_align_workspace_gpu;
+    int align_workspace_size;
+
+    char *align_bit_weights;
+    float *mean_arr;
+    int align_bit_weights_size;
+    int lda_align;
+    int new_lda;
+    int bit_align;
+
+    float *col_image;
+    int   * input_layers;
+    int   * input_sizes;
+    float * delta;
+    float * output;
+    float * loss;
+    float * squared;
+    float * norms;
+
+    float * spatial_mean;
+    float * mean;
+    float * variance;
+
+    float * mean_delta;
+    float * variance_delta;
+
+    float * rolling_mean;
+    float * rolling_variance;
+
+    float * x;
+    float * x_norm;
+
+    struct layer *input_layer;
+    struct layer *self_layer;
+    struct layer *output_layer;
+
+    struct layer *input_gate_layer;
+    struct layer *state_gate_layer;
+    struct layer *input_save_layer;
+    struct layer *state_save_layer;
+    struct layer *input_state_layer;
+    struct layer *state_state_layer;
+
+    struct layer *input_z_layer;
+    struct layer *state_z_layer;
+
+    struct layer *input_r_layer;
+    struct layer *state_r_layer;
+
+    struct layer *input_h_layer;
+    struct layer *state_h_layer;
+
+    float *z_cpu;
+    float *r_cpu;
+    float *h_cpu;
+
+    float *binary_input;
+
+    size_t workspace_size;
+
+#ifdef GPU
+    float *z_gpu;
+    float *r_gpu;
+    float *h_gpu;
+
+    int *indexes_gpu;
+    float * prev_state_gpu;
+    float * forgot_state_gpu;
+    float * forgot_delta_gpu;
+    float * state_gpu;
+    float * state_delta_gpu;
+    float * gate_gpu;
+    float * gate_delta_gpu;
+    float * save_gpu;
+    float * save_delta_gpu;
+    float * concat_gpu;
+    float * concat_delta_gpu;
+
+    // adam
+    float *m_gpu;
+    float *v_gpu;
+    float *bias_m_gpu;
+    float *scale_m_gpu;
+    float *bias_v_gpu;
+    float *scale_v_gpu;
+
+    float *binary_input_gpu;
+    float *binary_weights_gpu;
+
+    float * mean_gpu;
+    float * variance_gpu;
+
+    float * rolling_mean_gpu;
+    float * rolling_variance_gpu;
+
+    float * variance_delta_gpu;
+    float * mean_delta_gpu;
+
+    float * col_image_gpu;
+
+    float * x_gpu;
+    float * x_norm_gpu;
+    float * weights_gpu;
+    float * weight_updates_gpu;
+
+    float * weights_gpu16;
+    float * weight_updates_gpu16;
+
+    float * biases_gpu;
+    float * bias_updates_gpu;
+
+    float * scales_gpu;
+    float * scale_updates_gpu;
+
+    float * output_gpu;
+    float * loss_gpu;
+    float * delta_gpu;
+    float * rand_gpu;
+    float * squared_gpu;
+    float * norms_gpu;
+    #ifdef CUDNN
+    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
+    cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16;
+    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
+    cudnnTensorDescriptor_t dsrcTensorDesc16, ddstTensorDesc16;
+    cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc, normDstTensorDescF16;
+    cudnnFilterDescriptor_t weightDesc, weightDesc16;
+    cudnnFilterDescriptor_t dweightDesc, dweightDesc16;
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16;
+    cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16;
+    cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16;
+    cudnnPoolingDescriptor_t poolingDesc;
+    #endif  // CUDNN
+#endif  // GPU
+};
+*/
+//void free_layer(layer);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/list.c b/darknet-master/src/list.c
new file mode 100644
index 0000000..dbc70c3
--- /dev/null
+++ b/darknet-master/src/list.c
@@ -0,0 +1,116 @@
+#include <stdlib.h>
+#include <string.h>
+#include "list.h"
+#include "utils.h"
+#include "option_list.h"
+
+list *make_list()
+{
+    /* Allocate an empty list: no nodes, size zero. */
+    list *fresh = (list *)xmalloc(sizeof(list));
+    fresh->front = fresh->back = NULL;
+    fresh->size = 0;
+    return fresh;
+}
+
+/*
+void transfer_node(list *s, list *d, node *n)
+{
+    node *prev, *next;
+    prev = n->prev;
+    next = n->next;
+    if(prev) prev->next = next;
+    if(next) next->prev = prev;
+    --s->size;
+    if(s->front == n) s->front = next;
+    if(s->back == n) s->back = prev;
+}
+*/
+
+void *list_pop(list *l){   // detach the last node and return its value; returns 0 when the list is empty
+    if(!l->back) return 0;
+    node *b = l->back;
+    void *val = b->val;
+    l->back = b->prev;
+    if(l->back) l->back->next = 0;
+    else l->front = 0;      // the only node was popped: front must not keep pointing at the freed node
+    free(b);                // only the node is freed; ownership of val passes to the caller
+    --l->size;
+    return val;
+}
+
+void list_insert(list *l, void *val)
+{
+    /* Append val as a new node at the back of the list. */
+    node *tail = l->back;
+    node *added = (node *)xmalloc(sizeof(node));
+    added->val = val;
+    added->next = NULL;
+    added->prev = tail;         /* NULL when the list was empty */
+    if (tail) {
+        tail->next = added;
+    } else {
+        l->front = added;       /* first node is both front and back */
+    }
+    l->back = added;
+    ++l->size;
+}
+
+void free_node(node *n)
+{
+    /* Free n and every node linked after it; the stored values are not touched. */
+    node *cur, *after;
+    for (cur = n; cur != NULL; cur = after) {
+        after = cur->next;      /* read the link before the node is released */
+        free(cur);
+    }
+}
+
+void free_list_val(list *l)
+{
+    /*
+     * Free the value held by each node. The nodes and the list itself are
+     * kept, and the val pointers are not reset afterwards.
+     */
+    node *cur;
+    for (cur = l->front; cur != NULL; cur = cur->next)
+        free(cur->val);
+}
+
+void free_list(list *l)    // destroy the list structure; the values stored in the nodes are not freed here
+{
+    free_node(l->front);    // releases the whole node chain starting at the front
+    free(l);
+}
+
+void free_list_contents(list *l)
+{
+    /* Release every stored value; the node structures stay allocated. */
+    node *cur;
+    for (cur = l->front; cur; cur = cur->next) {
+        free(cur->val);
+    }
+}
+
+void free_list_contents_kvp(list *l)
+{
+    /* Each value is a kvp: free its key, then the kvp itself. Nodes are kept. */
+    node *cur;
+    for (cur = l->front; cur != NULL; cur = cur->next) {
+        kvp *entry = (kvp *)cur->val;
+        free(entry->key);
+        free(entry);
+    }
+}
+
+void **list_to_array(list *l)
+{
+    /* Copy the value pointers, front to back, into a new array of l->size slots. */
+    void **out = (void **)xcalloc(l->size, sizeof(void *));
+    void **slot = out;
+    node *cur;
+    for (cur = l->front; cur != NULL; cur = cur->next) {
+        *slot++ = cur->val;
+    }
+    return out;
+}
diff --git a/darknet-master/src/list.h b/darknet-master/src/list.h
new file mode 100644
index 0000000..182648f
--- /dev/null
+++ b/darknet-master/src/list.h
@@ -0,0 +1,34 @@
+#ifndef LIST_H
+#define LIST_H
+
+typedef struct node{       // one element of the doubly linked list
+    void *val;             // payload pointer stored by list_insert()
+    struct node *next;     // following node, 0 at the back of the list
+    struct node *prev;     // preceding node, 0 at the front of the list
+} node;
+
+typedef struct list{       // doubly linked list header
+    int size;              // number of nodes currently linked
+    node *front;           // first node, 0 when the list is empty
+    node *back;            // last node, 0 when the list is empty
+} list;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+list *make_list();
+int list_find(list *l, void *val);
+
+void list_insert(list *, void *);
+
+void **list_to_array(list *l);
+
+void free_list_val(list *l);
+void free_list(list *l);
+void free_list_contents(list *l);
+void free_list_contents_kvp(list *l);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/local_layer.c b/darknet-master/src/local_layer.c
new file mode 100644
index 0000000..88c7b12
--- /dev/null
+++ b/darknet-master/src/local_layer.c
@@ -0,0 +1,283 @@
+#include "local_layer.h"
+#include "utils.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "blas.h"
+#include "gemm.h"
+#include <stdio.h>
+#include <time.h>
+
+int local_out_height(local_layer l)
+{
+    /* Output rows: (h - size)/stride + 1 without padding,
+       (h - 1)/stride + 1 when padding is enabled. */
+    int shrink = l.pad ? 1 : l.size;
+    return (l.h - shrink)/l.stride + 1;
+}
+
+int local_out_width(local_layer l)
+{
+    /* Output columns: (w - size)/stride + 1 without padding,
+       (w - 1)/stride + 1 when padding is enabled. */
+    int shrink = l.pad ? 1 : l.size;
+    return (l.w - shrink)/l.stride + 1;
+}
+
+local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)   // build a LOCAL layer: n filters of size x size with a separate weight set per output location
+{
+    int i;
+    local_layer l = { (LAYER_TYPE)0 };
+    l.type = LOCAL;
+
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.n = n;
+    l.batch = batch;
+    l.stride = stride;
+    l.size = size;
+    l.pad = pad;
+
+    int out_h = local_out_height(l);
+    int out_w = local_out_width(l);
+    int locations = out_h*out_w;    // number of output positions, each with its own weights
+    l.out_h = out_h;
+    l.out_w = out_w;
+    l.out_c = n;
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = l.w * l.h * l.c;
+
+    l.weights = (float*)xcalloc(c * n * size * size * locations, sizeof(float));
+    l.weight_updates = (float*)xcalloc(c * n * size * size * locations, sizeof(float));
+
+    l.biases = (float*)xcalloc(l.outputs, sizeof(float));
+    l.bias_updates = (float*)xcalloc(l.outputs, sizeof(float));
+
+    // scale = sqrt(2 / fan-in); initialise the weights of every location, not only the first one
+    float scale = sqrt(2./(size*size*c));
+    for(i = 0; i < c*n*size*size*locations; ++i) l.weights[i] = scale*rand_uniform(-1,1);
+
+    l.col_image = (float*)xcalloc(out_h * out_w * size * size * c, sizeof(float));
+    l.output = (float*)xcalloc(l.batch * out_h * out_w * n, sizeof(float));
+    l.delta = (float*)xcalloc(l.batch * out_h * out_w * n, sizeof(float));
+
+    l.forward = forward_local_layer;
+    l.backward = backward_local_layer;
+    l.update = update_local_layer;
+
+#ifdef GPU
+    l.forward_gpu = forward_local_layer_gpu;
+    l.backward_gpu = backward_local_layer_gpu;
+    l.update_gpu = update_local_layer_gpu;
+
+    l.weights_gpu = cuda_make_array(l.weights, c*n*size*size*locations);
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size*locations);
+
+    l.biases_gpu = cuda_make_array(l.biases, l.outputs);
+    l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
+
+    l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
+    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
+    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+
+#endif
+    l.activation = activation;
+
+    fprintf(stderr, "Local Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+
+    return l;
+}
+
+void forward_local_layer(const local_layer l, network_state state)   // CPU forward pass: bias + per-location weights applied to state.input, then the activation
+{
+    int out_h = local_out_height(l);
+    int out_w = local_out_width(l);
+    int i, j;
+    int locations = out_h * out_w;
+
+    for(i = 0; i < l.batch; ++i){
+        copy_cpu(l.outputs, l.biases, 1, l.output + i*l.outputs, 1);   // seed each sample's output with the biases (assuming copy_cpu(n, src, 1, dst, 1) copies src to dst)
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *input = state.input + i*l.w*l.h*l.c;    // i-th sample of the batch
+        im2col_cpu(input, l.c, l.h, l.w,
+                l.size, l.stride, l.pad, l.col_image);
+        float *output = l.output + i*l.outputs;
+        for(j = 0; j < locations; ++j){
+            float *a = l.weights + j*l.size*l.size*l.c*l.n;   // weight block of location j
+            float *b = l.col_image + j;                       // entries for location j, read with stride `locations`
+            float *c = output + j;                            // outputs for location j, written with stride `locations`
+
+            int m = l.n;
+            int n = 1;
+            int k = l.size*l.size*l.c;
+
+            gemm(0,0,m,n,k,1,a,k,b,locations,1,c,locations);   // presumably c += a*b with gemm(TA,TB,M,N,K,ALPHA,A,lda,B,ldb,BETA,C,ldc) - confirm against gemm.h
+        }
+    }
+    activate_array(l.output, l.outputs*l.batch, l.activation);
+}
+
+void backward_local_layer(local_layer l, network_state state)   // CPU backward pass: accumulates bias/weight updates and, when state.delta is set, the input gradient
+{
+    int i, j;
+    int locations = l.out_w*l.out_h;
+
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+
+    for(i = 0; i < l.batch; ++i){
+        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);   // presumably bias_updates += delta of sample i (BLAS-style axpy)
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *input = state.input + i*l.w*l.h*l.c;
+        im2col_cpu(input, l.c, l.h, l.w,
+                l.size, l.stride, l.pad, l.col_image);
+
+        for(j = 0; j < locations; ++j){
+            float *a = l.delta + i*l.outputs + j;                  // deltas of location j, stride `locations`
+            float *b = l.col_image + j;                            // input entries of location j, stride `locations`
+            float *c = l.weight_updates + j*l.size*l.size*l.c*l.n; // weight-update block of location j
+            int m = l.n;
+            int n = l.size*l.size*l.c;
+            int k = 1;
+
+            gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);   // presumably c += a * b^T (second flag = transpose B) - confirm against gemm.h
+        }
+
+        if(state.delta){   // the input gradient is only computed when a delta buffer is supplied
+            for(j = 0; j < locations; ++j){
+                float *a = l.weights + j*l.size*l.size*l.c*l.n;
+                float *b = l.delta + i*l.outputs + j;
+                float *c = l.col_image + j;   // col_image is reused as scratch space for the column-form gradient
+
+                int m = l.size*l.size*l.c;
+                int n = 1;
+                int k = l.n;
+
+                gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);   // BETA argument is 0 here (1 in the other gemm calls), presumably overwriting c with a^T * b
+            }
+
+            col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+        }
+    }
+}
+
+void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)   // CPU parameter update from the accumulated updates (comments assume BLAS-style axpy/scal)
+{
+    int locations = l.out_w*l.out_h;
+    int size = l.size*l.size*l.c*l.n*locations;   // total number of weights over all locations
+    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);   // biases += (learning_rate/batch) * bias_updates
+    scal_cpu(l.outputs, momentum, l.bias_updates, 1);                           // bias_updates *= momentum
+
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);            // weight_updates -= decay*batch * weights
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);     // weights += (learning_rate/batch) * weight_updates
+    scal_cpu(size, momentum, l.weight_updates, 1);                              // weight_updates *= momentum
+}
+
+#ifdef GPU
+
+void forward_local_layer_gpu(const local_layer l, network_state state)   // same steps as forward_local_layer, using the *_gpu buffers and *_ongpu routines
+{
+    int out_h = local_out_height(l);
+    int out_w = local_out_width(l);
+    int i, j;
+    int locations = out_h * out_w;
+
+    for(i = 0; i < l.batch; ++i){
+        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);   // seed each sample's output with the biases (assuming a src -> dst copy)
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *input = state.input + i*l.w*l.h*l.c;   // NOTE(review): passed to im2col_ongpu, so presumably a device pointer - confirm with the caller
+        im2col_ongpu(input, l.c, l.h, l.w,
+                l.size, l.stride, l.pad, l.col_image_gpu);
+        float *output = l.output_gpu + i*l.outputs;
+        for(j = 0; j < locations; ++j){
+            float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;   // weight block of location j
+            float *b = l.col_image_gpu + j;
+            float *c = output + j;
+
+            int m = l.n;
+            int n = 1;
+            int k = l.size*l.size*l.c;
+
+            gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);   // one gemm call per output location
+        }
+    }
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+}
+
+void backward_local_layer_gpu(local_layer l, network_state state)   // same steps as backward_local_layer, using the *_gpu buffers and *_ongpu routines
+{
+    int i, j;
+    int locations = l.out_w*l.out_h;
+
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    for(i = 0; i < l.batch; ++i){
+        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);   // presumably bias_updates_gpu += delta of sample i
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *input = state.input + i*l.w*l.h*l.c;
+        im2col_ongpu(input, l.c, l.h, l.w,
+                l.size, l.stride, l.pad, l.col_image_gpu);
+
+        for(j = 0; j < locations; ++j){
+            float *a = l.delta_gpu + i*l.outputs + j;
+            float *b = l.col_image_gpu + j;
+            float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;   // weight-update block of location j
+            int m = l.n;
+            int n = l.size*l.size*l.c;
+            int k = 1;
+
+            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
+        }
+
+        if(state.delta){   // the input gradient is only computed when a delta buffer is supplied
+            for(j = 0; j < locations; ++j){
+                float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
+                float *b = l.delta_gpu + i*l.outputs + j;
+                float *c = l.col_image_gpu + j;   // col_image_gpu is reused as scratch space for the column-form gradient
+
+                int m = l.size*l.size*l.c;
+                int n = 1;
+                int k = l.n;
+
+                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);   // BETA argument is 0 here, unlike the accumulating calls above
+            }
+
+            col2im_ongpu(l.col_image_gpu, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+        }
+    }
+}
+
+void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)   // GPU counterpart of update_local_layer; loss_scale is not used in this body
+{
+    int locations = l.out_w*l.out_h;
+    int size = l.size*l.size*l.c*l.n*locations;   // total number of weights over all locations
+    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+
+    axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);   // weight-decay term folded into the updates (assuming BLAS-style axpy)
+    axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+    scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
+}
+
+void pull_local_layer(local_layer l)   // transfers weights and biases between the *_gpu and host buffers (device -> host, going by the name cuda_pull_array)
+{
+    int locations = l.out_w*l.out_h;
+    int size = l.size*l.size*l.c*l.n*locations;   // total number of weights over all locations
+    cuda_pull_array(l.weights_gpu, l.weights, size);
+    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
+}
+
+void push_local_layer(local_layer l)   // transfers weights and biases between the host and *_gpu buffers (host -> device, going by the name cuda_push_array)
+{
+    int locations = l.out_w*l.out_h;
+    int size = l.size*l.size*l.c*l.n*locations;   // total number of weights over all locations
+    cuda_push_array(l.weights_gpu, l.weights, size);
+    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
+}
+#endif
diff --git a/darknet-master/src/local_layer.h b/darknet-master/src/local_layer.h
new file mode 100644
index 0000000..45e02a5
--- /dev/null
+++ b/darknet-master/src/local_layer.h
@@ -0,0 +1,37 @@
+/* Public interface of the locally connected ("local") layer. */
+#ifndef LOCAL_LAYER_H
+#define LOCAL_LAYER_H
+
+#include "dark_cuda.h"
+#include "image.h"
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+/* A local layer is stored in the generic `layer` struct; this is only an alias. */
+typedef layer local_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef GPU
+/* GPU entry points; only declared when the build defines GPU. */
+void forward_local_layer_gpu(local_layer layer, network_state state);
+void backward_local_layer_gpu(local_layer layer, network_state state);
+void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+
+/* Transfer weights and biases between the host and GPU buffers of the layer. */
+void push_local_layer(local_layer layer);
+void pull_local_layer(local_layer layer);
+#endif
+
+/* Constructor for a local layer with the given input shape, filter count/size, stride, padding and activation. */
+local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation);
+
+/* CPU forward / backward / parameter-update entry points. */
+void forward_local_layer(const local_layer layer, network_state state);
+void backward_local_layer(local_layer layer, network_state state);
+void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+
+/* Bias helpers declared alongside the local layer API (definitions are not in this header). */
+void bias_output(float *output, float *biases, int batch, int n, int size);
+void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/lstm_layer.c b/darknet-master/src/lstm_layer.c
new file mode 100644
index 0000000..a794556
--- /dev/null
+++ b/darknet-master/src/lstm_layer.c
@@ -0,0 +1,646 @@
+#include "lstm_layer.h"
+#include "connected_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Shift a sub-layer's per-step buffer pointers by `steps` time steps.
+ * One time step spans outputs*batch floats; a negative `steps` moves backwards.
+ * Callers in this file pass a by-value copy of the layer, so the stored layer's
+ * pointers are left untouched.
+ */
+static void increment_layer(layer *l, int steps)
+{
+    const int offset = l->outputs*l->batch*steps;
+
+    l->output = l->output + offset;
+    l->delta = l->delta + offset;
+    l->x = l->x + offset;
+    l->x_norm = l->x_norm + offset;
+
+#ifdef GPU
+    l->output_gpu = l->output_gpu + offset;
+    l->delta_gpu = l->delta_gpu + offset;
+    l->x_gpu = l->x_gpu + offset;
+    l->x_norm_gpu = l->x_norm_gpu + offset;
+#endif
+}
+
+/*
+ * Allocate one connected sub-layer ("gate") of the LSTM and raise the parent's
+ * workspace_size to the gate's requirement if it is larger.
+ * `batch` is the per-step batch (already divided by steps).
+ */
+static layer *make_lstm_gate(layer *parent, int batch, int steps, int inputs, int outputs, int batch_normalize)
+{
+    layer *gate = (layer*)xcalloc(1, sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *gate = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize);
+    gate->batch = batch;
+    if (parent->workspace_size < gate->workspace_size) parent->workspace_size = gate->workspace_size;
+    return gate;
+}
+
+/*
+ * Build an LSTM layer.
+ *
+ * batch           - total batch including time steps; divided by `steps` below
+ * inputs          - number of inputs per time step
+ * outputs         - number of hidden units / outputs per time step
+ * steps           - number of time steps unrolled per forward/backward call
+ * batch_normalize - forwarded to every connected sub-layer
+ *
+ * The layer owns eight connected sub-layers: uf/ui/ug/uo take `inputs`
+ * values (fed from the layer input) and wf/wi/wg/wo take `outputs` values
+ * (fed from the previous hidden state).
+ *
+ * Returns the initialised layer by value.
+ */
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+{
+    fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
+    batch = batch / steps;
+    layer l = { (LAYER_TYPE)0 };
+    l.batch = batch;
+    l.type = LSTM;
+    l.steps = steps;
+    l.inputs = inputs;
+    l.out_w = 1;
+    l.out_h = 1;
+    l.out_c = outputs;
+
+    /* Input-side gates first, then recurrent gates (same creation order as before). */
+    l.uf = make_lstm_gate(&l, batch, steps, inputs, outputs, batch_normalize);
+    l.ui = make_lstm_gate(&l, batch, steps, inputs, outputs, batch_normalize);
+    l.ug = make_lstm_gate(&l, batch, steps, inputs, outputs, batch_normalize);
+    l.uo = make_lstm_gate(&l, batch, steps, inputs, outputs, batch_normalize);
+
+    l.wf = make_lstm_gate(&l, batch, steps, outputs, outputs, batch_normalize);
+    l.wi = make_lstm_gate(&l, batch, steps, outputs, outputs, batch_normalize);
+    l.wg = make_lstm_gate(&l, batch, steps, outputs, outputs, batch_normalize);
+    l.wo = make_lstm_gate(&l, batch, steps, outputs, outputs, batch_normalize);
+
+    l.batch_normalize = batch_normalize;
+    l.outputs = outputs;
+
+    l.output = (float*)xcalloc(outputs * batch * steps, sizeof(float));
+    /* The CPU forward/backward passes zero, read and offset l.delta, so it
+     * must exist; previously it was left NULL and CPU training dereferenced it. */
+    l.delta = (float*)xcalloc(outputs * batch * steps, sizeof(float));
+    l.state = (float*)xcalloc(outputs * batch, sizeof(float));
+
+    l.forward = forward_lstm_layer;
+    l.update = update_lstm_layer;
+    l.backward = backward_lstm_layer;
+
+    l.prev_state_cpu =  (float*)xcalloc(batch*outputs, sizeof(float));
+    l.prev_cell_cpu =   (float*)xcalloc(batch*outputs, sizeof(float));
+    /* Cell state is kept for every time step so backward can revisit it. */
+    l.cell_cpu =        (float*)xcalloc(batch*outputs*steps, sizeof(float));
+
+    /* Single-step scratch buffers for gates, cell/hidden state and gradients. */
+    l.f_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.i_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.g_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.o_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.c_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.h_cpu =           (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp_cpu =        (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp2_cpu =       (float*)xcalloc(batch*outputs, sizeof(float));
+    l.temp3_cpu =       (float*)xcalloc(batch*outputs, sizeof(float));
+    l.dc_cpu =          (float*)xcalloc(batch*outputs, sizeof(float));
+    l.dh_cpu =          (float*)xcalloc(batch*outputs, sizeof(float));
+
+#ifdef GPU
+    l.forward_gpu = forward_lstm_layer_gpu;
+    l.backward_gpu = backward_lstm_layer_gpu;
+    l.update_gpu = update_lstm_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
+
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
+    l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
+
+    l.f_gpu = cuda_make_array(0, batch*outputs);
+    l.i_gpu = cuda_make_array(0, batch*outputs);
+    l.g_gpu = cuda_make_array(0, batch*outputs);
+    l.o_gpu = cuda_make_array(0, batch*outputs);
+    l.c_gpu = cuda_make_array(0, batch*outputs);
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+    l.temp_gpu =  cuda_make_array(0, batch*outputs);
+    l.temp2_gpu = cuda_make_array(0, batch*outputs);
+    l.temp3_gpu = cuda_make_array(0, batch*outputs);
+    l.dc_gpu = cuda_make_array(0, batch*outputs);
+    l.dh_gpu = cuda_make_array(0, batch*outputs);
+#endif
+
+    return l;
+}
+
+/*
+ * CPU parameter update: forwards the same hyper-parameters to each of the
+ * eight connected sub-layers, in the order wf, wi, wg, wo, uf, ui, ug, uo.
+ */
+void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    layer *gates[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    const int gate_count = (int)(sizeof(gates) / sizeof(gates[0]));
+    int g;
+
+    for (g = 0; g < gate_count; ++g) {
+        update_connected_layer(*gates[g], batch, learning_rate, momentum, decay);
+    }
+}
+
+/*
+ * CPU forward pass of the LSTM layer over l.steps time steps.
+ *
+ * For each step the four "w" sub-layers are fed l.h_cpu and the four "u"
+ * sub-layers are fed the current slice of state.input. Each w/u output pair
+ * is summed into f, i, g, o; f, i and o go through LOGISTIC and g through
+ * TANH. The cell and hidden buffers are then updated as
+ *     c = f*c + i*g,   h = o * tanh(c)
+ * (assuming mul_cpu multiplies element-wise into its last array argument),
+ * and c / h are stored into the per-step slices of l.cell_cpu / l.output.
+ *
+ * l.c_cpu and l.h_cpu are not reset here, so whatever they held on entry is
+ * used as the initial cell / hidden state.
+ *
+ * `l`, `state` and the sub-layer structs are by-value copies, so the pointer
+ * advances below do not modify the stored layer.
+ */
+void forward_lstm_layer(layer l, network_state state)
+{
+    network_state s = { 0 };
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+
+    /* Clear the sub-layer deltas for all steps (done regardless of state.train). */
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
+    /* The layer's own delta is only cleared when training. */
+    if (state.train) {
+        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
+    }
+
+    for (i = 0; i < l.steps; ++i) {
+        /* Recurrent contribution: previous hidden state through the w gates. */
+        s.input = l.h_cpu;
+        forward_connected_layer(wf, s);
+        forward_connected_layer(wi, s);
+        forward_connected_layer(wg, s);
+        forward_connected_layer(wo, s);
+
+        /* Input contribution: current input slice through the u gates. */
+        s.input = state.input;
+        forward_connected_layer(uf, s);
+        forward_connected_layer(ui, s);
+        forward_connected_layer(ug, s);
+        forward_connected_layer(uo, s);
+
+        /* Gate pre-activations = w output + u output. */
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
+
+        /* c = f*c + i*g, using temp_cpu for the i*g term. */
+        copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);
+
+        /* h = o * tanh(c). */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1);
+        activate_array(l.h_cpu, l.outputs*l.batch, TANH);
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);
+
+        /* Record this step's cell and hidden state. */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
+
+        /* Advance all per-step pointers to the next time step. */
+        state.input += l.inputs*l.batch;
+        l.output    += l.outputs*l.batch;
+        l.cell_cpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+/*
+ * CPU backward pass of the LSTM layer (back-propagation through time).
+ *
+ * Walks the time steps from last to first. For each step the gate
+ * activations are recomputed from the sub-layers' stored outputs, the
+ * gradient reaching the cell state is formed in temp2_cpu, and each gate's
+ * gradient is written into the matching w/u sub-layer delta before calling
+ * backward_connected_layer on it. The w layers back-propagate into the
+ * previous step's slice of l.delta (via dh_cpu); the u layers back-propagate
+ * into state.delta.
+ *
+ * Comments describing gradient_array assume it scales its last argument by
+ * the activation's derivative evaluated at the (already activated) first
+ * argument -- TODO confirm against activations.c.
+ *
+ * NOTE(review): l.dc_cpu is not cleared before the loop, so the first
+ * iteration adds whatever it held on entry -- confirm this is intended.
+ */
+void backward_lstm_layer(layer l, network_state state)
+{
+    network_state s = { 0 };
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+
+    /* Position every per-step pointer on the last time step. */
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output += l.outputs*l.batch*(l.steps - 1);
+    l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta += l.outputs*l.batch*(l.steps - 1);
+
+    for (i = l.steps - 1; i >= 0; --i) {
+        /* Load previous/current cell and hidden state for this step.
+         * NOTE(review): at i == 0 the prev_* buffers are not refreshed and keep
+         * their earlier contents -- confirm this matches the intended initial state. */
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);
+
+        /* Gradient target for the previous hidden state: the previous step's
+         * slice of l.delta, or NULL at the first step. `l` is a by-value copy,
+         * so this does not overwrite the buffer pointer stored in the layer. */
+        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;
+
+        /* Recompute the gate activations from the stored sub-layer outputs. */
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
+
+        /* temp3 = gradient w.r.t. this step's hidden output. */
+        copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);
+
+        /* temp = tanh(c). */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
+
+        /* temp2 = temp3 * o, then through the tanh gradient, plus the cell
+         * gradient carried in dc_cpu: the total gradient w.r.t. the cell state. */
+        copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);
+
+        gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
+        axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);
+
+        /* Output gate: temp = tanh(c) * temp3, through the logistic gradient of o. */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
+        mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wo, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uo, s);
+
+        /* Candidate gate g: temp = temp2 * i, through the tanh gradient of g. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wg, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ug, s);
+
+        /* Input gate i: temp = temp2 * g, through the logistic gradient of i. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wi, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ui, s);
+
+        /* Forget gate f: temp = temp2 * prev_cell, through the logistic gradient of f. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wf, s);
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uf, s);
+
+        /* Cell gradient handed to the previous step: dc = temp2 * f. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);
+
+        /* Step all per-step pointers back by one time step. */
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;
+        l.output -= l.outputs*l.batch;
+        l.cell_cpu -= l.outputs*l.batch;
+        l.delta -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+
+#ifdef GPU
+/*
+ * GPU parameter update: forwards the same hyper-parameters (including
+ * loss_scale) to each of the eight connected sub-layers, in the order
+ * wf, wi, wg, wo, uf, ui, ug, uo.
+ */
+void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
+{
+    layer *gates[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    const int gate_count = (int)(sizeof(gates) / sizeof(gates[0]));
+    int g;
+
+    for (g = 0; g < gate_count; ++g) {
+        update_connected_layer_gpu(*gates[g], batch, learning_rate, momentum, decay, loss_scale);
+    }
+}
+
+/*
+ * GPU forward pass of the LSTM layer over l.steps time steps.
+ *
+ * Same sequence of operations as forward_lstm_layer, using the *_gpu buffers
+ * and *_ongpu helpers: per step the w sub-layers are fed l.h_gpu, the u
+ * sub-layers are fed the current slice of state.input, the summed outputs
+ * are activated into f, i, g, o, and then
+ *     c = f*c + i*g,   h = o * tanh(c)
+ * (assuming mul_ongpu multiplies element-wise into its last array argument).
+ * c and h are stored into the per-step slices of l.cell_gpu / l.output_gpu.
+ *
+ * l.c_gpu and l.h_gpu are not reset here, so their contents on entry act as
+ * the initial cell / hidden state.
+ */
+void forward_lstm_layer_gpu(layer l, network_state state)
+{
+    network_state s = { 0 };
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+
+    /* Clear the sub-layer deltas for all steps (done regardless of state.train). */
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
+
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
+    /* The layer's own delta is only cleared when training. */
+    if (state.train) {
+        fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+    }
+
+    for (i = 0; i < l.steps; ++i) {
+        /* Recurrent contribution: previous hidden state through the w gates. */
+        s.input = l.h_gpu;
+        forward_connected_layer_gpu(wf, s);
+        forward_connected_layer_gpu(wi, s);
+        forward_connected_layer_gpu(wg, s);
+        forward_connected_layer_gpu(wo, s);
+
+        /* Input contribution: current input slice through the u gates. */
+        s.input = state.input;
+        forward_connected_layer_gpu(uf, s);
+        forward_connected_layer_gpu(ui, s);
+        forward_connected_layer_gpu(ug, s);
+        forward_connected_layer_gpu(uo, s);
+
+        /* Gate pre-activations = w output + u output. */
+        copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+
+        activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+        activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+
+        /* c = f*c + i*g, using temp_gpu for the i*g term. */
+        copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
+
+        /* h = o * tanh(c). */
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
+        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
+        mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
+
+        /* Record this step's cell and hidden state. */
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
+
+        /* Advance all per-step pointers to the next time step. */
+        state.input += l.inputs*l.batch;
+        l.output_gpu    += l.outputs*l.batch;
+        l.cell_gpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+/*
+ * GPU backward pass of the LSTM layer (back-propagation through time).
+ *
+ * Same sequence of operations as backward_lstm_layer, using the *_gpu
+ * buffers and *_ongpu helpers. Steps are visited from last to first; the
+ * gate activations are recomputed from stored sub-layer outputs, the cell
+ * gradient is formed in temp2_gpu, and each gate gradient is copied into the
+ * matching w/u sub-layer delta before its backward call. The w layers
+ * back-propagate into the previous step's slice of l.delta_gpu (via dh_gpu);
+ * the u layers back-propagate into state.delta.
+ *
+ * Comments describing gradient_array_ongpu assume it scales its last
+ * argument by the activation's derivative evaluated at the (already
+ * activated) first argument -- TODO confirm.
+ *
+ * NOTE(review): l.dc_gpu is not cleared before the loop, so the first
+ * iteration adds whatever it held on entry -- confirm this is intended.
+ */
+void backward_lstm_layer_gpu(layer l, network_state state)
+{
+    network_state s = { 0 };
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+
+    /* Position every per-step pointer on the last time step. */
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
+
+    for (i = l.steps - 1; i >= 0; --i) {
+        /* Load previous/current cell and hidden state for this step.
+         * NOTE(review): at i == 0 the prev_* buffers are not refreshed and keep
+         * their earlier contents -- confirm this matches the intended initial state. */
+        if (i != 0) copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
+        if (i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
+
+        /* Gradient target for the previous hidden state: the previous step's
+         * slice of l.delta_gpu, or NULL at the first step. `l` is a by-value
+         * copy, so the buffer pointer stored in the layer is not overwritten. */
+        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
+
+        /* Recompute the gate activations from the stored sub-layer outputs. */
+        copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+
+        copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+
+        activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+        activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+
+        /* temp3 = gradient w.r.t. this step's hidden output. */
+        copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
+
+        /* temp = tanh(c). */
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
+
+        /* temp2 = temp3 * o, then through the tanh gradient, plus the cell
+         * gradient carried in dc_gpu: the total gradient w.r.t. the cell state. */
+        copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
+
+        gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
+        axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
+
+        /* Output gate: temp = tanh(c) * temp3, through the logistic gradient of o. */
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
+        mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        s.delta = l.dh_gpu;
+        backward_connected_layer_gpu(wo, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer_gpu(uo, s);
+
+        /* Candidate gate g: temp = temp2 * i, through the tanh gradient of g. */
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        s.delta = l.dh_gpu;
+        backward_connected_layer_gpu(wg, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer_gpu(ug, s);
+
+        /* Input gate i: temp = temp2 * g, through the logistic gradient of i. */
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        s.delta = l.dh_gpu;
+        backward_connected_layer_gpu(wi, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer_gpu(ui, s);
+
+        /* Forget gate f: temp = temp2 * prev_cell, through the logistic gradient of f. */
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        s.delta = l.dh_gpu;
+        backward_connected_layer_gpu(wf, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer_gpu(uf, s);
+
+        /* Cell gradient handed to the previous step: dc = temp2 * f. */
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
+
+        /* Step all per-step pointers back by one time step. */
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;
+        l.output_gpu -= l.outputs*l.batch;
+        l.cell_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+#endif
diff --git a/darknet-master/src/lstm_layer.h b/darknet-master/src/lstm_layer.h
new file mode 100644
index 0000000..a116a83
--- /dev/null
+++ b/darknet-master/src/lstm_layer.h
@@ -0,0 +1,27 @@
+/* Public interface of the LSTM layer. */
+#ifndef LSTM_LAYER_H
+#define LSTM_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+/* NOTE(review): USET is defined here but not referenced in this header; its purpose is not evident from this file. */
+#define USET
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Constructor: `batch` is the total batch including time steps, `steps` the number of unrolled time steps. */
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+
+/* CPU forward / backward / parameter-update entry points. */
+void forward_lstm_layer(layer l, network_state state);
+void backward_lstm_layer(layer l, network_state state);
+void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+
+#ifdef GPU
+/* GPU counterparts; only declared when the build defines GPU. */
+void forward_lstm_layer_gpu(layer l, network_state state);
+void backward_lstm_layer_gpu(layer l, network_state state);
+void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/matrix.c b/darknet-master/src/matrix.c
new file mode 100644
index 0000000..715ee80
--- /dev/null
+++ b/darknet-master/src/matrix.c
@@ -0,0 +1,332 @@
+#include "matrix.h"
+#include "utils.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+void free_matrix(matrix m)
+{   // Release every row buffer of m, then the row-pointer table itself.
+    int row = 0;
+    while (row < m.rows) free(m.vals[row++]);
+    free(m.vals);
+}
+
+float matrix_topk_accuracy(matrix truth, matrix guess, int k)
+{
+    // Returns the fraction of rows of truth for which one of the k column indices written by
+    // top_k() (defined elsewhere; presumably the k best guesses of that row) is non-zero in truth.
+    int *top = (int*)xcalloc(k, sizeof(int));
+    const int classes = truth.cols;
+    int hits = 0;
+    int row, rank;
+    for (row = 0; row < truth.rows; ++row) {
+        top_k(guess.vals[row], classes, k, top);
+        rank = 0;
+        // advance to the first candidate column that truth marks as non-zero
+        while (rank < k && !truth.vals[row][top[rank]]) ++rank;
+        if (rank < k) ++hits;   // a row is counted at most once
+    }
+    free(top);
+    // note: truth.rows is not checked for zero before the division
+    return (float)hits/truth.rows;
+}
+
+void scale_matrix(matrix m, float scale)
+{
+    // Multiply every element of m by scale, in place.
+    int r, c;
+    for (r = 0; r < m.rows; ++r) {
+        float *row = m.vals[r];
+        for (c = 0; c < m.cols; ++c) row[c] *= scale;
+    }
+}
+
+matrix resize_matrix(matrix m, int size)
+{
+    // Return m with exactly size rows: added rows come from xcalloc, surplus rows are freed.
+    // Callers must keep the returned struct, because m.vals is reassigned from xrealloc.
+    int r;
+    const int old_rows = m.rows;
+    if (old_rows == size) return m;
+    if (size > old_rows) {
+        m.vals = (float**)xrealloc(m.vals, size * sizeof(float*));
+        for (r = old_rows; r < size; ++r) m.vals[r] = (float*)xcalloc(m.cols, sizeof(float));
+    } else {
+        // shrinking: release the tail rows before trimming the pointer table
+        for (r = size; r < old_rows; ++r) free(m.vals[r]);
+        m.vals = (float**)xrealloc(m.vals, size * sizeof(float*));
+    }
+    m.rows = size;
+    return m;
+}
+
+void matrix_add_matrix(matrix from, matrix to)
+{   // Element-wise to += from; the two matrices must have identical dimensions (asserted).
+    int r, c;
+    assert(from.rows == to.rows && from.cols == to.cols);
+    for (r = 0; r < from.rows; ++r) {
+        const float *src = from.vals[r];
+        float *dst = to.vals[r];
+        for (c = 0; c < from.cols; ++c) dst[c] += src[c];
+    }
+}
+
+matrix make_matrix(int rows, int cols)
+{
+    // Allocate a rows x cols matrix: a table of row pointers plus one xcalloc'd float buffer per row.
+    // Release it with free_matrix().
+    matrix m;
+    int r;
+    m.rows = rows;
+    m.cols = cols;
+    m.vals = (float**)xcalloc(rows, sizeof(float*));
+    for (r = 0; r < rows; ++r) m.vals[r] = (float*)xcalloc(cols, sizeof(float));
+    return m;
+}
+
+matrix hold_out_matrix(matrix *m, int n)
+{   // Move n randomly chosen rows out of *m into a new matrix; row buffers are transferred, not copied.
+    int i;
+    matrix h;
+    if (n > m->rows) n = m->rows;   // clamp: once *m is emptied, rand()%m->rows would divide by zero
+    h.rows = n; h.cols = m->cols;
+    h.vals = (float**)xcalloc(h.rows, sizeof(float*));
+    for(i = 0; i < n; ++i){
+        int index = rand()%m->rows;
+        h.vals[i] = m->vals[index];
+        m->vals[index] = m->vals[--(m->rows)];   // fill the hole with the current last row
+    }
+    return h;
+}
+
+float *pop_column(matrix *m, int c)
+{
+    // Remove column c from *m in place and return its values as a new array of m->rows floats.
+    const int last = m->cols - 1;
+    float *removed = (float*)xcalloc(m->rows, sizeof(float));
+    int r, k;
+    for (r = 0; r < m->rows; ++r) {
+        removed[r] = m->vals[r][c];
+        for (k = c; k < last; ++k) m->vals[r][k] = m->vals[r][k+1];   // shift the tail left by one
+    }
+    m->cols = last;
+    return removed;
+}
+
+matrix csv_to_matrix(char *filename)
+{
+    // Read a CSV file into a matrix, one row per line; the column count is taken from the first line.
+    FILE *fp = fopen(filename, "r");
+    if(!fp) file_error(filename);
+
+    matrix m;
+    m.cols = -1;   // -1 marks "column count not known yet"
+    char *line;
+    int n = 0;
+    int size = 1024;   // current capacity of the row table, doubled whenever it fills up
+    m.vals = (float**)xcalloc(size, sizeof(float*));
+    while((line = fgetl(fp))){
+        if(m.cols == -1) m.cols = count_fields(line);
+        if(n == size){
+            size *= 2;
+            m.vals = (float**)xrealloc(m.vals, size * sizeof(float*));
+        }
+        m.vals[n] = parse_fields(line, m.cols);
+        free(line);
+        ++n;
+    }
+    fclose(fp);   // release the stream; without this every call leaked one open FILE
+    m.vals = (float**)xrealloc(m.vals, n * sizeof(float*));   // trim the table to the rows actually read
+    m.rows = n;
+    return m;
+}
+
+void matrix_to_csv(matrix m)
+{
+    // Write m to stdout as comma-separated rows, formatting each value with %.17g.
+    int r, c;
+    for (r = 0; r < m.rows; ++r) {
+        for (c = 0; c < m.cols; ++c) {
+            if (c != 0) printf(",");
+            printf("%.17g", m.vals[r][c]);
+        }
+        printf("\n");
+    }
+}
+
+void print_matrix(matrix m)
+{
+    // Pretty-print m to stdout inside an ASCII frame, 16 characters per column.
+    int r, c;
+    const int gap = 16*m.cols - 1;   // number of spaces printed between the frame corners
+    printf("%d X %d Matrix:\n",m.rows, m.cols);
+    printf(" __");
+    for (c = 0; c < gap; ++c) printf(" ");
+    printf("__ \n");
+
+    printf("|  ");
+    for (c = 0; c < gap; ++c) printf(" ");
+    printf("  |\n");
+
+    for (r = 0; r < m.rows; ++r) {
+        printf("|  ");
+        for (c = 0; c < m.cols; ++c) printf("%15.7f ", m.vals[r][c]);
+        printf(" |\n");
+    }
+    printf("|__");
+    for (c = 0; c < gap; ++c) printf(" ");
+    printf("__|\n");
+}
+
+
+matrix make_matrix(int rows, int cols);
+// Forward declarations of the k-means helpers that are defined further down in this file.
+void copy(float *x, float *y, int n);
+float dist(float *x, float *y, int n);
+int *sample(int n);
+
+int closest_center(float *datum, matrix centers)
+{
+    // Index of the row of centers with the smallest dist() to datum; ties keep the lowest index.
+    int winner = 0;
+    float lowest = dist(datum, centers.vals[winner], centers.cols);
+    int row;
+    for (row = 0; row < centers.rows; ++row) {
+        const float d = dist(datum, centers.vals[row], centers.cols);
+        if (!(d < lowest)) continue;
+        lowest = d;
+        winner = row;
+    }
+    return winner;
+}
+
+float dist_to_closest_center(float *datum, matrix centers)
+{   // dist() from datum to whichever row of centers closest_center() selects.
+    const int nearest = closest_center(datum, centers);
+    return dist(datum, centers.vals[nearest], centers.cols);
+}
+
+int kmeans_expectation(matrix data, int *assignments, matrix centers)
+{
+    // Reassign every data row to its closest center; returns 1 when no assignment changed, else 0.
+    int row, moved = 0;
+    for (row = 0; row < data.rows; ++row) {
+        const int nearest = closest_center(data.vals[row], centers);
+        if (nearest != assignments[row]) moved = 1;
+        assignments[row] = nearest;
+    }
+    return !moved;
+}
+
+void kmeans_maximization(matrix data, int *assignments, matrix centers)
+{
+    // Recompute each center as the mean of the data rows assigned to it (centers is updated in place).
+    matrix previous = make_matrix(centers.rows, centers.cols);
+    int *members = (int*)xcalloc(centers.rows, sizeof(int));
+    int r, c;
+
+    // remember the current centers, then clear them so they can serve as accumulators
+    for (r = 0; r < centers.rows; ++r) {
+        for (c = 0; c < centers.cols; ++c) {
+            previous.vals[r][c] = centers.vals[r][c];
+            centers.vals[r][c] = 0;
+        }
+    }
+    // per-center coordinate sums and member counts
+    for (r = 0; r < data.rows; ++r) {
+        const int owner = assignments[r];
+        ++members[owner];
+        for (c = 0; c < data.cols; ++c) centers.vals[owner][c] += data.vals[r][c];
+    }
+    // sums -> means; centers without members stay at zero for now
+    for (r = 0; r < centers.rows; ++r) {
+        if (!members[r]) continue;
+        for (c = 0; c < centers.cols; ++c) centers.vals[r][c] /= members[r];
+    }
+    // any coordinate that ended up exactly 0 falls back to its previous value
+    for (r = 0; r < centers.rows; ++r) {
+        for (c = 0; c < centers.cols; ++c) {
+            if (centers.vals[r][c] == 0) centers.vals[r][c] = previous.vals[r][c];
+        }
+    }
+    free(members);
+    free_matrix(previous);
+}
+
+
+
+void random_centers(matrix data, matrix centers) {   // Seed each center with a distinct, randomly chosen data row.
+    int k;
+    int *order = sample(data.rows);   // random permutation of the data row indices
+    for (k = 0; k < centers.rows; ++k) {
+        copy(data.vals[order[k]], centers.vals[k], data.cols);
+    }
+    free(order);
+}
+
+int *sample(int n)
+{   // Return a newly allocated permutation of 0..n-1, shuffled with rand(); the caller frees it.
+    int* perm = (int*)xcalloc(n, sizeof(int));
+    int i;
+    for (i = 0; i < n; ++i) perm[i] = i;
+    for (i = n; i-- > 0; ) {
+        const int pick = rand() % (i + 1);
+        const int held = perm[i];
+        perm[i] = perm[pick];
+        perm[pick] = held;
+    }
+    return perm;
+}
+
+float dist(float *x, float *y, int n)
+{
+    // 1 - IoU of two boxes described by x[0..1] and y[0..1] (presumably width, height); n is not used.
+    const float min_w = (y[0] > x[0]) ? x[0] : y[0];
+    const float min_h = (y[1] > x[1]) ? x[1] : y[1];
+    const float overlap = min_w*min_h;
+    const float total = x[0] * x[1] + y[0] * y[1];
+    const float joined = total - overlap;
+    const float iou = overlap / joined;
+    return 1 - iou;
+}
+
+void copy(float *x, float *y, int n)
+{   // Copy the first n floats of x into y, front to back.
+    int k = 0;
+    while (k < n) { y[k] = x[k]; ++k; }
+}
+
+model do_kmeans(matrix data, int k)
+{   // Cluster the rows of data around k centers; returns the centers and per-row assignments (neither is freed here).
+    matrix centers = make_matrix(k, data.cols);
+    int* assignments = (int*)xcalloc(data.rows, sizeof(int));
+    //smart_centers(data, centers);
+    random_centers(data, centers);  // IoU = 67.31% after kmeans
+    // The block below is disabled: it hard-codes nine initial centers instead of sampling them.
+    /*
+    // IoU = 63.29%, anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+    centers.vals[0][0] = 10; centers.vals[0][1] = 13;
+    centers.vals[1][0] = 16; centers.vals[1][1] = 30;
+    centers.vals[2][0] = 33; centers.vals[2][1] = 23;
+    centers.vals[3][0] = 30; centers.vals[3][1] = 61;
+    centers.vals[4][0] = 62; centers.vals[4][1] = 45;
+    centers.vals[5][0] = 59; centers.vals[5][1] = 119;
+    centers.vals[6][0] = 116; centers.vals[6][1] = 90;
+    centers.vals[7][0] = 156; centers.vals[7][1] = 198;
+    centers.vals[8][0] = 373; centers.vals[8][1] = 326;
+    */
+
+    // range centers [min - max] using exp graph or Pyth example
+    if (k == 1) kmeans_maximization(data, assignments, centers);   // one explicit update for k == 1; presumably the loop below would otherwise see "converged" at once - confirm
+    int i;
+    for(i = 0; i < 1000 && !kmeans_expectation(data, assignments, centers); ++i) {   // alternate the two steps until no assignment changes, capped at 1000 rounds
+        kmeans_maximization(data, assignments, centers);
+    }
+    printf("\n iterations = %d \n", i);
+    model m;
+    m.assignments = assignments;
+    m.centers = centers;
+    return m;
+}
diff --git a/darknet-master/src/matrix.h b/darknet-master/src/matrix.h
new file mode 100644
index 0000000..d565722
--- /dev/null
+++ b/darknet-master/src/matrix.h
@@ -0,0 +1,37 @@
+#ifndef MATRIX_H
+#define MATRIX_H
+#include "darknet.h"
+
+//typedef struct matrix{
+//    int rows, cols;
+//    float **vals;
+//} matrix;
+
+typedef struct {
+    int *assignments;   // one center index per data row, as filled in by do_kmeans()
+    matrix centers;     // one row per cluster center
+} model;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+model do_kmeans(matrix data, int k);
+matrix make_matrix(int rows, int cols);
+void free_matrix(matrix m);
+void print_matrix(matrix m);
+// CSV input/output and whole-matrix helpers
+matrix csv_to_matrix(char *filename);
+void matrix_to_csv(matrix m);
+matrix hold_out_matrix(matrix *m, int n);
+float matrix_topk_accuracy(matrix truth, matrix guess, int k);
+void matrix_add_matrix(matrix from, matrix to);
+void scale_matrix(matrix m, float scale);
+matrix resize_matrix(matrix m, int size);
+// removes column c from *m and returns its values as a newly allocated array
+float *pop_column(matrix *m, int c);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/maxpool_layer.c b/darknet-master/src/maxpool_layer.c
new file mode 100644
index 0000000..0e7dfd5
--- /dev/null
+++ b/darknet-master/src/maxpool_layer.c
@@ -0,0 +1,414 @@
+#include "maxpool_layer.h"
+#include "convolutional_layer.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "gemm.h"
+#include <stdio.h>
+
+image get_maxpool_image(maxpool_layer l)
+{
+    // Describe l.output as an image of out_w x out_h with l.c channels.
+    // The conversion itself is done by float_to_image(), which is defined elsewhere.
+    image view = float_to_image(l.out_w, l.out_h, l.c, l.output);
+    return view;
+}
+
+image get_maxpool_delta(maxpool_layer l)
+{
+    // Describe l.delta as an image of out_w x out_h with l.c channels.
+    // The conversion itself is done by float_to_image(), which is defined elsewhere.
+    image view = float_to_image(l.out_w, l.out_h, l.c, l.delta);
+    return view;
+}
+
+void create_maxpool_cudnn_tensors(layer *l)
+{   // Create the cuDNN pooling descriptor and the source/destination tensor descriptors of *l; the body is empty unless CUDNN is defined.
+#ifdef CUDNN
+    CHECK_CUDNN(cudnnCreatePoolingDescriptor(&l->poolingDesc));
+    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc));
+    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc));
+#endif // CUDNN
+}
+
+void cudnn_maxpool_setup(layer *l)
+{   // Configure the cuDNN max-pooling descriptor and the NCHW float tensor descriptors of *l; the body is empty unless CUDNN is defined.
+#ifdef CUDNN
+    CHECK_CUDNN(cudnnSetPooling2dDescriptor(
+        l->poolingDesc,
+        CUDNN_POOLING_MAX,
+        CUDNN_NOT_PROPAGATE_NAN,    // CUDNN_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN
+        l->size,            // windowHeight
+        l->size,            // windowWidth
+        l->pad/2,           // verticalPadding
+        l->pad/2,           // horizontalPadding
+        l->stride_y,        // verticalStride: cuDNN expects the vertical stride first
+        l->stride_x));      // horizontalStride
+    // source descriptor = layer input shape, destination descriptor = layer output shape
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
+#endif // CUDNN
+}
+
+
+void cudnn_local_avgpool_setup(layer *l)
+{   // Configure the cuDNN average-pooling (padding excluded from the count) descriptor and the NCHW float tensor descriptors of *l; the body is empty unless CUDNN is defined.
+#ifdef CUDNN
+    CHECK_CUDNN(cudnnSetPooling2dDescriptor(
+        l->poolingDesc,
+        CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
+        CUDNN_NOT_PROPAGATE_NAN,    // CUDNN_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN
+        l->size,            // windowHeight
+        l->size,            // windowWidth
+        l->pad / 2,         // verticalPadding
+        l->pad / 2,         // horizontalPadding
+        l->stride_y,        // verticalStride: cuDNN expects the vertical stride first
+        l->stride_x));      // horizontalStride
+    // source descriptor = layer input shape, destination descriptor = layer output shape
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w));
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
+#endif // CUDNN
+}
+
+maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing, int avgpool, int train)
+{
+    // Build a max-pooling layer (a local average-pooling layer when avgpool is non-zero) for a batch x c x h x w input.
+    // Delta/index buffers are allocated only when train is set; a non-zero antialiasing appends a fixed blur convolution in l.input_layer.
+    maxpool_layer l = { (LAYER_TYPE)0 };
+    l.avgpool = avgpool;
+    l.type = avgpool ? LOCAL_AVGPOOL : MAXPOOL;
+    l.train = train;
+    const int blur_stride_x = stride_x;
+    const int blur_stride_y = stride_y;
+    l.antialiasing = antialiasing;
+    if (antialiasing) {
+        stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer
+    }
+
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.pad = padding;
+    l.maxpool_depth = maxpool_depth;
+    l.out_channels = out_channels;
+    if (maxpool_depth) {
+        l.out_c = out_channels;   // depth pooling keeps the spatial size and reduces channels
+        l.out_w = l.w;
+        l.out_h = l.h;
+    }
+    else {
+        l.out_w = (w + padding - size) / stride_x + 1;
+        l.out_h = (h + padding - size) / stride_y + 1;
+        l.out_c = c;
+    }
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = h*w*c;
+    l.size = size;
+    l.stride = stride_x;
+    l.stride_x = stride_x;
+    l.stride_y = stride_y;
+    int output_size = l.out_h * l.out_w * l.out_c * batch;
+
+    if (train) {
+        if (!avgpool) l.indexes = (int*)xcalloc(output_size, sizeof(int));
+        l.delta = (float*)xcalloc(output_size, sizeof(float));
+    }
+    l.output = (float*)xcalloc(output_size, sizeof(float));
+    if (avgpool) {
+        l.forward = forward_local_avgpool_layer;
+        l.backward = backward_local_avgpool_layer;
+    }
+    else {
+        l.forward = forward_maxpool_layer;
+        l.backward = backward_maxpool_layer;
+    }
+#ifdef GPU
+    if (avgpool) {
+        l.forward_gpu = forward_local_avgpool_layer_gpu;
+        l.backward_gpu = backward_local_avgpool_layer_gpu;
+    }
+    else {
+        l.forward_gpu = forward_maxpool_layer_gpu;
+        l.backward_gpu = backward_maxpool_layer_gpu;
+    }
+
+    if (train) {
+        if (!avgpool) l.indexes_gpu = cuda_make_int_array(output_size);
+        l.delta_gpu = cuda_make_array(l.delta, output_size);
+    }
+    l.output_gpu  = cuda_make_array(l.output, output_size);
+    create_maxpool_cudnn_tensors(&l);
+    if (avgpool) cudnn_local_avgpool_setup(&l);
+    else cudnn_maxpool_setup(&l);
+
+#endif  // GPU
+    l.bflops = ((double)l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;   // product taken in double: as int it overflows for large layers
+    if (avgpool) {
+        if (stride_x == stride_y)
+            fprintf(stderr, "avg               %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+        else
+            fprintf(stderr, "avg              %2dx%2d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    }
+    else {
+        if (maxpool_depth)
+            fprintf(stderr, "max-depth         %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+        else if (stride_x == stride_y)
+            fprintf(stderr, "max               %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+        else
+            fprintf(stderr, "max              %2dx%2d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    }
+
+    if (l.antialiasing) {
+        printf("AA:  ");
+        l.input_layer = (layer*)xcalloc(1, sizeof(layer));   // xcalloc, like every other allocation in this function
+        int blur_size = 3;
+        int blur_pad = blur_size / 2;
+        if (l.antialiasing == 2) {
+            blur_size = 2;
+            blur_pad = 0;
+        }
+        *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0, 0, train);
+        const int blur_nweights = l.out_c * blur_size * blur_size;  // (n / n) * n * blur_size * blur_size;
+        int i;
+        if (blur_size == 2) {
+            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
+                l.input_layer->weights[i + 0] = 1 / 4.f;
+                l.input_layer->weights[i + 1] = 1 / 4.f;
+                l.input_layer->weights[i + 2] = 1 / 4.f;
+                l.input_layer->weights[i + 3] = 1 / 4.f;
+            }
+        }
+        else {
+            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
+                l.input_layer->weights[i + 0] = 1 / 16.f;
+                l.input_layer->weights[i + 1] = 2 / 16.f;
+                l.input_layer->weights[i + 2] = 1 / 16.f;
+
+                l.input_layer->weights[i + 3] = 2 / 16.f;
+                l.input_layer->weights[i + 4] = 4 / 16.f;
+                l.input_layer->weights[i + 5] = 2 / 16.f;
+
+                l.input_layer->weights[i + 6] = 1 / 16.f;
+                l.input_layer->weights[i + 7] = 2 / 16.f;
+                l.input_layer->weights[i + 8] = 1 / 16.f;
+            }
+        }
+        for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0;
+#ifdef GPU
+        if (gpu_index >= 0) {
+            if (l.antialiasing) l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs);
+            push_convolutional_layer(*(l.input_layer));
+        }
+#endif  // GPU
+    }
+
+    return l;
+}
+
+void resize_maxpool_layer(maxpool_layer *l, int w, int h)
+{   // Re-derive the layer geometry for a new w x h input and reallocate the CPU (and, with GPU, device) buffers to match.
+    l->h = h;
+    l->w = w;
+    l->inputs = h*w*l->c;
+    // NOTE(review): make_maxpool_layer() uses out_w = w, out_h = h when maxpool_depth is set; this formula is applied unconditionally - confirm.
+    l->out_w = (w + l->pad - l->size) / l->stride_x + 1;
+    l->out_h = (h + l->pad - l->size) / l->stride_y + 1;
+    l->outputs = l->out_w * l->out_h * l->out_c;
+    int output_size = l->outputs * l->batch;
+
+    if (l->train) {   // index/delta buffers exist only for training layers
+        if (!l->avgpool) l->indexes = (int*)xrealloc(l->indexes, output_size * sizeof(int));
+        l->delta = (float*)xrealloc(l->delta, output_size * sizeof(float));
+    }
+    l->output = (float*)xrealloc(l->output, output_size * sizeof(float));
+
+#ifdef GPU
+    CHECK_CUDA(cudaFree(l->output_gpu));   // device buffers are freed and re-created rather than resized
+    l->output_gpu  = cuda_make_array(l->output, output_size);
+
+    if (l->train) {
+        if (!l->avgpool) {
+            CHECK_CUDA(cudaFree((float *)l->indexes_gpu));
+            l->indexes_gpu = cuda_make_int_array(output_size);
+        }
+        CHECK_CUDA(cudaFree(l->delta_gpu));
+        l->delta_gpu = cuda_make_array(l->delta, output_size);
+    }
+    // refresh the cuDNN descriptors with the new shapes
+    if(l->avgpool) cudnn_local_avgpool_setup(l);
+    else cudnn_maxpool_setup(l);
+#endif
+}
+
+void forward_maxpool_layer(const maxpool_layer l, network_state state)
+{   // CPU forward pass: depth-wise max pooling when l.maxpool_depth is set, otherwise spatial max pooling, then the optional anti-aliasing blur.
+    if (l.maxpool_depth)
+    {
+        int b, i, j, k, g;   // j, k, g are declared outside the parallel loop, so they must be made private below
+        for (b = 0; b < l.batch; ++b) {
+            #pragma omp parallel for private(j, k, g)
+            for (i = 0; i < l.h; ++i) {
+                for (j = 0; j < l.w; ++j) {
+                    for (g = 0; g < l.out_c; ++g)
+                    {
+                        int out_index = j + l.w*(i + l.h*(g + l.out_c*b));
+                        float max = -FLT_MAX;
+                        int max_i = -1;
+
+                        for (k = g; k < l.c; k += l.out_c)   // input channels g, g+out_c, g+2*out_c, ...
+                        {
+                            int in_index = j + l.w*(i + l.h*(k + l.c*b));
+                            float val = state.input[in_index];
+
+                            max_i = (val > max) ? in_index : max_i;
+                            max = (val > max) ? val : max;
+                        }
+                        l.output[out_index] = max;
+                        if (l.indexes) l.indexes[out_index] = max_i;
+                    }
+                }
+            }
+        }
+        return;
+    }
+
+    // Inference with equal strides is delegated to forward_maxpool_layer_avx() (declared elsewhere).
+    if (!state.train && l.stride_x == l.stride_y) {
+        forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch);
+    }
+    else
+    {
+
+        int b, i, j, k, m, n;
+        int w_offset = -l.pad / 2;
+        int h_offset = -l.pad / 2;
+
+        int h = l.out_h;
+        int w = l.out_w;
+        int c = l.c;
+
+        for (b = 0; b < l.batch; ++b) {
+            for (k = 0; k < c; ++k) {
+                for (i = 0; i < h; ++i) {
+                    for (j = 0; j < w; ++j) {
+                        int out_index = j + w*(i + h*(k + c*b));
+                        float max = -FLT_MAX;
+                        int max_i = -1;   // stays -1 if no tap compares greater than -FLT_MAX
+                        for (n = 0; n < l.size; ++n) {
+                            for (m = 0; m < l.size; ++m) {
+                                int cur_h = h_offset + i*l.stride_y + n;
+                                int cur_w = w_offset + j*l.stride_x + m;
+                                int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
+                                int valid = (cur_h >= 0 && cur_h < l.h &&
+                                    cur_w >= 0 && cur_w < l.w);
+                                float val = (valid != 0) ? state.input[index] : -FLT_MAX;   // out-of-bounds taps can never win
+                                max_i = (val > max) ? index : max_i;
+                                max = (val > max) ? val : max;
+                            }
+                        }
+                        l.output[out_index] = max;
+                        if (l.indexes) l.indexes[out_index] = max_i;
+                    }
+                }
+            }
+        }
+    }
+
+    if (l.antialiasing) {   // run the blur convolution on the pooled output and copy its result back into l.output
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        s.input = l.output;
+        forward_convolutional_layer(*(l.input_layer), s);
+        //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing);
+        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
+    }
+}
+
+void backward_maxpool_layer(const maxpool_layer l, network_state state)
+{
+    // Route each output delta back to the input element recorded in l.indexes during the forward pass.
+    int i, n = l.out_h*l.out_w*l.out_c*l.batch;
+    #pragma omp parallel for
+    for(i = 0; i < n; ++i){
+        int index = l.indexes[i];
+        if (index < 0) continue;   // the forward pass stores -1 when no tap was selected
+        #pragma omp atomic
+        state.delta[index] += l.delta[i];   // atomic: overlapping windows may select the same input element
+    }
+}
+
+
+void forward_local_avgpool_layer(const maxpool_layer l, network_state state)
+{
+    // Local average pooling on the CPU: each output is the mean of the l.size x l.size window taps
+    // that fall inside the input; taps outside the input are left out of both the sum and the divisor.
+    const int shift = -l.pad / 2;   // window origin offset, shared by rows and columns
+    const int out_h = l.out_h;
+    const int out_w = l.out_w;
+    const int chans = l.c;
+    int b, k, oy, ox, dy, dx;
+
+    for (b = 0; b < l.batch; ++b) {
+        for (k = 0; k < chans; ++k) {
+            const int plane = k + b*l.c;   // flattened (batch, channel) plane index of the input
+            for (oy = 0; oy < out_h; ++oy) {
+                for (ox = 0; ox < out_w; ++ox) {
+                    const int out_index = ox + out_w*(oy + out_h*(k + chans*b));
+                    float sum = 0;
+                    int taps = 0;
+
+                    for (dy = 0; dy < l.size; ++dy) {
+                        const int y = shift + oy*l.stride_y + dy;
+                        if (y < 0 || y >= l.h) continue;   // the whole window row lies outside the input
+                        for (dx = 0; dx < l.size; ++dx) {
+                            const int x = shift + ox*l.stride_x + dx;
+                            if (x < 0 || x >= l.w) continue;
+                            sum += state.input[x + l.w*(y + l.h*plane)];
+                            ++taps;
+                        }
+                    }
+
+                    // taps is not checked for zero before dividing
+                    l.output[out_index] = sum / taps;
+                }
+            }
+        }
+    }
+}
+
+void backward_local_avgpool_layer(const maxpool_layer l, network_state state)
+{
+    // Backward pass of local average pooling: every in-bounds tap of a window receives
+    // l.delta[out] / (l.size*l.size), accumulated into state.delta.
+    // NOTE(review): forward_local_avgpool_layer() divides by the number of in-bounds taps, whereas
+    // this divides by the full window area - confirm the mismatch at padded borders is intended.
+
+    const int shift = -l.pad / 2;
+    const int area = l.size*l.size;
+    const int out_h = l.out_h;
+    const int out_w = l.out_w;
+    const int chans = l.c;
+    int b, k, oy, ox, dy, dx;
+
+    for (b = 0; b < l.batch; ++b) {
+        for (k = 0; k < chans; ++k) {
+            const int plane = k + b*l.c;
+            for (oy = 0; oy < out_h; ++oy) {
+                for (ox = 0; ox < out_w; ++ox) {
+                    const int out_index = ox + out_w*(oy + out_h*(k + chans*b));
+                    for (dy = 0; dy < l.size; ++dy) {
+                        const int y = shift + oy*l.stride_y + dy;
+                        if (y < 0 || y >= l.h) continue;
+                        for (dx = 0; dx < l.size; ++dx) {
+                            const int x = shift + ox*l.stride_x + dx;
+                            if (x < 0 || x >= l.w) continue;
+                            state.delta[x + l.w*(y + l.h*plane)] += l.delta[out_index] / area;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/darknet-master/src/maxpool_layer.h b/darknet-master/src/maxpool_layer.h
new file mode 100644
index 0000000..3c00611
--- /dev/null
+++ b/darknet-master/src/maxpool_layer.h
@@ -0,0 +1,36 @@
+#ifndef MAXPOOL_LAYER_H
+#define MAXPOOL_LAYER_H
+
+#include "image.h"
+#include "dark_cuda.h"
+#include "layer.h"
+#include "network.h"
+
+typedef layer maxpool_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+image get_maxpool_image(maxpool_layer l);
+maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing, int avgpool, int train);
+void resize_maxpool_layer(maxpool_layer *l, int w, int h);
+void forward_maxpool_layer(const maxpool_layer l, network_state state);
+void backward_maxpool_layer(const maxpool_layer l, network_state state);
+// local average pooling shares the maxpool_layer type
+void forward_local_avgpool_layer(const maxpool_layer l, network_state state);
+void backward_local_avgpool_layer(const maxpool_layer l, network_state state);
+// GPU entry points, declared only when GPU is defined
+#ifdef GPU
+void forward_maxpool_layer_gpu(maxpool_layer l, network_state state);
+void backward_maxpool_layer_gpu(maxpool_layer l, network_state state);
+void cudnn_maxpool_setup(maxpool_layer *l);
+
+void forward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state);
+void backward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state);
+#endif // GPU
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/maxpool_layer_kernels.cu b/darknet-master/src/maxpool_layer_kernels.cu
new file mode 100644
index 0000000..1d0d1bd
--- /dev/null
+++ b/darknet-master/src/maxpool_layer_kernels.cu
@@ -0,0 +1,387 @@
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+
+#include "maxpool_layer.h"
+#include "convolutional_layer.h"
+#include "blas.h"
+#include "dark_cuda.h"
+
+__global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch, float *input, float *output, int *indexes)
+{   // Depth-wise max pooling: each thread handles one (b, i, j) position and loops over all out_c output channels.
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= n) return;   // threads beyond the n work items do nothing
+    // decompose the flat id into column j, row i and batch b
+    int j = id % w;
+    id = id / w;
+    int i = id % h;
+    id = id / h;
+    //int g = id % out_c;
+    //id = id / out_c;
+    int b = id % batch;
+
+    int k;
+    for (int g = 0; g < out_c; ++g)
+    {
+        int out_index = j + w*(i + h*(g + out_c*b));
+        float max = -FLT_MAX;
+        int max_i = -1;
+
+        for (k = g; k < c; k += out_c)   // input channels g, g+out_c, g+2*out_c, ...
+        {
+            int in_index = j + w*(i + h*(k + c*b));
+            float val = input[in_index];
+
+            max_i = (val > max) ? in_index : max_i;
+            max = (val > max) ? val : max;
+        }
+        output[out_index] = max;
+        if (indexes) indexes[out_index] = max_i;   // remember the winning input position when an index buffer is given
+    }
+}
+
+
+__global__ void backward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int batch, float *delta, float *prev_delta, int *indexes)
+{
+    // One thread per output element: add its delta to the input position recorded in indexes.
+    // The w, h, c and batch parameters are not used by the body.
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+    prev_delta[indexes[tid]] += delta[tid];
+}
+
+
+__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output, int *indexes)
+{   // Spatial max pooling: one thread per output element, scanning a size x size input window.
+    int h = (in_h + pad - size) / stride_y + 1;   // output height
+    int w = (in_w + pad - size) / stride_x + 1;   // output width
+    int c = in_c;
+
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return;
+    // decompose the flat id into output column j, row i, channel k and batch b
+    int j = id % w;
+    id /= w;
+    int i = id % h;
+    id /= h;
+    int k = id % c;
+    id /= c;
+    int b = id;
+
+    int w_offset = -pad / 2;
+    int h_offset = -pad / 2;
+
+    int out_index = j + w*(i + h*(k + c*b));
+    float max = -INFINITY;
+    int max_i = -1;   // stays -1 if no tap compares greater than -INFINITY
+    int l, m;
+    for(l = 0; l < size; ++l){
+        for(m = 0; m < size; ++m){
+            int cur_h = h_offset + i*stride_y + l;
+            int cur_w = w_offset + j*stride_x + m;
+            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
+            int valid = (cur_h >= 0 && cur_h < in_h &&
+                    cur_w >= 0 && cur_w < in_w);
+            float val = (valid != 0) ? input[index] : -INFINITY;   // out-of-bounds taps can never win
+            max_i = (val > max) ? index : max_i;
+            max   = (val > max) ? val   : max;
+        }
+    }
+    output[out_index] = max;
+    if (indexes) indexes[out_index] = max_i;
+}
+
+__global__ void forward_zero_nonmax_kernel(int n, float *input, float *output)
+{
+    // Zero every output element that differs from the input element at the same flat index.
+    // NOTE(review): both buffers are indexed with the same id, which presumably expects equal sizes - confirm.
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+
+    if (tid < n && input[tid] != output[tid]) output[tid] = 0;
+}
+
+__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta, int *indexes)
+{   // Backward max pooling: one thread per INPUT element, gathering the deltas of the outputs that selected it.
+    int h = (in_h + pad - size) / stride_y + 1;   // output height
+    int w = (in_w + pad - size) / stride_x + 1;   // output width
+    int c = in_c;
+    int area_x = (size - 1) / stride_x;   // how many neighbouring output columns/rows are searched on each side
+    int area_y = (size - 1) / stride_y;
+
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return;
+    // decompose the flat id into input column j, row i, channel k and batch b
+    int index = id;
+    int j = id % in_w;
+    id /= in_w;
+    int i = id % in_h;
+    id /= in_h;
+    int k = id % in_c;
+    id /= in_c;
+    int b = id;
+
+    int w_offset = -pad / 2;
+    int h_offset = -pad / 2;
+
+    float d = 0;
+    int l, m;
+    for(l = -area_y; l < area_y+1; ++l){
+        for(m = -area_x; m < area_x+1; ++m){
+            int out_w = (j-w_offset)/stride_x + m;
+            int out_h = (i-h_offset)/stride_y + l;
+            int out_index = out_w + w*(out_h + h*(k + c*b));
+            int valid = (out_w >= 0 && out_w < w &&
+                     out_h >= 0 && out_h < h);
+            d += (valid && indexes[out_index] == index) ? delta[out_index] : 0;   // indexes is read only for in-range outputs
+        }
+    }
+    prev_delta[index] += d;   // each thread writes only its own input element
+}
+
+__global__ void backward_zero_nonmax_kernel(int n, int *indexes, float *prev_delta)
+{
+    // Clear prev_delta at every position whose recorded index is not the position itself.
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+
+    if (tid < n && indexes[tid] != tid) prev_delta[tid] = 0;
+
+}
+extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+{   // GPU forward pass: depth-wise or spatial max pooling kernel, then the optional anti-aliasing blur convolution.
+    if (layer.maxpool_depth) {
+        int h = layer.out_h;
+        int w = layer.out_w;
+        int c = 1;// layer.out_c;
+        // one work item per (batch, row, column); the kernel loops over the output channels itself
+        size_t n = h*w*c*layer.batch;
+
+        forward_maxpool_depth_layer_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(
+            n, layer.w, layer.h, layer.c, layer.out_c, layer.batch, state.input, layer.output_gpu, layer.indexes_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        return;
+    }
+    // The cuDNN path below is compiled only when CUDNN_DISABLED is defined.
+#ifdef CUDNN_DISABLED
+    if (!state.train && layer.stride == layer.size) {
+        // cudnnPoolingBackward
+        cudnnStatus_t maxpool_status;
+
+        float alpha = 1, beta = 0;
+        maxpool_status = cudnnPoolingForward(
+            cudnn_handle(),
+            layer.poolingDesc,
+            &alpha,
+            layer.srcTensorDesc,
+            state.input,
+            &beta,
+            layer.dstTensorDesc,
+            layer.output_gpu);
+
+        //maxpool_status = cudnnDestroyPoolingDescriptor(poolingDesc);
+        //cudnnDestroyTensorDescriptor(layer.srcTensorDesc);
+        //cudnnDestroyTensorDescriptor(layer.dstTensorDesc);
+
+    }
+    else
+#endif
+    {
+        int h = layer.out_h;
+        int w = layer.out_w;
+        int c = layer.out_c;
+
+        size_t n = h*w*c*layer.batch;   // one work item per output element
+
+        forward_maxpool_layer_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        if (layer.maxpool_zero_nonmax) {   // optional second pass that zeroes outputs differing from the input at the same index
+            forward_zero_nonmax_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, state.input, layer.output_gpu);
+            CHECK_CUDA(cudaPeekAtLastError());
+        }
+    }
+
+    if (layer.antialiasing) {   // blur the pooled output with layer.input_layer and copy the result back into output_gpu
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+        s.input = layer.output_gpu;
+        forward_convolutional_layer_gpu(*(layer.input_layer), s);
+        simple_copy_ongpu(layer.outputs*layer.batch, layer.output_gpu, layer.input_antialiasing_gpu);   // keep the un-blurred output in input_antialiasing_gpu
+        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.input_layer->output_gpu, layer.output_gpu);
+    }
+}
+
+// Host-side launcher for the max-pooling backward pass on the GPU.
+// Propagates layer.delta_gpu into state.delta. When antialiasing is enabled,
+// the attached convolutional layer (layer.input_layer) is back-propagated
+// first. The maxpool_depth path launches its own kernel and returns early.
+extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+{
+    if (layer.antialiasing) {
+        network_state s = { 0 };
+        s.train = state.train;
+        s.workspace = state.workspace;
+        s.net = state.net;
+        s.delta = layer.delta_gpu;  // s.delta will be returned to l.delta_gpu
+        s.input = layer.input_antialiasing_gpu;
+        //if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
+        // presumably (count, src, dst): hand this layer's delta to the
+        // convolutional layer before running its backward pass -- TODO confirm
+        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu);
+        backward_convolutional_layer_gpu(*(layer.input_layer), s);
+
+        //simple_copy_ongpu(layer.outputs*layer.batch, layer.input_antialiasing_gpu, layer.output_gpu);
+    }
+
+    if (layer.maxpool_depth) {
+        int h = layer.out_h;
+        int w = layer.out_w;
+        int c = layer.out_c;
+
+        // thread count is derived from the OUTPUT dimensions on this path
+        size_t n = h * w * c * layer.batch;
+
+        backward_maxpool_depth_layer_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(n, layer.w, layer.h, layer.c, layer.batch, layer.delta_gpu, state.delta, layer.indexes_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+        return;
+    }
+
+    // regular path: one thread per INPUT element
+    size_t n = layer.h*layer.w*layer.c*layer.batch;
+
+    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+
+    if (layer.maxpool_zero_nonmax) {
+        // clears state.delta entries whose stored index is not their own position
+        backward_zero_nonmax_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, layer.indexes_gpu, state.delta);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}
+
+
+
+
+// Local average pooling, forward pass. Each thread with id < n produces one
+// output element.
+//   n                  - total number of output elements to compute
+//   in_h, in_w, in_c   - input height, width and channel count
+//   stride_x, stride_y - horizontal / vertical stride of the pooling window
+//   size               - side length of the square pooling window
+//   pad                - total padding; the window origin is shifted by -pad/2
+//   input, output      - source and destination buffers
+// Only in-range input positions contribute to the mean (padding is excluded
+// from both the sum and the divisor).
+__global__ void forward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output)
+{
+    // output geometry derived from the input geometry
+    int h = (in_h + pad - size) / stride_y + 1;
+    int w = (in_w + pad - size) / stride_x + 1;
+    int c = in_c;
+
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= n) return;
+
+    // decompose the flat id into (b, k, i, j) = (batch, channel, row, column)
+    int j = id % w;
+    id /= w;
+    int i = id % h;
+    id /= h;
+    int k = id % c;
+    id /= c;
+    int b = id;
+
+    int w_offset = -pad / 2;
+    int h_offset = -pad / 2;
+
+    int out_index = j + w*(i + h*(k + c*b));
+    float avg = 0;
+    int counter = 0;
+    int l, m;
+    for (l = 0; l < size; ++l) {
+        for (m = 0; m < size; ++m) {
+            int cur_h = h_offset + i*stride_y + l;
+            int cur_w = w_offset + j*stride_x + m;
+            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
+            int valid = (cur_h >= 0 && cur_h < in_h &&
+                cur_w >= 0 && cur_w < in_w);
+            if (valid) {
+                counter++;
+                avg += input[index];
+            }
+        }
+    }
+    // as CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING.
+    // A window lying entirely inside the padding has counter == 0; write 0
+    // instead of dividing by zero (the backward kernel guards the same case).
+    output[out_index] = (counter > 0) ? avg / counter : 0;
+}
+
+
+// Local average pooling, backward pass. Each thread with id < n handles one
+// INPUT element and accumulates into prev_delta at that element's index.
+//   n                  - total number of input elements
+//   in_h, in_w, in_c   - input height, width and channel count
+//   stride_x, stride_y - horizontal / vertical stride of the pooling window
+//   size               - side length of the square pooling window
+//   pad                - total padding; offsets below use -pad/2
+//   delta              - gradient w.r.t. the pooled output (read)
+//   prev_delta         - gradient w.r.t. the input (accumulated with +=)
+__global__ void backward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta)
+{
+    // output geometry derived from the input geometry
+    int h = (in_h + pad - size) / stride_y + 1;
+    int w = (in_w + pad - size) / stride_x + 1;
+    int c = in_c;
+    // half-extent (in output cells) of the neighbourhood scanned per input element
+    int area_x = (size - 1) / stride_x;
+    int area_y = (size - 1) / stride_y;
+
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= n) return;
+
+    // decompose the flat id into (b, k, i, j) over the INPUT dimensions
+    int index = id;
+    int j = id % in_w;
+    id /= in_w;
+    int i = id % in_h;
+    id /= in_h;
+    int k = id % in_c;
+    id /= in_c;
+    int b = id;
+
+    int w_offset = -pad / 2;
+    int h_offset = -pad / 2;
+
+    int counter = 0;
+    float d = 0;
+    int l, m;
+    for (l = -area_y; l < area_y + 1; ++l) {
+        for (m = -area_x; m < area_x + 1; ++m) {
+            int out_w = (j - w_offset) / stride_x + m;
+            int out_h = (i - h_offset) / stride_y + l;
+            int out_index = out_w + w*(out_h + h*(k + c*b));
+            // only output cells that actually exist contribute
+            int valid = (out_w >= 0 && out_w < w && out_h >= 0 && out_h < h);
+            if (valid) {
+                counter++;
+                d += delta[out_index];
+            }
+        }
+    }
+    // the summed delta is divided by the number of in-range output cells
+    // scanned; nothing is written when none were in range
+    if(counter > 0) prev_delta[index] += d / counter;
+}
+
+
+
+// Host-side launcher for the local average-pooling forward pass: reads
+// state.input and fills layer.output_gpu. The cuDNN branch is only compiled
+// when CUDNN_DISABLED is defined.
+extern "C" void forward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
+{
+
+#ifdef CUDNN_DISABLED
+    if (!state.train && layer.stride == layer.size) {
+        // cuDNN forward pooling for inference when stride == size
+        float alpha = 1, beta = 0;
+        cudnnStatus_t maxpool_status = cudnnPoolingForward(
+            cudnn_handle(), layer.poolingDesc,
+            &alpha, layer.srcTensorDesc, state.input,
+            &beta, layer.dstTensorDesc, layer.output_gpu);
+    }
+    else
+#endif
+    {
+        const int out_h = layer.out_h;
+        const int out_w = layer.out_w;
+        const int out_c = layer.out_c;
+
+        // one thread per output element
+        const size_t total_outputs = out_h*out_w*out_c*layer.batch;
+
+        forward_local_avgpool_layer_kernel <<<cuda_gridsize(total_outputs), BLOCK, 0, get_cuda_stream() >>> (
+            total_outputs, layer.h, layer.w, layer.c,
+            layer.stride_x, layer.stride_y, layer.size, layer.pad,
+            state.input, layer.output_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}
+
+// Host-side launcher for the local average-pooling backward pass: one thread
+// per input element, accumulating layer.delta_gpu into state.delta.
+extern "C" void backward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
+{
+    const size_t input_count = layer.h*layer.w*layer.c*layer.batch;
+
+    backward_local_avgpool_layer_kernel <<<cuda_gridsize(input_count), BLOCK, 0, get_cuda_stream() >>>(
+        input_count, layer.h, layer.w, layer.c,
+        layer.stride_x, layer.stride_y, layer.size, layer.pad,
+        layer.delta_gpu, state.delta);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
diff --git a/darknet-master/src/network.c b/darknet-master/src/network.c
new file mode 100644
index 0000000..40c1cbc
--- /dev/null
+++ b/darknet-master/src/network.c
@@ -0,0 +1,1690 @@
+#include "darknet.h"
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "network.h"
+#include "image.h"
+#include "data.h"
+#include "utils.h"
+#include "blas.h"
+
+#include "crop_layer.h"
+#include "connected_layer.h"
+#include "gru_layer.h"
+#include "rnn_layer.h"
+#include "crnn_layer.h"
+#include "conv_lstm_layer.h"
+#include "local_layer.h"
+#include "convolutional_layer.h"
+#include "activation_layer.h"
+#include "detection_layer.h"
+#include "region_layer.h"
+#include "normalization_layer.h"
+#include "batchnorm_layer.h"
+#include "maxpool_layer.h"
+#include "reorg_layer.h"
+#include "reorg_old_layer.h"
+#include "avgpool_layer.h"
+#include "cost_layer.h"
+#include "softmax_layer.h"
+#include "dropout_layer.h"
+#include "route_layer.h"
+#include "shortcut_layer.h"
+#include "scale_channels_layer.h"
+#include "sam_layer.h"
+#include "yolo_layer.h"
+#include "gaussian_yolo_layer.h"
+#include "upsample_layer.h"
+#include "parser.h"
+
+// Builds a load_args value pre-filled from the network's input size and its
+// data-augmentation settings; every other field stays zero-initialised.
+load_args get_base_args(network *net)
+{
+    load_args base = { 0 };
+
+    // input geometry (size mirrors the network width)
+    base.w = net->w;
+    base.h = net->h;
+    base.size = net->w;
+
+    // cropping settings
+    base.min = net->min_crop;
+    base.max = net->max_crop;
+    base.center = net->center;
+
+    // geometric and colour augmentation settings
+    base.angle = net->angle;
+    base.aspect = net->aspect;
+    base.exposure = net->exposure;
+    base.saturation = net->saturation;
+    base.hue = net->hue;
+
+    return base;
+}
+
+// Returns the value stored behind net.cur_iteration, widened to int64_t.
+int64_t get_current_iteration(network net)
+{
+    const int64_t iteration = *net.cur_iteration;
+    return iteration;
+}
+
+// Number of full (batch * subdivisions) groups contained in the "seen" counter.
+int get_current_batch(network net)
+{
+    return (*net.seen)/(net.batch*net.subdivisions);
+}
+
+/*
+void reset_momentum(network net)
+{
+    if (net.momentum == 0) return;
+    net.learning_rate = 0;
+    net.momentum = 0;
+    net.decay = 0;
+    #ifdef GPU
+        //if(net.gpu_index >= 0) update_network_gpu(net);
+    #endif
+}
+*/
+
+// GPU builds only: for every layer that owns a state_gpu and/or h_gpu buffer,
+// zero-fills the l.outputs-long slice starting at offset l.outputs*b.
+// Without GPU defined the loop body is empty.
+void reset_network_state(network *net, int b)
+{
+    int idx;
+    for (idx = 0; idx < net->n; ++idx) {
+#ifdef GPU
+        const layer cur = net->layers[idx];
+        const int offset = cur.outputs*b;
+
+        if (cur.state_gpu) {
+            fill_ongpu(cur.outputs, 0, cur.state_gpu + offset, 1);
+        }
+        if (cur.h_gpu) {
+            fill_ongpu(cur.outputs, 0, cur.h_gpu + offset, 1);
+        }
+#endif
+    }
+}
+
+// Convenience wrapper: resets the recurrent state of every layer for batch
+// slot 0 by delegating to reset_network_state(net, 0).
+void reset_rnn(network *net)
+{
+    reset_network_state(net, 0);
+}
+
+// Starts from net.init_sequential_subdivisions, multiplies by seq_scales[i] for
+// every step threshold that is not greater than the current batch number, then
+// clamps the result to the range [1, net.subdivisions].
+float get_current_seq_subdivisions(network net)
+{
+    int subdivs = net.init_sequential_subdivisions;
+
+    if (net.num_steps > 0)
+    {
+        const int current_batch = get_current_batch(net);
+        int step;
+        // stop at the first threshold that lies beyond the current batch
+        for (step = 0; step < net.num_steps && !(net.steps[step] > current_batch); ++step) {
+            subdivs *= net.seq_scales[step];
+        }
+    }
+
+    // lower bound first, then upper bound (same order as the clamps were applied)
+    if (subdivs < 1) {
+        subdivs = 1;
+    }
+    if (subdivs > net.subdivisions) {
+        subdivs = net.subdivisions;
+    }
+    return subdivs;
+}
+
+// Returns subdivisions / sequential_subdivisions (integer division), or 1 when
+// sequential_subdivisions is zero; the result is never below 1.
+int get_sequence_value(network net)
+{
+    const int ratio = (net.sequential_subdivisions != 0)
+        ? net.subdivisions / net.sequential_subdivisions
+        : 1;
+
+    return (ratio < 1) ? 1 : ratio;
+}
+
+// Computes the learning rate for the current batch number according to
+// net.policy. While batch_num is below net.burn_in the burn-in ramp takes
+// precedence over every policy. An unrecognised policy logs a message to
+// stderr and falls back to the base learning rate.
+float get_current_rate(network net)
+{
+    int batch_num = get_current_batch(net);
+    int i;
+    float rate;
+    // burn-in ramp: learning_rate * (batch_num / burn_in) ^ power
+    if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
+    switch (net.policy) {
+        case CONSTANT:
+            return net.learning_rate;
+        case STEP:
+            // NOTE(review): batch_num/net.step is an integer division if
+            // net.step is an integer type -- confirm against the struct
+            return net.learning_rate * pow(net.scale, batch_num/net.step);
+        case STEPS:
+            // multiply by scales[i] for every step threshold already reached
+            rate = net.learning_rate;
+            for(i = 0; i < net.num_steps; ++i){
+                if(net.steps[i] > batch_num) return rate;
+                rate *= net.scales[i];
+                //if(net.steps[i] > batch_num - 1 && net.scales[i] > 1) reset_momentum(net);
+            }
+            return rate;
+        case EXP:
+            return net.learning_rate * pow(net.gamma, batch_num);
+        case POLY:
+            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+            //if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
+            //return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+        case RANDOM:
+            return net.learning_rate * pow(rand_uniform(0,1), net.power);
+        case SIG:
+            return net.learning_rate * (1./(1.+exp(net.gamma*(batch_num - net.step))));
+        case SGDR:
+        {
+            // locate the start and length of the cycle containing batch_num;
+            // each completed cycle scales the next length by batches_cycle_mult
+            int last_iteration_start = 0;
+            int cycle_size = net.batches_per_cycle;
+            while ((last_iteration_start + cycle_size) < batch_num)
+            {
+                last_iteration_start += cycle_size;
+                cycle_size *= net.batches_cycle_mult;
+            }
+            // cosine interpolation between learning_rate and learning_rate_min
+            rate = net.learning_rate_min +
+                0.5*(net.learning_rate - net.learning_rate_min)
+                * (1. + cos((float)(batch_num - last_iteration_start)*3.14159265 / cycle_size));
+
+            return rate;
+        }
+        default:
+            fprintf(stderr, "Policy is weird!\n");
+            return net.learning_rate;
+    }
+}
+
+// Maps a LAYER_TYPE value to its lowercase display name.
+// Types without an entry below yield "none".
+char *get_layer_string(LAYER_TYPE a)
+{
+    switch (a) {
+        case CONVOLUTIONAL:   return "convolutional";
+        case ACTIVE:          return "activation";
+        case LOCAL:           return "local";
+        case DECONVOLUTIONAL: return "deconvolutional";
+        case CONNECTED:       return "connected";
+        case RNN:             return "rnn";
+        case GRU:             return "gru";
+        case LSTM:            return "lstm";
+        case CRNN:            return "crnn";
+        case MAXPOOL:         return "maxpool";
+        case REORG:           return "reorg";
+        case AVGPOOL:         return "avgpool";
+        case SOFTMAX:         return "softmax";
+        case DETECTION:       return "detection";
+        case REGION:          return "region";
+        case YOLO:            return "yolo";
+        case GAUSSIAN_YOLO:   return "Gaussian_yolo";
+        case DROPOUT:         return "dropout";
+        case CROP:            return "crop";
+        case COST:            return "cost";
+        case ROUTE:           return "route";
+        case SHORTCUT:        return "shortcut";
+        case SCALE_CHANNELS:  return "scale_channels";
+        case SAM:             return "sam";
+        case NORMALIZATION:   return "normalization";
+        case BATCHNORM:       return "batchnorm";
+        default:              return "none";
+    }
+}
+
+// Creates a network value with room for n layers. The layer array and every
+// counter/statistic that is kept behind a pointer are allocated with xcalloc;
+// GPU builds additionally allocate the holders for device buffers and sizes.
+network make_network(int n)
+{
+    network net = {0};
+
+    net.n = n;
+    net.layers = (layer*)xcalloc(n, sizeof(layer));
+
+    // counters and flags stored behind pointers
+    net.seen = (uint64_t*)xcalloc(1, sizeof(uint64_t));
+    net.cuda_graph_ready = (int*)xcalloc(1, sizeof(int));
+
+    // float statistics stored behind pointers
+    net.badlabels_reject_threshold = (float*)xcalloc(1, sizeof(float));
+    net.delta_rolling_max = (float*)xcalloc(1, sizeof(float));
+    net.delta_rolling_avg = (float*)xcalloc(1, sizeof(float));
+    net.delta_rolling_std = (float*)xcalloc(1, sizeof(float));
+
+    // iteration and bounding-box counters, explicitly reset to zero
+    net.cur_iteration = (int*)xcalloc(1, sizeof(int));
+    net.total_bbox = (int*)xcalloc(1, sizeof(int));
+    net.rewritten_bbox = (int*)xcalloc(1, sizeof(int));
+    *net.total_bbox = 0;
+    *net.rewritten_bbox = 0;
+
+#ifdef GPU
+    // holders for device input / truth buffers
+    net.input_gpu = (float**)xcalloc(1, sizeof(float*));
+    net.truth_gpu = (float**)xcalloc(1, sizeof(float*));
+
+    // holders for the "16" buffers and their recorded maximum sizes
+    net.input16_gpu = (float**)xcalloc(1, sizeof(float*));
+    net.output16_gpu = (float**)xcalloc(1, sizeof(float*));
+    net.max_input16_size = (size_t*)xcalloc(1, sizeof(size_t));
+    net.max_output16_size = (size_t*)xcalloc(1, sizeof(size_t));
+#endif
+    return net;
+}
+
+// CPU forward pass: runs every layer in order, feeding each layer's output to
+// the next one through state.input. When training, a trainable layer's delta
+// buffer is scaled by 0 before its forward call.
+void forward_network(network net, network_state state)
+{
+    int idx = 0;
+
+    state.workspace = net.workspace;
+    while (idx < net.n) {
+        layer cur = net.layers[idx];
+        const int clear_delta = cur.delta && state.train && cur.train;
+
+        state.index = idx;
+        if (clear_delta) {
+            scal_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
+        }
+
+        cur.forward(cur, state);
+
+        // the next layer consumes this layer's output
+        state.input = cur.output;
+        ++idx;
+    }
+}
+
+// Calls the update callback of every trainable layer that has one, using the
+// current learning rate and an effective batch size of batch * subdivisions.
+void update_network(network net)
+{
+    const int effective_batch = net.batch*net.subdivisions;
+    const float lr = get_current_rate(net);
+    int idx;
+
+    for (idx = 0; idx < net.n; ++idx) {
+        layer cur = net.layers[idx];
+        if (cur.train != 0 && cur.update) {
+            cur.update(cur, effective_batch, lr, net.momentum, net.decay);
+        }
+    }
+}
+
+// Returns the output buffer of the last layer that is not a COST layer
+// (falling back to layer 0). GPU builds with an active device delegate to
+// get_network_output_gpu.
+float *get_network_output(network net)
+{
+#ifdef GPU
+    if (gpu_index >= 0) return get_network_output_gpu(net);
+#endif
+    int last = net.n - 1;
+    while (last > 0 && net.layers[last].type == COST) {
+        --last;
+    }
+    return net.layers[last].output;
+}
+
+// Returns the mean of cost[0] over every layer that owns a cost buffer.
+// When no layer has a cost buffer the result is 0 (previously this evaluated
+// 0/0).
+float get_network_cost(network net)
+{
+    int i;
+    float sum = 0;
+    int count = 0;
+    for(i = 0; i < net.n; ++i){
+        if(net.layers[i].cost){
+            sum += net.layers[i].cost[0];
+            ++count;
+        }
+    }
+    // guard the division: a network without any cost layer has no cost to average
+    if (count == 0) return 0;
+    return sum/count;
+}
+
+// Index of the largest value in the network's output vector.
+int get_predicted_class_network(network net)
+{
+    float *scores = get_network_output(net);
+    const int score_count = get_network_output_size(net);
+
+    return max_index(scores, score_count);
+}
+
+// CPU backward pass: walks the layers from last to first. For layer i the
+// state's input/delta point at layer i-1's output/delta; layer 0 receives the
+// input/delta the caller passed in. A layer flagged stopbackward ends the walk,
+// a layer flagged onlyforward is skipped.
+void backward_network(network net, network_state state)
+{
+    float *first_input = state.input;
+    float *first_delta = state.delta;
+    int idx;
+
+    state.workspace = net.workspace;
+    for (idx = net.n - 1; idx >= 0; --idx) {
+        layer cur = net.layers[idx];
+
+        state.index = idx;
+        state.input = (idx == 0) ? first_input : net.layers[idx - 1].output;
+        state.delta = (idx == 0) ? first_delta : net.layers[idx - 1].delta;
+
+        if (cur.stopbackward) break;
+        if (!cur.onlyforward) {
+            cur.backward(cur, state);
+        }
+    }
+}
+
+// Runs one forward + backward pass on the batch (x, y) and returns the network
+// cost. Advances the "seen" counter by net.batch. GPU builds with an active
+// device delegate to train_network_datum_gpu. Weights are not updated here.
+float train_network_datum(network net, float *x, float *y)
+{
+#ifdef GPU
+    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
+#endif
+    *net.seen += net.batch;
+
+    network_state state = {0};
+    state.net = net;
+    state.index = 0;
+    state.train = 1;
+    state.input = x;
+    state.truth = y;
+    state.delta = 0;
+
+    forward_network(net, state);
+    backward_network(net, state);
+
+    const float cost = get_network_cost(net);
+    //if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
+
+    // report how many boxes were rewritten, when any boxes were counted
+    const int total = *(state.net.total_bbox);
+    if (total > 0) {
+        fprintf(stderr, " total_bbox = %d, rewritten_bbox = %f %% \n", total, 100 * (float)*(state.net.rewritten_bbox) / total);
+    }
+    return cost;
+}
+
+// Trains on n randomly drawn batches from d and returns the summed cost divided
+// by (n * batch).
+float train_network_sgd(network net, data d, int n)
+{
+    const int batch = net.batch;
+    float *batch_x = (float*)xcalloc(batch * d.X.cols, sizeof(float));
+    float *batch_y = (float*)xcalloc(batch * d.y.cols, sizeof(float));
+    float total = 0;
+    int step;
+
+    for (step = 0; step < n; ++step) {
+        get_random_batch(d, batch, batch_x, batch_y);
+        net.current_subdivision = step;
+        total += train_network_datum(net, batch_x, batch_y);
+    }
+
+    free(batch_x);
+    free(batch_y);
+    return (float)total/(n*batch);
+}
+
+// Trains on the data set d; equivalent to train_network_waitkey with
+// wait_key = 0.
+float train_network(network net, data d)
+{
+    return train_network_waitkey(net, d, 0);
+}
+
+// Trains on every row of d in consecutive batches of net.batch rows, then
+// performs one weight update followed by the optional EMA and weight-rejection
+// steps.
+//   net      - network to train (counters behind pointers are shared with the caller)
+//   d        - training data; d.X.rows must be a multiple of net.batch (asserted)
+//   wait_key - when non-zero, wait_key_cv(5) is called after each batch
+// Returns the summed batch cost divided by (number of batches * batch size).
+float train_network_waitkey(network net, data d, int wait_key)
+{
+    assert(d.X.rows % net.batch == 0);
+    int batch = net.batch;
+    int n = d.X.rows / batch;
+    float* X = (float*)xcalloc(batch * d.X.cols, sizeof(float));
+    float* y = (float*)xcalloc(batch * d.y.cols, sizeof(float));
+
+    int i;
+    float sum = 0;
+    for(i = 0; i < n; ++i){
+        get_next_batch(d, batch, i*batch, X, y);
+        net.current_subdivision = i;
+        float err = train_network_datum(net, X, y);
+        sum += err;
+        if(wait_key) wait_key_cv(5);
+    }
+    (*net.cur_iteration) += 1;
+#ifdef GPU
+    update_network_gpu(net);
+#else   // GPU
+    update_network(net);
+#endif  // GPU
+
+    // EMA handling starts once half of max_batches has been reached
+    int ema_start_point = net.max_batches / 2;
+
+    if (net.ema_alpha && (*net.cur_iteration) >= ema_start_point)
+    {
+        // the averaged weights are applied 1000 iterations before the end
+        int ema_apply_point = net.max_batches - 1000;
+
+        if (!is_ema_initialized(net))
+        {
+            ema_update(net, 0); // init EMA
+            printf(" EMA initialization \n");
+        }
+
+        if ((*net.cur_iteration) == ema_apply_point)
+        {
+            ema_apply(net); // apply EMA (BN rolling mean/var recalculation is required)
+            printf(" ema_apply() \n");
+        }
+        else
+        if ((*net.cur_iteration) < ema_apply_point)
+        {
+            // updated on every iteration before the apply point
+            ema_update(net, net.ema_alpha); // update EMA
+            printf(" ema_update(), ema_alpha = %f \n", net.ema_alpha);
+        }
+    }
+
+
+    // similar-weight rejection only runs during the first 3/4 of training
+    int reject_stop_point = net.max_batches*3/4;
+
+    if ((*net.cur_iteration) < reject_stop_point &&
+        net.weights_reject_freq &&
+        (*net.cur_iteration) % net.weights_reject_freq == 0)
+    {
+        float sim_threshold = 0.4;
+        reject_similar_weights(net, sim_threshold);
+    }
+
+
+    free(X);
+    free(y);
+    return (float)sum/(n*batch);
+}
+
+
+// Runs n update rounds on the CPU path. Each round draws two random rows of d,
+// does a forward and backward pass on each, then updates the weights once.
+// Returns the summed cost divided by (n * 2).
+float train_network_batch(network net, data d, int n)
+{
+    const int batch = 2;
+    float total = 0;
+    int round, sample;
+
+    network_state state = {0};
+    state.net = net;
+    state.index = 0;
+    state.delta = 0;
+    state.train = 1;
+
+    for (round = 0; round < n; ++round) {
+        for (sample = 0; sample < batch; ++sample) {
+            const int row = random_gen()%d.X.rows;
+            state.input = d.X.vals[row];
+            state.truth = d.y.vals[row];
+            forward_network(net, state);
+            backward_network(net, state);
+            total += get_network_cost(net);
+        }
+        update_network(net);
+    }
+    return (float)total/(n*batch);
+}
+
+// Recomputes workspace_size for every CONVOLUTIONAL and CONNECTED layer, then
+// releases the shared workspace and reallocates it at the largest per-layer
+// size (on the device when a GPU is active, on the host otherwise).
+// Always returns 0.
+int recalculate_workspace_size(network *net)
+{
+#ifdef GPU
+    cuda_set_device(net->gpu_index);
+    if (gpu_index >= 0) cuda_free(net->workspace);
+#endif
+    size_t largest = 0;
+    int idx;
+    for (idx = 0; idx < net->n; ++idx) {
+        layer *cur = &net->layers[idx];
+        switch (cur->type) {
+            case CONVOLUTIONAL:
+                cur->workspace_size = get_convolutional_workspace_size(*cur);
+                break;
+            case CONNECTED:
+                cur->workspace_size = get_connected_workspace_size(*cur);
+                break;
+            default:
+                break;
+        }
+        if (cur->workspace_size > largest) {
+            largest = cur->workspace_size;
+        }
+    }
+
+#ifdef GPU
+    if (gpu_index < 0) {
+        // GPU build running without a device: host allocation
+        free(net->workspace);
+        net->workspace = (float*)xcalloc(1, largest);
+    }
+    else {
+        printf("\n try to allocate additional workspace_size = %1.2f MB \n", (float)largest / 1000000);
+        net->workspace = cuda_make_array(0, largest / sizeof(float) + 1);
+        printf(" CUDA allocate done! \n");
+    }
+#else
+    free(net->workspace);
+    net->workspace = (float*)xcalloc(1, largest);
+#endif
+    return 0;
+}
+
+// Sets the batch size on the network and on every layer, refreshes the cuDNN
+// setup of convolutional and maxpool layers (CUDNN builds), then recomputes
+// the shared workspace.
+void set_batch_network(network *net, int b)
+{
+    int idx;
+
+    net->batch = b;
+    for (idx = 0; idx < net->n; ++idx) {
+        layer *cur = net->layers + idx;
+        cur->batch = b;
+
+#ifdef CUDNN
+        if (cur->type == CONVOLUTIONAL) {
+            cudnn_convolutional_setup(cur, cudnn_fastest, 0);
+        }
+        else if (cur->type == MAXPOOL) {
+            cudnn_maxpool_setup(cur);
+        }
+#endif
+    }
+
+    // layer buffers depend on the batch size, so the workspace must be resized
+    recalculate_workspace_size(net);
+}
+
+// Resizes the network to a new input width/height.
+// Frees the size-dependent buffers, calls the matching resize_* function on
+// every layer in order (each layer's out_w/out_h becomes the next layer's
+// w/h), then reallocates the shared workspace and, in GPU builds, the input
+// buffers. An unsupported layer type is reported through error().
+// Always returns 0.
+int resize_network(network *net, int w, int h)
+{
+#ifdef GPU
+    cuda_set_device(net->gpu_index);
+    if(gpu_index >= 0){
+        // release device/pinned buffers whose size depends on the input size
+        cuda_free(net->workspace);
+        if (net->input_gpu) {
+            cuda_free(*net->input_gpu);
+            *net->input_gpu = 0;
+            cuda_free(*net->truth_gpu);
+            *net->truth_gpu = 0;
+        }
+
+        if (net->input_state_gpu) cuda_free(net->input_state_gpu);
+        if (net->input_pinned_cpu) {
+            // the flag records whether the buffer came from cudaHostAlloc or the heap
+            if (net->input_pinned_cpu_flag) cudaFreeHost(net->input_pinned_cpu);
+            else free(net->input_pinned_cpu);
+        }
+    }
+#endif
+    int i;
+    //if(w == net->w && h == net->h) return 0;
+    net->w = w;
+    net->h = h;
+    int inputs = 0;   // output count of the previous layer
+    size_t workspace_size = 0;   // largest workspace requested by any layer
+    //fprintf(stderr, "Resizing to %d x %d...\n", w, h);
+    //fflush(stderr);
+    for (i = 0; i < net->n; ++i){
+        // work on a copy; it is written back to net->layers[i] after resizing
+        layer l = net->layers[i];
+        //printf(" (resize %d: layer = %d) , ", i, l.type);
+        if(l.type == CONVOLUTIONAL){
+            resize_convolutional_layer(&l, w, h);
+        }
+        else if (l.type == CRNN) {
+            resize_crnn_layer(&l, w, h);
+        }else if (l.type == CONV_LSTM) {
+            resize_conv_lstm_layer(&l, w, h);
+        }else if(l.type == CROP){
+            resize_crop_layer(&l, w, h);
+        }else if(l.type == MAXPOOL){
+            resize_maxpool_layer(&l, w, h);
+        }else if (l.type == LOCAL_AVGPOOL) {
+            // local average pooling shares the maxpool resize routine
+            resize_maxpool_layer(&l, w, h);
+        }else if (l.type == BATCHNORM) {
+            resize_batchnorm_layer(&l, w, h);
+        }else if(l.type == REGION){
+            resize_region_layer(&l, w, h);
+        }else if (l.type == YOLO) {
+            resize_yolo_layer(&l, w, h);
+        }else if (l.type == GAUSSIAN_YOLO) {
+            resize_gaussian_yolo_layer(&l, w, h);
+        }else if(l.type == ROUTE){
+            resize_route_layer(&l, net);
+        }else if (l.type == SHORTCUT) {
+            resize_shortcut_layer(&l, w, h, net);
+        }else if (l.type == SCALE_CHANNELS) {
+            resize_scale_channels_layer(&l, net);
+        }else if (l.type == SAM) {
+            resize_sam_layer(&l, w, h);
+        }else if (l.type == DROPOUT) {
+            // dropout keeps the spatial size and aliases the previous layer's
+            // output/delta buffers.
+            // NOTE(review): indexes net->layers[i - 1]; a dropout layer at
+            // i == 0 would read out of range -- confirm the parser forbids it
+            resize_dropout_layer(&l, inputs);
+            l.out_w = l.w = w;
+            l.out_h = l.h = h;
+            l.output = net->layers[i - 1].output;
+            l.delta = net->layers[i - 1].delta;
+#ifdef GPU
+            l.output_gpu = net->layers[i-1].output_gpu;
+            l.delta_gpu = net->layers[i-1].delta_gpu;
+#endif
+        }else if (l.type == UPSAMPLE) {
+            resize_upsample_layer(&l, w, h);
+        }else if(l.type == REORG){
+            resize_reorg_layer(&l, w, h);
+        } else if (l.type == REORG_OLD) {
+            resize_reorg_old_layer(&l, w, h);
+        }else if(l.type == AVGPOOL){
+            resize_avgpool_layer(&l, w, h);
+        }else if(l.type == NORMALIZATION){
+            resize_normalization_layer(&l, w, h);
+        }else if(l.type == COST){
+            resize_cost_layer(&l, inputs);
+        }else{
+            fprintf(stderr, "Resizing type %d \n", (int)l.type);
+            error("Cannot resize this type of layer", DARKNET_LOC);
+        }
+        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
+        inputs = l.outputs;
+        net->layers[i] = l;
+        //if(l.type != DROPOUT)
+        {
+            // the next layer is resized to this layer's output size
+            w = l.out_w;
+            h = l.out_h;
+        }
+        //if(l.type == AVGPOOL) break;
+    }
+#ifdef GPU
+    // number of floats in one input batch
+    const int size = get_network_input_size(*net) * net->batch;
+    if(gpu_index >= 0){
+        printf(" try to allocate additional workspace_size = %1.2f MB \n", (float)workspace_size / 1000000);
+        net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
+        net->input_state_gpu = cuda_make_array(0, size);
+        // prefer pinned host memory; fall back to an ordinary heap buffer.
+        // NOTE(review): cudaHostAlloc is given cudaHostRegisterMapped here;
+        // the cudaHostAlloc* flag family is the documented one -- confirm
+        if (cudaSuccess == cudaHostAlloc(&net->input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped))
+            net->input_pinned_cpu_flag = 1;
+        else {
+            cudaGetLastError(); // reset CUDA-error
+            net->input_pinned_cpu = (float*)xcalloc(size, sizeof(float));
+            net->input_pinned_cpu_flag = 0;
+        }
+        printf(" CUDA allocate done! \n");
+    }else {
+        free(net->workspace);
+        net->workspace = (float*)xcalloc(1, workspace_size);
+        if(!net->input_pinned_cpu_flag)
+            net->input_pinned_cpu = (float*)xrealloc(net->input_pinned_cpu, size * sizeof(float));
+    }
+#else
+    free(net->workspace);
+    net->workspace = (float*)xcalloc(1, workspace_size);
+#endif
+    //fprintf(stderr, " Done!\n");
+    return 0;
+}
+
+// Output count of the last layer that is not a COST layer (falls back to
+// layer 0).
+int get_network_output_size(network net)
+{
+    int last = net.n - 1;
+    while (last > 0 && net.layers[last].type == COST) {
+        --last;
+    }
+    return net.layers[last].outputs;
+}
+
+// Number of inputs expected by the network, taken from the first layer's
+// `inputs` field.
+int get_network_input_size(network net)
+{
+    return net.layers[0].inputs;
+}
+
+// Returns a copy of the first DETECTION layer. If the network has none, logs a
+// message to stderr and returns a zero-initialised layer.
+detection_layer get_network_detection_layer(network net)
+{
+    int idx = 0;
+    while (idx < net.n) {
+        if (net.layers[idx].type == DETECTION) {
+            return net.layers[idx];
+        }
+        ++idx;
+    }
+
+    fprintf(stderr, "Detection layer not found!!\n");
+    detection_layer empty = { (LAYER_TYPE)0 };
+    return empty;
+}
+
+// Wraps the output of layer i as an image. Returns a zero-initialised image
+// when any of the layer's out_w / out_h / out_c is zero.
+image get_network_image_layer(network net, int i)
+{
+    layer src = net.layers[i];
+
+    if (!(src.out_w && src.out_h && src.out_c)) {
+        image empty = {0};
+        return empty;
+    }
+    return float_to_image(src.out_w, src.out_h, src.out_c, src.output);
+}
+
+// Address of the i-th layer inside net->layers (no bounds check).
+layer* get_network_layer(network* net, int i)
+{
+    return &net->layers[i];
+}
+
+// Scans the layers from last to first and returns the first layer image whose
+// height is non-zero; returns a zero-initialised image if there is none.
+image get_network_image(network net)
+{
+    int idx = net.n - 1;
+    while (idx >= 0) {
+        image candidate = get_network_image_layer(net, idx);
+        if (candidate.h != 0) {
+            return candidate;
+        }
+        --idx;
+    }
+
+    image empty = {0};
+    return empty;
+}
+
+// Visualizes every convolutional layer under the title "Layer <index>". The
+// image pointer returned for one layer is passed on to the next call.
+void visualize_network(network net)
+{
+    char title[256];
+    image *previous = 0;
+    int idx;
+
+    for (idx = 0; idx < net.n; ++idx) {
+        layer cur = net.layers[idx];
+        sprintf(title, "Layer %d", idx);
+        if (cur.type != CONVOLUTIONAL) continue;
+        previous = visualize_convolutional_layer(cur, title, previous);
+    }
+}
+
+// Passes the network output and its length to top_k together with k and the
+// caller-provided index buffer.
+void top_predictions(network net, int k, int *index)
+{
+    const int output_count = get_network_output_size(net);
+    float *scores = get_network_output(net);
+
+    top_k(scores, output_count, k, index);
+}
+
+// Pointer-taking variant of network_predict: dereferences net and forwards to
+// network_predict(). It exists so that the Python binding, which works with a
+// pointer to the network struct, can call prediction properly.
+// Returns whatever network_predict() returns for the given input.
+float *network_predict_ptr(network *net, float *input)
+{
+    return network_predict(*net, input);
+}
+
+// Inference entry point: runs a forward pass on `input` with training disabled
+// and returns the network output buffer. GPU builds with an active device
+// delegate to network_predict_gpu.
+float *network_predict(network net, float *input)
+{
+#ifdef GPU
+    if(gpu_index >= 0)  return network_predict_gpu(net, input);
+#endif
+
+    network_state state = {0};
+    state.net = net;
+    state.input = input;
+    state.index = 0;
+    state.train = 0;   // inference only
+    state.truth = 0;
+    state.delta = 0;
+
+    forward_network(net, state);
+    return get_network_output(net);
+}
+
+#ifdef CUDA_OPENGL_INTEGRATION
+// Predicts from an OpenGL texture. Forces the batch size to 1 first; returns
+// NULL when no GPU is active.
+float *network_predict_gl_texture(network *net, uint32_t texture_id)
+{
+    if (net->batch != 1) set_batch_network(net, 1);
+
+    if (gpu_index < 0) {
+        return NULL;
+    }
+    return network_predict_gpu_gl_texture(*net, texture_id);
+}
+#endif // CUDA_OPENGL_INTEGRATION
+
+// Total number of detections over all output layers: YOLO and GAUSSIAN_YOLO
+// layers are counted by their thresholded helpers, DETECTION and REGION layers
+// contribute w*h*n each.
+int num_detections(network *net, float thresh)
+{
+    int total = 0;
+    int idx;
+
+    for (idx = 0; idx < net->n; ++idx) {
+        layer cur = net->layers[idx];
+        switch (cur.type) {
+            case YOLO:
+                total += yolo_num_detections(cur, thresh);
+                break;
+            case GAUSSIAN_YOLO:
+                total += gaussian_yolo_num_detections(cur, thresh);
+                break;
+            case DETECTION:
+            case REGION:
+                total += cur.w*cur.h*cur.n;
+                break;
+            default:
+                break;
+        }
+    }
+    return total;
+}
+
+// Per-batch-item variant of the detection count: YOLO layers are counted by
+// yolo_num_detections_batch, DETECTION and REGION layers contribute w*h*n.
+// NOTE(review): GAUSSIAN_YOLO layers are not counted here -- confirm intended.
+int num_detections_batch(network *net, float thresh, int batch)
+{
+    int total = 0;
+    int idx;
+
+    for (idx = 0; idx < net->n; ++idx) {
+        layer cur = net->layers[idx];
+        switch (cur.type) {
+            case YOLO:
+                total += yolo_num_detections_batch(cur, thresh, batch);
+                break;
+            case DETECTION:
+            case REGION:
+                total += cur.w*cur.h*cur.n;
+                break;
+            default:
+                break;
+        }
+    }
+    return total;
+}
+
+// Allocates one detection record per detection reported by num_detections().
+// Per-record buffer sizes come from the first YOLO / GAUSSIAN_YOLO / DETECTION
+// / REGION layer, or from the last layer when none is found.
+//   num - optional; receives the number of records when non-NULL
+// Returns the zero-initialised array of records.
+detection *make_network_boxes(network *net, float thresh, int *num)
+{
+    int i;
+
+    // pick the reference layer
+    int ref_index = net->n - 1;
+    for (i = 0; i < net->n; ++i) {
+        const LAYER_TYPE t = net->layers[i].type;
+        if (t == YOLO || t == GAUSSIAN_YOLO || t == DETECTION || t == REGION) {
+            ref_index = i;
+            break;
+        }
+    }
+    layer ref = net->layers[ref_index];
+
+    const int box_count = num_detections(net, thresh);
+    if (num) *num = box_count;
+
+    detection *boxes = (detection*)xcalloc(box_count, sizeof(detection));
+    for (i = 0; i < box_count; ++i) {
+        detection *box = &boxes[i];
+
+        box->prob = (float*)xcalloc(ref.classes, sizeof(float));
+
+        // tx,ty,tw,th uncertainty, only for Gaussian_YOLOv3
+        box->uc = (ref.type == GAUSSIAN_YOLO) ? (float*)xcalloc(4, sizeof(float)) : NULL;
+
+        // extra coordinates beyond the first four are stored as a mask
+        box->mask = (ref.coords > 4) ? (float*)xcalloc(ref.coords - 4, sizeof(float)) : NULL;
+
+        box->embeddings = ref.embedding_output ? (float*)xcalloc(ref.embedding_size, sizeof(float)) : NULL;
+        box->embedding_size = ref.embedding_size;
+    }
+    return boxes;
+}
+
+// Allocates a zero-initialised array of detections for image index `batch`,
+// sized by num_detections_batch(). Per-detection buffer sizes are taken from
+// the first YOLO / GAUSSIAN_YOLO / DETECTION / REGION layer, or from the last
+// layer when no such layer exists.
+//   num - must be non-NULL (asserted); receives the array length.
+// Release the result with free_detections().
+detection *make_network_boxes_batch(network *net, float thresh, int *num, int batch)
+{
+    int i;
+    layer l = net->layers[net->n - 1];
+    for (i = 0; i < net->n; ++i) {
+        layer l_tmp = net->layers[i];
+        if (l_tmp.type == YOLO || l_tmp.type == GAUSSIAN_YOLO || l_tmp.type == DETECTION || l_tmp.type == REGION) {
+            l = l_tmp;
+            break;
+        }
+    }
+
+    int nboxes = num_detections_batch(net, thresh, batch);
+    assert(num != NULL);
+    *num = nboxes;
+
+    // Use xcalloc, like make_network_boxes() and the other allocations below,
+    // rather than raw calloc whose result was dereferenced without a check.
+    // NOTE(review): assumes xcalloc never returns NULL - confirm in utils.
+    detection* dets = (detection*)xcalloc(nboxes, sizeof(detection));
+    for (i = 0; i < nboxes; ++i) {
+        dets[i].prob = (float*)xcalloc(l.classes, sizeof(float));
+        // tx,ty,tw,th uncertainty
+        if (l.type == GAUSSIAN_YOLO) dets[i].uc = (float*)xcalloc(4, sizeof(float)); // Gaussian_YOLOv3
+        else dets[i].uc = NULL;
+
+        // Coordinates beyond the first four are stored as a mask.
+        if (l.coords > 4) dets[i].mask = (float*)xcalloc(l.coords - 4, sizeof(float));
+        else dets[i].mask = NULL;
+
+        if (l.embedding_output) dets[i].embeddings = (float*)xcalloc(l.embedding_size, sizeof(float));
+        else dets[i].embeddings = NULL;
+        dets[i].embedding_size = l.embedding_size;
+    }
+    return dets;
+}
+
+// Fills the first l.w*l.h*l.n entries of `dets` from REGION layer `l`:
+// boxes and class probabilities come from get_region_boxes(), objectness is
+// set to 1, best_class_idx is the class with the highest probability above 0
+// (or -1), and the boxes are finally adjusted by correct_yolo_boxes().
+// `hier` is accepted for signature compatibility but not used here.
+void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int *map, float hier, int relative, detection *dets, int letter)
+{
+    const int cells = l.w * l.h * l.n;
+    box *boxes = (box*)xcalloc(cells, sizeof(box));
+    float **probs = (float**)xcalloc(cells, sizeof(float*));
+    int b, c;
+    for (b = 0; b < cells; ++b) {
+        probs[b] = (float*)xcalloc(l.classes, sizeof(float));
+    }
+
+    get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map);
+
+    for (b = 0; b < cells; ++b) {
+        detection *d = &dets[b];
+        float best = 0;
+
+        d->classes = l.classes;
+        d->bbox = boxes[b];
+        d->objectness = 1;
+        d->best_class_idx = -1;
+
+        for (c = 0; c < l.classes; ++c) {
+            const float p = probs[b][c];
+            if (p > best) {
+                best = p;
+                d->best_class_idx = c;
+            }
+            d->prob[c] = p;
+        }
+    }
+
+    free(boxes);
+    free_ptrs((void **)probs, cells);
+
+    // correct_yolo_boxes() is used in place of correct_region_boxes() here.
+    correct_yolo_boxes(dets, cells, w, h, net_w, net_h, relative, letter);
+}
+
+// Walks all layers and writes their detections consecutively into `dets`
+// (as allocated by make_network_boxes()). YOLO and GAUSSIAN_YOLO layers
+// advance the cursor by the count they report; REGION and DETECTION layers
+// advance it by w*h*n. A message is printed when two YOLO layers disagree on
+// the number of classes.
+void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter)
+{
+    int yolo_classes = -1;   // class count of the first YOLO layer seen
+    int idx;
+    for (idx = 0; idx < net->n; ++idx) {
+        layer l = net->layers[idx];
+        switch (l.type) {
+        case YOLO: {
+            const int written = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
+            dets += written;
+            if (yolo_classes < 0) {
+                yolo_classes = l.classes;
+            }
+            else if (yolo_classes != l.classes) {
+                printf(" Error: Different [yolo] layers have different number of classes = %d and %d - check your cfg-file! \n",
+                    yolo_classes, l.classes);
+            }
+            break;
+        }
+        case GAUSSIAN_YOLO: {
+            const int written = get_gaussian_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
+            dets += written;
+            break;
+        }
+        case REGION:
+            custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
+            dets += l.w*l.h*l.n;
+            break;
+        case DETECTION:
+            get_detection_detections(l, w, h, thresh, dets);
+            dets += l.w*l.h*l.n;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+// Per-image counterpart of fill_network_boxes(): YOLO layers are read with
+// get_yolo_detections_batch() for image index `batch`. REGION and DETECTION
+// layers are handled exactly as in the non-batch version (the `batch` index is
+// not passed to them). GAUSSIAN_YOLO layers are not handled here.
+void fill_network_boxes_batch(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter, int batch)
+{
+    int yolo_classes = -1;   // class count of the first YOLO layer seen
+    int idx;
+    for (idx = 0; idx < net->n; ++idx) {
+        layer l = net->layers[idx];
+        switch (l.type) {
+        case YOLO: {
+            const int written = get_yolo_detections_batch(l, w, h, net->w, net->h, thresh, map, relative, dets, letter, batch);
+            dets += written;
+            if (yolo_classes < 0) {
+                yolo_classes = l.classes;
+            }
+            else if (yolo_classes != l.classes) {
+                printf(" Error: Different [yolo] layers have different number of classes = %d and %d - check your cfg-file! \n",
+                    yolo_classes, l.classes);
+            }
+            break;
+        }
+        case REGION:
+            custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
+            dets += l.w*l.h*l.n;
+            break;
+        case DETECTION:
+            get_detection_detections(l, w, h, thresh, dets);
+            dets += l.w*l.h*l.n;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+// Allocates the detection array with make_network_boxes() and populates it
+// with fill_network_boxes(). `num` (if non-NULL) receives the array length.
+// The caller owns the result and releases it with free_detections().
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter)
+{
+    detection *boxes = make_network_boxes(net, thresh, num);
+
+    fill_network_boxes(net, w, h, thresh, hier, map, relative, boxes, letter);
+
+    return boxes;
+}
+
+// Frees an array of `n` detections together with the per-detection buffers
+// (prob, uc, mask, embeddings) each entry owns.
+void free_detections(detection *dets, int n)
+{
+    int k;
+    for (k = 0; k < n; ++k) {
+        detection *d = &dets[k];
+        // free(NULL) is a no-op, so the optional buffers need no explicit check.
+        free(d->prob);
+        free(d->uc);
+        free(d->mask);
+        free(d->embeddings);
+    }
+    free(dets);
+}
+
+// Frees `n` (detections, count) pairs: each pair's detection array is released
+// with free_detections(), then the pair array itself is freed.
+void free_batch_detections(det_num_pair *det_num_pairs, int n)
+{
+    int k;
+    for (k = 0; k < n; ++k) {
+        det_num_pair *pair = &det_num_pairs[k];
+        free_detections(pair->dets, pair->num);
+    }
+    free(det_num_pairs);
+}
+
+// JSON format:
+//{
+// "frame_id":8990,
+// "filename":"...",      (only present when a filename is given)
+// "objects":[
+//  {"class_id":4, "name":"aeroplane", "relative_coordinates":{"center_x":0.398831, "center_y":0.630203, "width":0.057455, "height":0.020396}, "confidence":0.793070},
+//  {"class_id":14, "name":"bird", "relative_coordinates":{"center_x":0.398831, "center_y":0.630203, "width":0.057455, "height":0.020396}, "confidence":0.265497}
+// ]
+//}
+
+// Writes the opening part of the JSON document into `dst` (snprintf semantics:
+// at most `cap` bytes, `dst` may be NULL when `cap` is 0) and returns the
+// length of the complete text, excluding the terminating NUL.
+static int json_format_header(char *dst, size_t cap, long long int frame_id, const char *filename)
+{
+    if (filename) {
+        return snprintf(dst, cap, "{\n \"frame_id\":%lld, \n \"filename\":\"%s\", \n \"objects\": [ \n", frame_id, filename);
+    }
+    return snprintf(dst, cap, "{\n \"frame_id\":%lld, \n \"objects\": [ \n", frame_id);
+}
+
+// Writes one object entry for class `class_id` of `det` into `dst` (snprintf
+// semantics) and returns the length of the complete text, excluding the NUL.
+static int json_format_object(char *dst, size_t cap, int class_id, const char *name, const detection *det)
+{
+    return snprintf(dst, cap, "  {\"class_id\":%d, \"name\":\"%s\", \"relative_coordinates\":{\"center_x\":%f, \"center_y\":%f, \"width\":%f, \"height\":%f}, \"confidence\":%f}",
+        class_id, name, det->bbox.x, det->bbox.y, det->bbox.w, det->bbox.h, det->prob[class_id]);
+}
+
+// Serialises detections to a JSON string in the format shown above.
+//   dets, nboxes   - detections to serialise
+//   classes, names - number of classes and their names; a class whose name
+//                    starts with "dont_show" is skipped
+//   frame_id       - value of the "frame_id" field
+//   filename       - optional (may be NULL); adds a "filename" field
+// Every (detection, class) pair whose probability exceeds 0.005 produces one
+// object entry. Returns a heap-allocated NUL-terminated string that the caller
+// must free(), or 0 on allocation/formatting failure (nothing is leaked).
+// Strings are inserted verbatim: no JSON escaping is applied to names/filename.
+char *detection_to_json(detection *dets, int nboxes, int classes, char **names, long long int frame_id, char *filename)
+{
+    const float thresh = 0.005; // function get_network_boxes() has already filtered dets by actual threshold
+    static const char separator[] = ", \n";
+    static const char footer[] = "\n ] \n}";
+    const size_t separator_len = sizeof(separator) - 1;
+
+    // Measure the header first so an arbitrarily long filename cannot overflow.
+    const int header_len = json_format_header(NULL, 0, frame_id, filename);
+    if (header_len < 0) return 0;
+
+    size_t capacity = (size_t)header_len + sizeof(footer) + 1024;
+    char *send_buf = (char *)calloc(capacity, sizeof(char));
+    if (!send_buf) return 0;
+    json_format_header(send_buf, capacity, frame_id, filename);
+    size_t length = (size_t)header_len;   // characters written so far (NUL excluded)
+
+    int i, j;
+    int emitted = 0;                      // non-zero once the first object was written
+    for (i = 0; i < nboxes; ++i) {
+        for (j = 0; j < classes; ++j) {
+            if (!(dets[i].prob[j] > thresh)) continue;
+            if (strncmp(names[j], "dont_show", 9) == 0) continue;
+
+            const int object_len = json_format_object(NULL, 0, j, names[j], &dets[i]);
+            if (object_len < 0) {
+                free(send_buf);
+                return 0;
+            }
+
+            // Room for separator + object + footer (sizeof(footer) includes the NUL).
+            const size_t needed = length + separator_len + (size_t)object_len + sizeof(footer);
+            if (needed > capacity) {
+                const size_t grown_capacity = 2 * needed;
+                char *grown = (char *)realloc(send_buf, grown_capacity);
+                if (!grown) {
+                    free(send_buf);       // realloc leaves the old block valid on failure
+                    return 0;
+                }
+                send_buf = grown;
+                capacity = grown_capacity;
+            }
+
+            if (emitted) {
+                memcpy(send_buf + length, separator, separator_len);
+                length += separator_len;
+            }
+            json_format_object(send_buf + length, capacity - length, j, names[j], &dets[i]);
+            length += (size_t)object_len;
+            emitted = 1;
+        }
+    }
+
+    memcpy(send_buf + length, footer, sizeof(footer));   // copies the terminating NUL too
+    return send_buf;
+}
+
+
+// Predicts on a single image. The network batch is forced to 1. An image
+// whose size already matches the network input is used directly; otherwise a
+// temporary copy produced by resize_image() is used and freed afterwards.
+float *network_predict_image(network *net, image im)
+{
+    if (net->batch != 1) set_batch_network(net, 1);
+
+    if (im.w == net->w && im.h == net->h) {
+        return network_predict(*net, im.data);
+    }
+
+    image resized = resize_image(im, net->w, net->h);
+    float *prediction = network_predict(*net, resized.data);
+    free_image(resized);
+    return prediction;
+}
+
+// Runs one forward pass on `im.data` and collects the detections of each of
+// the `batch_size` images into a (detections, count) pair.
+// Returns an array of `batch_size` pairs; release it with
+// free_batch_detections(). The network batch size is not changed here.
+// NOTE(review): presumably im.data holds batch_size images back to back and
+// net->batch already equals batch_size - confirm against callers.
+det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter)
+{
+    network_predict(*net, im.data);
+
+    // xcalloc (the allocation helper used throughout this file) replaces a raw
+    // calloc whose result was indexed without a NULL check.
+    // NOTE(review): assumes xcalloc never returns NULL - confirm in utils.
+    det_num_pair *pdets = (det_num_pair *)xcalloc(batch_size, sizeof(det_num_pair));
+
+    int batch;
+    for (batch = 0; batch < batch_size; batch++) {
+        int num = 0;
+        detection *dets = make_network_boxes_batch(net, thresh, &num, batch);
+        fill_network_boxes_batch(net, w, h, thresh, hier, map, relative, dets, letter, batch);
+        pdets[batch].num = num;
+        pdets[batch].dets = dets;
+    }
+    return pdets;
+}
+
+// Same as network_predict_image(), except that an image whose size differs
+// from the network input is converted with letterbox_image() instead of
+// resize_image(). The network batch is forced to 1.
+float *network_predict_image_letterbox(network *net, image im)
+{
+    if (net->batch != 1) set_batch_network(net, 1);
+
+    const int same_size = (im.w == net->w) && (im.h == net->h);
+    if (same_size) {
+        return network_predict(*net, im.data);
+    }
+
+    image boxed = letterbox_image(im, net->w, net->h);
+    float *prediction = network_predict(*net, boxed.data);
+    free_image(boxed);
+    return prediction;
+}
+
+// Returns the network's `w` field.
+int network_width(network *net)
+{
+    return net->w;
+}
+// Returns the network's `h` field.
+int network_height(network *net)
+{
+    return net->h;
+}
+
+// Predicts every row of `test.X`, running the network `n` times per batch and
+// accumulating out/n into the result, i.e. the average of the `n` passes.
+// Returns a test.X.rows x output_size matrix; the caller frees it with
+// free_matrix().
+// NOTE(review): the accumulation assumes make_matrix() returns zero-filled
+// rows - confirm.
+matrix network_predict_data_multi(network net, data test, int n)
+{
+    int i,j,b,m;
+    int k = get_network_output_size(net);
+    matrix pred = make_matrix(test.X.rows, k);
+    // The staging buffer holds one batch of input rows, each test.X.cols floats
+    // wide (it was sized with test.X.rows, which overflows when cols > rows).
+    float* X = (float*)xcalloc(net.batch * test.X.cols, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net.batch){
+        // Stage up to net.batch rows; the last batch may be partial.
+        for(b = 0; b < net.batch; ++b){
+            if(i+b == test.X.rows) break;
+            memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
+        }
+        for(m = 0; m < n; ++m){
+            float *out = network_predict(net, X);
+            for(b = 0; b < net.batch; ++b){
+                if(i+b == test.X.rows) break;
+                for(j = 0; j < k; ++j){
+                    pred.vals[i+b][j] += out[j+b*k]/n;
+                }
+            }
+        }
+    }
+    free(X);
+    return pred;
+}
+
+// Predicts every row of `test.X` in batches of net.batch rows and returns a
+// test.X.rows x output_size matrix of network outputs. The caller frees the
+// result with free_matrix().
+matrix network_predict_data(network net, data test)
+{
+    const int out_size = get_network_output_size(net);
+    matrix pred = make_matrix(test.X.rows, out_size);
+    float *batch_input = (float*)xcalloc(net.batch * test.X.cols, sizeof(float));
+
+    int row;
+    for (row = 0; row < test.X.rows; row += net.batch) {
+        // Number of rows in this batch; the last batch may be partial.
+        int count = test.X.rows - row;
+        if (count > net.batch) count = net.batch;
+
+        int b, j;
+        for (b = 0; b < count; ++b) {
+            memcpy(batch_input + b*test.X.cols, test.X.vals[row + b], test.X.cols*sizeof(float));
+        }
+
+        float *out = network_predict(net, batch_input);
+
+        for (b = 0; b < count; ++b) {
+            float *dst = pred.vals[row + b];
+            for (j = 0; j < out_size; ++j) {
+                dst[j] = out[j + b*out_size];
+            }
+        }
+    }
+
+    free(batch_input);
+    return pred;
+}
+
+// Debug dump to stderr: for each layer prints the mean and variance of its
+// output followed by at most the first 100 output values.
+void print_network(network net)
+{
+    int li;
+    for (li = 0; li < net.n; ++li) {
+        layer l = net.layers[li];
+        float *values = l.output;
+        int shown = l.outputs;
+
+        float mean = mean_array(values, shown);
+        float vari = variance_array(values, shown);
+        fprintf(stderr, "Layer %d - Mean: %f, Variance: %f\n",li,mean, vari);
+
+        if (shown > 100) shown = 100;
+
+        int v;
+        for (v = 0; v < shown; ++v) {
+            fprintf(stderr, "%f, ", values[v]);
+        }
+        // The ellipsis is printed whenever exactly 100 values were shown.
+        if (shown == 100) fprintf(stderr,".....\n");
+        fprintf(stderr, "\n");
+    }
+}
+
+// Compares the top-1 predictions of two networks on the same data set.
+// Prints a 2x2 table to stdout
+//     a b        a: both wrong        b: only n2 correct
+//     c d        c: only n1 correct   d: both correct
+// followed by (|b - c| - 1)^2 / (b + c), which has the form of McNemar's
+// statistic with continuity correction. When b + c is 0 the printed value is
+// the result of a floating-point division by zero.
+void compare_networks(network n1, network n2, data test)
+{
+    matrix g1 = network_predict_data(n1, test);
+    matrix g2 = network_predict_data(n2, test);
+    int i;
+    int a,b,c,d;
+    a = b = c = d = 0;
+    for(i = 0; i < g1.rows; ++i){
+        int truth = max_index(test.y.vals[i], test.y.cols);
+        int p1 = max_index(g1.vals[i], g1.cols);
+        int p2 = max_index(g2.vals[i], g2.cols);
+        if(p1 == truth){
+            if(p2 == truth) ++d;
+            else ++c;
+        }else{
+            if(p2 == truth) ++b;
+            else ++a;
+        }
+    }
+    printf("%5d %5d\n%5d %5d\n", a, b, c, d);
+    float num = pow((abs(b - c) - 1.), 2.);
+    float den = b + c;
+    printf("%f\n", num/den);
+
+    // The prediction matrices are owned by this function; release them
+    // (they were previously leaked on every call).
+    free_matrix(g1);
+    free_matrix(g2);
+}
+
+// Returns the top-1 accuracy of `net` on data set `d`, computed by
+// matrix_topk_accuracy() over the predictions of network_predict_data().
+float network_accuracy(network net, data d)
+{
+    matrix predictions = network_predict_data(net, d);
+    const float top1 = matrix_topk_accuracy(d.y, predictions, 1);
+    free_matrix(predictions);
+    return top1;
+}
+
+// Returns a two-element array: [0] = top-1 accuracy, [1] = top-n accuracy of
+// `net` on data set `d`. The array is static storage shared by all calls, so
+// the values are overwritten by the next call and must not be freed.
+float *network_accuracies(network net, data d, int n)
+{
+    static float results[2];
+
+    matrix predictions = network_predict_data(net, d);
+    results[0] = matrix_topk_accuracy(d.y, predictions, 1);
+    results[1] = matrix_topk_accuracy(d.y, predictions, n);
+    free_matrix(predictions);
+
+    return results;
+}
+
+// Returns the top-1 accuracy of `net` on data set `d`, using the predictions
+// of network_predict_data_multi() with `n` passes per batch.
+float network_accuracy_multi(network net, data d, int n)
+{
+    matrix predictions = network_predict_data_multi(net, d, n);
+    const float top1 = matrix_topk_accuracy(d.y, predictions, 1);
+    free_matrix(predictions);
+    return top1;
+}
+
+// Pointer-taking wrapper around free_network(). Only the resources owned by
+// the network are released; the `network` object `net` points to is not freed.
+void free_network_ptr(network* net)
+{
+    network by_value = *net;
+    free_network(by_value);
+}
+
+// Releases everything owned by `net`: all layers (via free_layer), the layer
+// array, the schedule/statistics arrays, the workspace and, in GPU builds,
+// the device and pinned-host buffers. `net` is passed by value, so the
+// caller's copy keeps dangling pointers and must not be used afterwards.
+void free_network(network net)
+{
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        free_layer(net.layers[i]);
+    }
+    free(net.layers);
+
+    free(net.seq_scales);
+    free(net.scales);
+    free(net.steps);
+    free(net.seen);
+    free(net.cuda_graph_ready);
+    free(net.badlabels_reject_threshold);
+    free(net.delta_rolling_max);
+    free(net.delta_rolling_avg);
+    free(net.delta_rolling_std);
+    free(net.cur_iteration);
+    free(net.total_bbox);
+    free(net.rewritten_bbox);
+
+#ifdef GPU
+    // The workspace is freed with cuda_free when a GPU is selected, else free.
+    if (gpu_index >= 0) cuda_free(net.workspace);
+    else free(net.workspace);
+    free_pinned_memory();
+    if (net.input_state_gpu) cuda_free(net.input_state_gpu);
+    if (net.input_pinned_cpu) {   // CPU
+        if (net.input_pinned_cpu_flag) cudaFreeHost(net.input_pinned_cpu);
+        else free(net.input_pinned_cpu);
+    }
+
+    // Each of these is a host-side holder (float **) of one device pointer.
+    // Check the holder before dereferencing it: the original dereferenced
+    // first and NULL-checked afterwards.
+    if (net.input_gpu) {
+        if (*net.input_gpu) cuda_free(*net.input_gpu);
+        free(net.input_gpu);
+    }
+    if (net.truth_gpu) {
+        if (*net.truth_gpu) cuda_free(*net.truth_gpu);
+        free(net.truth_gpu);
+    }
+    if (net.input16_gpu) {
+        if (*net.input16_gpu) cuda_free(*net.input16_gpu);
+        free(net.input16_gpu);
+    }
+    if (net.output16_gpu) {
+        if (*net.output16_gpu) cuda_free(*net.output16_gpu);
+        free(net.output16_gpu);
+    }
+    if (net.max_input16_size) free(net.max_input16_size);
+    if (net.max_output16_size) free(net.max_output16_size);
+#else
+    free(net.workspace);
+#endif
+}
+
+// Returns `src` when it is greater than 0, otherwise 0.
+static float relu(float src) {
+    return (src > 0) ? src : 0;
+}
+
+// Clamps `src` from below at eps = 0.001: returns `src` when it is greater
+// than eps, otherwise eps itself (so the result is always positive).
+static float lrelu(float src) {
+    const float eps = 0.001;
+    return (src > eps) ? src : eps;
+}
+
+// Prepares the layers of `net` for inference:
+//  - CONVOLUTIONAL layers with batch_normalize set get their batch-norm
+//    parameters folded into weights and biases, after which the batch-norm
+//    buffers are released and the flag is cleared;
+//  - SHORTCUT layers with weights and a weights_normalization mode get their
+//    weights normalized once, after which the mode is set to NO_NORMALIZATION.
+// `net` is passed by value but net.layers is a pointer, so the layers
+// themselves are modified in place.
+void fuse_conv_batchnorm(network net)
+{
+    int j;
+    for (j = 0; j < net.n; ++j) {
+        layer *l = &net.layers[j];
+
+        if (l->type == CONVOLUTIONAL) {
+            //printf(" Merges Convolutional-%d and batch_norm \n", j);
+
+            // A layer with share_layer set is not fused; only its flag is cleared.
+            if (l->share_layer != NULL) {
+                l->batch_normalize = 0;
+            }
+
+            if (l->batch_normalize) {
+                int f;
+                for (f = 0; f < l->n; ++f)
+                {
+                    // bias' = bias - scale * rolling_mean / sqrt(rolling_variance + 1e-5)
+                    l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f] + .00001));
+
+                    // Per-filter factor scale / sqrt(rolling_variance + 1e-5).
+                    double precomputed = l->scales[f] / (sqrt((double)l->rolling_variance[f] + .00001));
+
+                    // Number of weights in one filter (input channels are divided by groups).
+                    const size_t filter_size = l->size*l->size*l->c / l->groups;
+                    int i;
+                    for (i = 0; i < filter_size; ++i) {
+                        int w_index = f*filter_size + i;
+
+                        l->weights[w_index] *= precomputed;
+                    }
+                }
+
+                free_convolutional_batchnorm(l);
+                l->batch_normalize = 0;
+#ifdef GPU
+                // Upload the modified layer when a GPU is selected.
+                if (gpu_index >= 0) {
+                    push_convolutional_layer(*l);
+                }
+#endif
+            }
+        }
+        else  if (l->type == SHORTCUT && l->weights && l->weights_normalization)
+        {
+            // Prints every weight and the weight count of this layer to stdout.
+            if (l->nweights > 0) {
+                //cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
+                int i;
+                for (i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]);
+                printf(" l->nweights = %d, j = %d \n", l->nweights, j);
+            }
+
+            // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
+            // Weights are laid out as (l->n + 1) consecutive groups of layer_step values.
+            const int layer_step = l->nweights / (l->n + 1);    // 1 or l.c or (l.c * l.h * l.w)
+
+            int chan, i;
+            for (chan = 0; chan < layer_step; ++chan)
+            {
+                // sum's initial value is unused: it is reset to eps below.
+                float sum = 1, max_val = -FLT_MAX;
+
+                // For softmax, find the largest weight at this position; it is
+                // subtracted before expf() (presumably for numerical stability).
+                if (l->weights_normalization == SOFTMAX_NORMALIZATION) {
+                    for (i = 0; i < (l->n + 1); ++i) {
+                        int w_index = chan + i * layer_step;
+                        float w = l->weights[w_index];
+                        if (max_val < w) max_val = w;
+                    }
+                }
+
+                // Start the denominator at eps so it is never exactly zero.
+                const float eps = 0.0001;
+                sum = eps;
+
+                // First pass: accumulate the normalization denominator.
+                for (i = 0; i < (l->n + 1); ++i) {
+                    int w_index = chan + i * layer_step;
+                    float w = l->weights[w_index];
+                    if (l->weights_normalization == RELU_NORMALIZATION) sum += lrelu(w);
+                    else if (l->weights_normalization == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
+                }
+
+                // Second pass: replace each weight by its normalized value.
+                for (i = 0; i < (l->n + 1); ++i) {
+                    int w_index = chan + i * layer_step;
+                    float w = l->weights[w_index];
+                    if (l->weights_normalization == RELU_NORMALIZATION) w = lrelu(w) / sum;
+                    else if (l->weights_normalization == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+                    l->weights[w_index] = w;
+                }
+            }
+
+            // The weights are now stored normalized; mark the layer accordingly.
+            l->weights_normalization = NO_NORMALIZATION;
+
+#ifdef GPU
+            if (gpu_index >= 0) {
+                push_shortcut_layer(*l);
+            }
+#endif
+        }
+        else {
+            //printf(" Fusion skip layer type: %d \n", l->type);
+        }
+    }
+}
+
+// Forward function that intentionally does nothing; calculate_binary_weights()
+// installs it as forward_gpu on layers it turns into BLANK.
+void forward_blank_layer(layer l, network_state state)
+{
+}
+
+// For every CONVOLUTIONAL layer with xnor set: aligns its binary weights via
+// binary_align_weights() and, when use_bin_output is set, switches its
+// activation to LINEAR. In GPU builds, a directly following SHORTCUT layer
+// whose input and output shapes match is fused into the convolution and
+// turned into a BLANK layer with a no-op forward_gpu.
+void calculate_binary_weights(network net)
+{
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        layer *l = &net.layers[idx];
+
+        if (l->type != CONVOLUTIONAL) continue;
+        if (!l->xnor) continue;
+
+        // lda_align is set in make_convolutional_layer()
+        binary_align_weights(l);
+
+        if (l->use_bin_output) {
+            l->activation = LINEAR;
+        }
+
+#ifdef GPU
+        // fuse conv_xnor + shortcut -> conv_xnor
+        if ((idx + 1) < net.n && l->type == CONVOLUTIONAL) {
+            layer *sc = &net.layers[idx + 1];
+            const int same_shape = (sc->w == sc->out_w && sc->h == sc->out_h && sc->c == sc->out_c);
+            if (sc->type == SHORTCUT && same_shape)
+            {
+                l->bin_conv_shortcut_in_gpu = net.layers[sc->index].output_gpu;
+                l->bin_conv_shortcut_out_gpu = sc->output_gpu;
+
+                sc->type = BLANK;
+                sc->forward_gpu = forward_blank_layer;
+            }
+        }
+#endif  // GPU
+    }
+}
+
+// Copies the cuDNN tensor descriptor handles of `src` into `*dst`.
+// Without CUDNN this function does nothing.
+void copy_cudnn_descriptors(layer src, layer *dst)
+{
+#ifdef CUDNN
+    // source / destination tensor descriptors
+    dst->srcTensorDesc = src.srcTensorDesc;
+    dst->dstTensorDesc = src.dstTensorDesc;
+    dst->srcTensorDesc16 = src.srcTensorDesc16;
+    dst->dstTensorDesc16 = src.dstTensorDesc16;
+
+    // normalization tensor descriptors
+    dst->normTensorDesc = src.normTensorDesc;
+    dst->normDstTensorDesc = src.normDstTensorDesc;
+    dst->normDstTensorDescF16 = src.normDstTensorDescF16;
+#endif // CUDNN
+}
+
+// Makes every layer of `net_map` a shallow copy of the corresponding layer of
+// `net_train` (struct assignment, so buffers are shared, not duplicated),
+// while keeping the cuDNN descriptors that net_map's layers held before the
+// copy. Each copied layer is then set to batch = 1, steps = 1, train = 0.
+// Without CUDNN, copy_cudnn_descriptors() is a no-op and the temporaries
+// below are never read.
+void copy_weights_net(network net_train, network *net_map)
+{
+    int k;
+    for (k = 0; k < net_train.n; ++k) {
+        layer *l = &(net_train.layers[k]);
+        // Save net_map's descriptors, overwrite the layer, then restore them.
+        layer tmp_layer;
+        copy_cudnn_descriptors(net_map->layers[k], &tmp_layer);
+        net_map->layers[k] = net_train.layers[k];
+        copy_cudnn_descriptors(tmp_layer, &net_map->layers[k]);
+
+        if (l->type == CRNN) {
+            // NOTE(review): after the assignment above, net_map->layers[k].*_layer
+            // already equal net_train's pointers, so the descriptors saved here
+            // appear to be read from net_train's sub-layers - confirm intent.
+            layer tmp_input_layer, tmp_self_layer, tmp_output_layer;
+            copy_cudnn_descriptors(*net_map->layers[k].input_layer, &tmp_input_layer);
+            copy_cudnn_descriptors(*net_map->layers[k].self_layer, &tmp_self_layer);
+            copy_cudnn_descriptors(*net_map->layers[k].output_layer, &tmp_output_layer);
+            net_map->layers[k].input_layer = net_train.layers[k].input_layer;
+            net_map->layers[k].self_layer = net_train.layers[k].self_layer;
+            net_map->layers[k].output_layer = net_train.layers[k].output_layer;
+            //net_map->layers[k].output_gpu = net_map->layers[k].output_layer->output_gpu;  // already copied out of if()
+
+            copy_cudnn_descriptors(tmp_input_layer, net_map->layers[k].input_layer);
+            copy_cudnn_descriptors(tmp_self_layer, net_map->layers[k].self_layer);
+            copy_cudnn_descriptors(tmp_output_layer, net_map->layers[k].output_layer);
+        }
+        else if(l->input_layer) // for AntiAliasing
+        {
+            // Same save / re-point / restore sequence for the single input sub-layer.
+            layer tmp_input_layer;
+            copy_cudnn_descriptors(*net_map->layers[k].input_layer, &tmp_input_layer);
+            net_map->layers[k].input_layer = net_train.layers[k].input_layer;
+            copy_cudnn_descriptors(tmp_input_layer, net_map->layers[k].input_layer);
+        }
+        // Force single-image, single-step, non-training settings on the copy.
+        net_map->layers[k].batch = 1;
+        net_map->layers[k].steps = 1;
+        net_map->layers[k].train = 0;
+    }
+}
+
+
+// Combines the training and validation networks: the result is a copy of
+// `net_train` with batch = 1 and its own layer array, in which every layer is
+// a shallow copy of the training layer with batch = 1. In CUDNN builds,
+// CONVOLUTIONAL layers take their tensor descriptors from `net_map`.
+network combine_train_valid_networks(network net_train, network net_map)
+{
+    // make_network() supplies the fresh layer array; every other field is
+    // then taken over from net_train.
+    network net_combined = make_network(net_train.n);
+    layer *fresh_layers = net_combined.layers;
+    net_combined = net_train;
+    net_combined.layers = fresh_layers;
+    net_combined.batch = 1;
+
+    int idx;
+    for (idx = 0; idx < net_train.n; ++idx) {
+        layer *dst = &net_combined.layers[idx];
+        *dst = net_train.layers[idx];
+        dst->batch = 1;
+
+        if (net_train.layers[idx].type == CONVOLUTIONAL) {
+            // Copies the same seven descriptor fields; a no-op without CUDNN.
+            copy_cudnn_descriptors(net_map.layers[idx], dst);
+        }
+    }
+    return net_combined;
+}
+
+// Calls free_state_conv_lstm() on every CONV_LSTM layer and free_state_crnn()
+// on every CRNN layer of `net`.
+void free_network_recurrent_state(network net)
+{
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        switch (net.layers[idx].type) {
+        case CONV_LSTM:
+            free_state_conv_lstm(net.layers[idx]);
+            break;
+        case CRNN:
+            free_state_crnn(net.layers[idx]);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+// Calls randomize_state_conv_lstm() on every CONV_LSTM layer of `net`.
+// CRNN layers are passed to free_state_crnn() (there is no CRNN-specific
+// randomize call here).
+void randomize_network_recurrent_state(network net)
+{
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        switch (net.layers[idx].type) {
+        case CONV_LSTM:
+            randomize_state_conv_lstm(net.layers[idx]);
+            break;
+        case CRNN:
+            free_state_crnn(net.layers[idx]);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+
+// Calls remember_state_conv_lstm() on every CONV_LSTM layer of `net`.
+// CRNN layers are left untouched.
+void remember_network_recurrent_state(network net)
+{
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        if (net.layers[idx].type != CONV_LSTM) continue;
+        remember_state_conv_lstm(net.layers[idx]);
+    }
+}
+
+// Calls restore_state_conv_lstm() on every CONV_LSTM layer of `net`.
+// CRNN layers are passed to free_state_crnn() (there is no CRNN-specific
+// restore call here).
+void restore_network_recurrent_state(network net)
+{
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        switch (net.layers[idx].type) {
+        case CONV_LSTM:
+            restore_state_conv_lstm(net.layers[idx]);
+            break;
+        case CRNN:
+            free_state_crnn(net.layers[idx]);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+
+// Returns 1 as soon as any CONVOLUTIONAL layer has a non-zero value in its
+// weights_ema array, 0 when every such array is absent or all zeros.
+int is_ema_initialized(network net)
+{
+    int li;
+    for (li = 0; li < net.n; ++li) {
+        const layer *l = &net.layers[li];
+        if (l->type != CONVOLUTIONAL) continue;
+        if (!l->weights_ema) continue;
+
+        int w;
+        for (w = 0; w < l->nweights; ++w) {
+            if (l->weights_ema[w] != 0) return 1;
+        }
+    }
+
+    return 0;
+}
+
+// Updates the exponential moving averages of every CONVOLUTIONAL layer:
+//     ema = ema_alpha * ema + (1 - ema_alpha) * current
+// for weights, biases and scales, each only when its *_ema array exists.
+// In GPU builds the layer is first pulled from the device.
+void ema_update(network net, float ema_alpha)
+{
+    int li;
+    for (li = 0; li < net.n; ++li) {
+        layer l = net.layers[li];
+        if (l.type != CONVOLUTIONAL) continue;
+
+#ifdef GPU
+        if (gpu_index >= 0) {
+            pull_convolutional_layer(l);
+        }
+#endif
+        int k;
+        if (l.weights_ema) {
+            for (k = 0; k < l.nweights; ++k) {
+                l.weights_ema[k] = ema_alpha * l.weights_ema[k] + (1 - ema_alpha) * l.weights[k];
+            }
+        }
+
+        if (l.biases_ema) {
+            for (k = 0; k < l.n; ++k) {
+                l.biases_ema[k] = ema_alpha * l.biases_ema[k] + (1 - ema_alpha) * l.biases[k];
+            }
+        }
+
+        if (l.scales_ema) {
+            for (k = 0; k < l.n; ++k) {
+                l.scales_ema[k] = ema_alpha * l.scales_ema[k] + (1 - ema_alpha) * l.scales[k];
+            }
+        }
+    }
+}
+
+
+// Overwrites the weights, biases and scales of every CONVOLUTIONAL layer with
+// their exponential-moving-average copies (each only when its *_ema array
+// exists). In GPU builds the layer is pushed to the device afterwards.
+void ema_apply(network net)
+{
+    int li;
+    for (li = 0; li < net.n; ++li) {
+        layer l = net.layers[li];
+        if (l.type != CONVOLUTIONAL) continue;
+
+        int k;
+        if (l.weights_ema) {
+            for (k = 0; k < l.nweights; ++k) {
+                l.weights[k] = l.weights_ema[k];
+            }
+        }
+
+        if (l.biases_ema) {
+            for (k = 0; k < l.n; ++k) l.biases[k] = l.biases_ema[k];
+        }
+
+        if (l.scales_ema) {
+            for (k = 0; k < l.n; ++k) l.scales[k] = l.scales_ema[k];
+        }
+
+#ifdef GPU
+        if (gpu_index >= 0) {
+            push_convolutional_layer(l);
+        }
+#endif
+    }
+}
+
+
+
+// For each eligible convolutional layer, finds the pair of filters with the
+// highest cosine similarity and, when that similarity exceeds
+// `sim_threshold`, re-initializes the first filter of the pair with random
+// weights (bias reset to 0, scale reset to 1).
+// Skipped layers: layer 0, layers with LINEAR activation, and any layer that
+// has a YOLO layer among the next three layers.
+// In GPU builds the layer is pulled before and pushed after processing.
+void reject_similar_weights(network net, float sim_threshold)
+{
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        layer l = net.layers[i];
+        if (i == 0) continue;
+        if (net.n > i + 1) if (net.layers[i + 1].type == YOLO) continue;
+        if (net.n > i + 2) if (net.layers[i + 2].type == YOLO) continue;
+        if (net.n > i + 3) if (net.layers[i + 3].type == YOLO) continue;
+
+        if (l.type == CONVOLUTIONAL && l.activation != LINEAR) {
+#ifdef GPU
+            if (gpu_index >= 0) {
+                pull_convolutional_layer(l);
+            }
+#endif
+            int k, j;
+            float max_sim = -1000;
+            int max_sim_index = 0;
+            int max_sim_index2 = 0;
+            // Weights per filter. Input channels are divided by groups, as in
+            // fuse_conv_batchnorm() and in the `scale` expression below;
+            // omitting the division indexed past the weights of grouped layers.
+            int filter_size = l.size*l.size*l.c / l.groups;
+
+            // Compare every unordered pair of filters once.
+            for (k = 0; k < l.n; ++k)
+            {
+                for (j = k+1; j < l.n; ++j)
+                {
+                    int w1 = k;
+                    int w2 = j;
+
+                    float sim = cosine_similarity(&l.weights[filter_size*w1], &l.weights[filter_size*w2], filter_size);
+                    if (sim > max_sim) {
+                        max_sim = sim;
+                        max_sim_index = w1;
+                        max_sim_index2 = w2;
+                    }
+                }
+            }
+
+            printf(" reject_similar_weights: i = %d, l.n = %d, w1 = %d, w2 = %d, sim = %f, thresh = %f \n",
+                i, l.n, max_sim_index, max_sim_index2, max_sim, sim_threshold);
+
+            if (max_sim > sim_threshold) {
+                printf(" rejecting... \n");
+                // Random re-initialization scaled by sqrt(2 / fan_in).
+                float scale = sqrt(2. / (l.size*l.size*l.c / l.groups));
+
+                for (k = 0; k < filter_size; ++k) {
+                    l.weights[max_sim_index*filter_size + k] = scale*rand_uniform(-1, 1);
+                }
+                if (l.biases) l.biases[max_sim_index] = 0.0f;
+                if (l.scales) l.scales[max_sim_index] = 1.0f;
+            }
+
+#ifdef GPU
+            if (gpu_index >= 0) {
+                push_convolutional_layer(l);
+            }
+#endif
+        }
+    }
+}
diff --git a/darknet-master/src/network.h b/darknet-master/src/network.h
new file mode 100644
index 0000000..c8a7706
--- /dev/null
+++ b/darknet-master/src/network.h
@@ -0,0 +1,196 @@
+// Oh boy, why am I about to do this....
+#ifndef NETWORK_H
+#define NETWORK_H
+
+/*
+ * Necessary in C++ to get format macros out of inttypes.h
+ */
+#ifdef __cplusplus
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS 1
+#endif
+#endif
+#include <inttypes.h>
+
+#include "darknet.h"
+
+#include <stdint.h>
+#include "layer.h"
+
+
+#include "image.h"
+#include "data.h"
+#include "tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+typedef enum {
+    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
+} learning_rate_policy;
+
+typedef struct network{
+    float *workspace;
+    int n;
+    int batch;
+    uint64_t *seen;
+    float epoch;
+    int subdivisions;
+    float momentum;
+    float decay;
+    layer *layers;
+    int outputs;
+    float *output;
+    learning_rate_policy policy;
+
+    float learning_rate;
+    float gamma;
+    float scale;
+    float power;
+    int time_steps;
+    int step;
+    int max_batches;
+    float *scales;
+    int   *steps;
+    int num_steps;
+    int burn_in;
+    int cudnn_half;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int inputs;
+    int h, w, c;
+    int max_crop;
+    int min_crop;
+    int flip; // horizontal flip 50% probability augmentation for classifier training (default = 1)
+    float angle;
+    float aspect;
+    float exposure;
+    float saturation;
+    float hue;
+    int small_object;
+
+    int gpu_index;
+    tree *hierarchy;
+
+    #ifdef GPU
+    float *input_state_gpu;
+
+    float **input_gpu;
+    float **truth_gpu;
+    float **input16_gpu;
+    float **output16_gpu;
+    size_t *max_input16_size;
+    size_t *max_output16_size;
+    int wait_stream;
+    #endif
+} network;
+
+
+typedef struct network_state {
+    float *truth;
+    float *input;
+    float *delta;
+    float *workspace;
+    int train;
+    int index;
+    network net;
+} network_state;
+*/
+
+#ifdef GPU
+// GPU entry points. These prototypes are only visible when GPU is defined at
+// compile time; they all take the network by value except the multi-network
+// helpers (train_networks, sync_nets), which take an array of networks.
+float train_networks(network *nets, int n, data d, int interval);
+void sync_nets(network *nets, int n, int interval);
+float train_network_datum_gpu(network net, float *x, float *y);
+float *network_predict_gpu(network net, float *input);
+float *network_predict_gpu_gl_texture(network net, uint32_t texture_id);
+float * get_network_output_gpu_layer(network net, int i);
+float * get_network_delta_gpu_layer(network net, int i);
+float *get_network_output_gpu(network net);
+void forward_network_gpu(network net, network_state state);
+void backward_network_gpu(network net, network_state state);
+void update_network_gpu(network net);
+void forward_backward_network_gpu(network net, float *x, float *y);
+#endif
+
+float get_current_seq_subdivisions(network net);
+int get_sequence_value(network net);
+float get_current_rate(network net);
+int get_current_batch(network net);
+int64_t get_current_iteration(network net);
+//void free_network(network net); // darknet.h
+void compare_networks(network n1, network n2, data d);
+char *get_layer_string(LAYER_TYPE a);
+
+network make_network(int n);
+void forward_network(network net, network_state state);
+void backward_network(network net, network_state state);
+void update_network(network net);
+
+float train_network(network net, data d);
+float train_network_waitkey(network net, data d, int wait_key);
+float train_network_batch(network net, data d, int n);
+float train_network_sgd(network net, data d, int n);
+float train_network_datum(network net, float *x, float *y);
+
+matrix network_predict_data(network net, data test);
+//LIB_API float *network_predict(network net, float *input);
+//LIB_API float *network_predict_ptr(network *net, float *input);
+float network_accuracy(network net, data d);
+float *network_accuracies(network net, data d, int n);
+float network_accuracy_multi(network net, data d, int n);
+void top_predictions(network net, int n, int *index);
+float *get_network_output(network net);
+float *get_network_output_layer(network net, int i);
+float *get_network_delta_layer(network net, int i);
+float *get_network_delta(network net);
+int get_network_output_size_layer(network net, int i);
+int get_network_output_size(network net);
+image get_network_image(network net);
+image get_network_image_layer(network net, int i);
+int get_predicted_class_network(network net);
+void print_network(network net);
+void visualize_network(network net);
+int resize_network(network *net, int w, int h);
+//LIB_API void set_batch_network(network *net, int b);
+int get_network_input_size(network net);
+float get_network_cost(network net);
+//LIB_API layer* get_network_layer(network* net, int i);
+//LIB_API detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
+//LIB_API detection *make_network_boxes(network *net, float thresh, int *num);
+//LIB_API void free_detections(detection *dets, int n);
+//LIB_API void reset_rnn(network *net);
+//LIB_API network *load_network_custom(char *cfg, char *weights, int clear, int batch);
+//LIB_API network *load_network(char *cfg, char *weights, int clear);
+//LIB_API float *network_predict_image(network *net, image im);
+//LIB_API float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float thresh_calc_avg_iou, const float iou_thresh, int map_points, int letter_box, network *existing_net);
+//LIB_API void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dont_show, int calc_map, int mjpeg_port);
+//LIB_API int network_width(network *net);
+//LIB_API int network_height(network *net);
+//LIB_API void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm);
+
+int get_network_nuisance(network net);
+int get_network_background(network net);
+//LIB_API void fuse_conv_batchnorm(network net);
+//LIB_API void calculate_binary_weights(network net);
+network combine_train_valid_networks(network net_train, network net_map);
+void copy_weights_net(network net_train, network *net_map);
+void free_network_recurrent_state(network net);
+void randomize_network_recurrent_state(network net);
+void remember_network_recurrent_state(network net);
+void restore_network_recurrent_state(network net);
+int is_ema_initialized(network net);
+void ema_update(network net, float ema_alpha);
+void ema_apply(network net);
+void reject_similar_weights(network net, float sim_threshold);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/network_kernels.cu b/darknet-master/src/network_kernels.cu
new file mode 100644
index 0000000..3b3d4e6
--- /dev/null
+++ b/darknet-master/src/network_kernels.cu
@@ -0,0 +1,838 @@
+#include "dark_cuda.h"
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "network.h"
+#include "image.h"
+#include "data.h"
+#include "utils.h"
+#include "parser.h"
+
+#include "crop_layer.h"
+#include "connected_layer.h"
+#include "rnn_layer.h"
+#include "gru_layer.h"
+#include "crnn_layer.h"
+#include "detection_layer.h"
+#include "region_layer.h"
+#include "convolutional_layer.h"
+#include "activation_layer.h"
+#include "maxpool_layer.h"
+#include "reorg_layer.h"
+#include "avgpool_layer.h"
+#include "normalization_layer.h"
+#include "batchnorm_layer.h"
+#include "cost_layer.h"
+#include "local_layer.h"
+#include "softmax_layer.h"
+#include "dropout_layer.h"
+#include "route_layer.h"
+#include "shortcut_layer.h"
+#include "blas.h"
+
+//#ifdef OPENCV
+//#include <opencv2/highgui/highgui_c.h>
+//#endif
+
+#include "http_stream.h"
+
+float * get_network_output_gpu_layer(network net, int i);
+float * get_network_delta_gpu_layer(network net, int i);
+float * get_network_output_gpu(network net);
+
+// Per-layer timing record used by the benchmark code of the forward and
+// backward GPU passes in this file.
+typedef struct time_benchmark_layers {
+    float time;      // running average of the measured layer time
+    int layer_id;    // index of the layer inside net.layers
+    int layer_type;  // l.type value of that layer
+} time_benchmark_layers;
+
+// qsort comparator over time_benchmark_layers records.
+// Orders by descending `time`: the record with the larger time sorts first.
+int time_comparator(const void *pa, const void *pb)
+{
+    const time_benchmark_layers lhs = *(const time_benchmark_layers *)pa;
+    const time_benchmark_layers rhs = *(const time_benchmark_layers *)pb;
+    const float delta = lhs.time - rhs.time;
+
+    if (delta > 0) return -1;   // lhs is slower -> goes before rhs
+    if (delta < 0) return 1;    // lhs is faster -> goes after rhs
+    return 0;
+}
+
+// Run the forward pass of every layer of `net` on the GPU, in index order.
+// After each layer, its output_gpu becomes state.input for the next one.
+// When net.benchmark_layers is set, each layer is timed, the timings are
+// averaged across calls and printed, followed by a listing ordered with
+// time_comparator.
+void forward_network_gpu(network net, network_state state)
+{
+    // Benchmark buffers are function-static: allocated on the first benchmarked
+    // call and reused by every later call.
+    // NOTE(review): they are sized with the net.n seen on that first call and the
+    // calloc results are not checked -- confirm a larger network is never passed later.
+    static time_benchmark_layers *avg_time_per_layer = NULL;
+    static time_benchmark_layers *sorted_avg_time_per_layer = NULL;
+    double start_time, end_time;
+    if (net.benchmark_layers) {
+        if (!avg_time_per_layer) {
+            avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
+            sorted_avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
+        }
+        cudaDeviceSynchronize();
+    }
+
+    //printf("\n");
+    state.workspace = net.workspace;
+    int i;
+    for(i = 0; i < net.n; ++i){
+        state.index = i;
+        layer l = net.layers[i];
+        // In training mode the layer's delta buffer is zero-filled before its forward pass.
+        if(l.delta_gpu && state.train){
+            fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
+        }
+
+        if (net.benchmark_layers) {
+            start_time = get_time_point();
+        }
+
+        l.forward_gpu(l, state);
+
+        if (net.benchmark_layers) {
+            // Wait for the device so the measured interval covers the layer's GPU work.
+            CHECK_CUDA(cudaDeviceSynchronize());
+            end_time = get_time_point();
+            // Printed as ms below; presumably get_time_point() returns microseconds -- TODO confirm.
+            const double took_time = (end_time - start_time) / 1000;
+            const double alpha = 0.9;
+            // time == 0 is treated as "no sample yet": the first sample seeds the record.
+            if (avg_time_per_layer[i].time == 0) {
+                avg_time_per_layer[i].layer_id = i;
+                avg_time_per_layer[i].layer_type = l.type;
+                avg_time_per_layer[i].time = took_time;
+            }
+            // Otherwise blend: 90% previous average, 10% new sample.
+            else avg_time_per_layer[i].time = avg_time_per_layer[i].time * alpha + took_time * (1 - alpha);
+
+            sorted_avg_time_per_layer[i] = avg_time_per_layer[i];
+            printf("\n fw-layer %d - type: %d - %lf ms - avg_time %lf ms \n", i, l.type, took_time, avg_time_per_layer[i].time);
+        }
+
+        if(net.wait_stream)
+            cudaStreamSynchronize(get_cuda_stream());
+        // Chain the layers: this layer's output feeds the next layer.
+        state.input = l.output_gpu;
+        //cudaDeviceSynchronize();
+
+        /*
+        cuda_pull_array(l.output_gpu, l.output, l.outputs);
+        cudaStreamSynchronize(get_cuda_stream());
+        float avg_val = 0;
+        int k;
+        for (k = 0; k < l.outputs; ++k) avg_val += l.output[k];
+        printf(" i: %d - avg_val = %f \n", i, avg_val / l.outputs);
+        */
+
+/*
+        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+        if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
+            int j;
+            for (j = 0; j < l.out_c; ++j) {
+                image img = make_image(l.out_w, l.out_h, 3);
+                memcpy(img.data, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
+                memcpy(img.data + l.out_w*l.out_h * 1, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
+                memcpy(img.data + l.out_w*l.out_h * 2, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
+                char buff[256];
+                sprintf(buff, "layer-%d slice-%d", i, j);
+                show_image(img, buff);
+                save_image(img, buff);
+            }
+            cvWaitKey(0); // wait press-key in console
+            cvDestroyAllWindows();
+        }
+*/
+    }
+
+    if (net.benchmark_layers) {
+        printf("\n\nSorted by time (forward):\n");
+        // Sort the copy only; avg_time_per_layer stays indexed by layer.
+        qsort(sorted_avg_time_per_layer, net.n, sizeof(time_benchmark_layers), time_comparator);
+        for (i = 0; i < net.n; ++i) {
+            //printf("layer %d - type: %d - avg_time %lf ms \n", avg_time_per_layer[i].layer_id, avg_time_per_layer[i].layer_type, avg_time_per_layer[i].time);
+            printf("%d - fw-sort-layer %d - type: %d - avg_time %lf ms \n", i, sorted_avg_time_per_layer[i].layer_id, sorted_avg_time_per_layer[i].layer_type, sorted_avg_time_per_layer[i].time);
+        }
+    }
+
+    //cudaStreamSynchronize(get_cuda_stream());   // sync CUDA-functions
+    //cudaDeviceSynchronize();
+}
+
+// Run the backward pass on the GPU, from the last layer down to layer 0.
+// For layer i, state.input / state.delta point at layer i-1's output and delta
+// (or at the caller-supplied input/delta for layer 0). Optionally benchmarks
+// each layer, shows attention images, and applies the adversarial update to
+// the network input.
+void backward_network_gpu(network net, network_state state)
+{
+    // Benchmark buffers are function-static and allocated on the first benchmarked call.
+    // NOTE(review): sized with the net.n of that first call; calloc results are unchecked.
+    static time_benchmark_layers *avg_time_per_layer = NULL;
+    static time_benchmark_layers *sorted_avg_time_per_layer = NULL;
+    double start_time, end_time;
+    if (net.benchmark_layers) {
+        if (!avg_time_per_layer) {
+            avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
+            sorted_avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
+        }
+        cudaDeviceSynchronize();
+    }
+
+    state.workspace = net.workspace;
+    int i;
+    // Remember the caller's buffers: they are needed again for layer 0 and for
+    // the adversarial / attention code after the loop.
+    float * original_input = state.input;
+    float * original_delta = state.delta;
+    for(i = net.n-1; i >= 0; --i){
+        state.index = i;
+        layer l = net.layers[i];
+        // Back-propagation stops at this layer, either unconditionally (== 1) or
+        // while the current iteration is still below the layer's stopbackward value.
+        if (l.stopbackward == 1) break;
+        if (l.stopbackward > get_current_iteration(net)) break;
+        if(i == 0){
+            state.input = original_input;
+            state.delta = original_delta;
+        }else{
+            layer prev = net.layers[i-1];
+            state.input = prev.output_gpu;
+            state.delta = prev.delta_gpu;
+            // With optimized_memory, layers that do not keep their own delta
+            // receive it through the shared net.state_delta_gpu buffer.
+            if (net.optimized_memory && !prev.keep_delta_gpu) {
+                state.delta = net.state_delta_gpu;
+            }
+        }
+        if (l.onlyforward) continue;
+
+        if (net.benchmark_layers) {
+            start_time = get_time_point();
+        }
+
+        l.backward_gpu(l, state);
+
+        if (net.benchmark_layers) {
+            CHECK_CUDA(cudaDeviceSynchronize());
+            end_time = get_time_point();
+            // Printed as ms below; presumably get_time_point() returns microseconds -- TODO confirm.
+            const double took_time = (end_time - start_time) / 1000;
+            const double alpha = 0.9;
+            // time == 0 is treated as "no sample yet"; later samples are blended 90/10.
+            if (avg_time_per_layer[i].time == 0) {
+                avg_time_per_layer[i].layer_id = i;
+                avg_time_per_layer[i].layer_type = l.type;
+                avg_time_per_layer[i].time = took_time;
+            }
+            else avg_time_per_layer[i].time = avg_time_per_layer[i].time * alpha + took_time * (1 - alpha);
+
+            sorted_avg_time_per_layer[i] = avg_time_per_layer[i];
+            printf("\n bw-layer %d - type: %d - %lf ms - avg_time %lf ms \n", i, l.type, took_time, avg_time_per_layer[i].time);
+        }
+
+        if (i != 0) {
+            layer prev = net.layers[i - 1];
+            // Shared-delta mode: move the result into prev.delta_gpu (when it is a
+            // different buffer) and zero the shared buffer for the next layer.
+            if (net.optimized_memory && state.delta && !prev.keep_delta_gpu) {
+                if (prev.delta_gpu != state.delta) simple_copy_ongpu(prev.outputs*prev.batch, state.delta, prev.delta_gpu);
+                fill_ongpu(prev.outputs*prev.batch, 0, net.state_delta_gpu, 1);
+            }
+        }
+
+        /*
+        if(i != 0)
+        {
+            layer l = net.layers[i - 1];
+            int state_delta_nan_inf = is_nan_or_inf(state.delta, l.outputs * l.batch);
+            int state_input_nan_inf = is_nan_or_inf(state.input, l.outputs * l.batch);
+            printf("\n i - %d  is_nan_or_inf(s.delta) = %d \n", i, state_delta_nan_inf);
+            printf(" i - %d  is_nan_or_inf(s.input) = %d \n", i, state_input_nan_inf);
+            if (state_delta_nan_inf || state_input_nan_inf) { printf(" found "); }
+        }
+        */
+    }
+
+    if (net.adversarial && net.attention)
+    {
+        // Only net.w*net.h*net.c floats are pulled, i.e. the batch size is not
+        // part of img_size here.
+        int img_size = net.w * net.h * net.c;
+        float *original_input_cpu = (float *)xcalloc(img_size, sizeof(float));
+        float *original_delta_cpu = (float *)xcalloc(img_size, sizeof(float));
+        cuda_pull_array(original_input, original_input_cpu, img_size);
+        cuda_pull_array(original_delta, original_delta_cpu, img_size);
+
+        image attention_img = make_attention_image(img_size, original_delta_cpu, original_input_cpu, net.w, net.h, net.c, 0.7);
+        show_image(attention_img, "attention_img");
+        resize_window_cv("attention_img", 500, 500);
+
+        //static int img_counter = 0;
+        //img_counter++;
+        //char buff[256];
+        //sprintf(buff, "attention_img_%d.png", img_counter);
+        //save_image_png(attention_img, buff);
+        free_image(attention_img);
+
+        // Second image: the delta is passed as both the delta and the image argument.
+        image attention_mask_img = make_attention_image(img_size, original_delta_cpu, original_delta_cpu, net.w, net.h, net.c, 1.0);
+        show_image(attention_mask_img, "attention_mask_img");
+        resize_window_cv("attention_mask_img", 500, 500);
+
+        //sprintf(buff, "attention_mask_img_%d.png", img_counter);
+        //save_image_png(attention_mask_img, buff);
+        free_image(attention_mask_img);
+
+        free(original_input_cpu);
+        free(original_delta_cpu);
+    }
+    if (net.adversarial) {
+        int x_size = get_network_input_size(net)*net.batch;
+        printf(" x_size = %d, original_delta = %p, original_input = %p, net.learning_rate = %f \n",
+            x_size, original_delta, original_input, net.learning_rate);
+        // Presumably input += learning_rate * delta, then clamped to [0, 1] --
+        // TODO confirm against axpy_ongpu / constrain_min_max_ongpu.
+        axpy_ongpu(x_size, net.learning_rate, original_delta, 1, original_input, 1);
+        constrain_min_max_ongpu(x_size, 0, 1, original_input, 1);
+    }
+
+    if (net.benchmark_layers) {
+        printf("\n\nSorted by time (backward):\n");
+        // Layers skipped by break/continue above keep whatever record they had before.
+        qsort(sorted_avg_time_per_layer, net.n, sizeof(time_benchmark_layers), time_comparator);
+        for (i = 0; i < net.n; ++i) {
+            //printf("layer %d - type: %d - avg_time %lf ms \n", avg_time_per_layer[i].layer_id, avg_time_per_layer[i].layer_type, avg_time_per_layer[i].time);
+            printf("%d - bw-sort-layer %d - type: %d - avg_time %lf ms \n", i, sorted_avg_time_per_layer[i].layer_id, sorted_avg_time_per_layer[i].layer_type, sorted_avg_time_per_layer[i].time);
+        }
+    }
+}
+
+// Apply the pending parameter updates of every trainable layer on the GPU.
+// A layer is skipped when it is not trained, is still inside its burn-in
+// window, trains only batch-norm, has no update_gpu callback, or its
+// dont_update threshold has not been passed yet.
+void update_network_gpu(network net)
+{
+    cuda_set_device(net.gpu_index);
+    const int iteration_num = (*net.seen) / (net.batch * net.subdivisions);
+    const int update_batch = net.batch*net.subdivisions * get_sequence_value(net);
+    const float rate = get_current_rate(net);
+
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        // Local copy: the fields set below travel with the copy into update_gpu.
+        layer cur = net.layers[idx];
+        if (!cur.train) continue;
+
+        cur.t = get_current_batch(net);
+        // deform is switched off once more than half of max_batches has elapsed
+        if (iteration_num > (net.max_batches * 1 / 2)) cur.deform = 0;
+
+        const int in_burnin = cur.burnin_update && (cur.burnin_update*net.burn_in > iteration_num);
+        if (in_burnin || cur.train_only_bn) continue;
+
+        if (!cur.update_gpu) continue;
+        if (cur.dont_update >= iteration_num) continue;
+
+        cur.update_gpu(cur, update_batch, rate, net.momentum, net.decay, net.loss_scale);
+    }
+}
+
+// Upload one batch (inputs x, truths y) to the GPU, then run the forward and
+// backward passes in training mode.
+// In adversarial mode a temporary input-delta buffer is allocated for the
+// backward pass and the (modified) GPU input is copied back into x afterwards.
+void forward_backward_network_gpu(network net, float *x, float *y)
+{
+    network_state state;
+    state.index = 0;
+    state.net = net;
+    int x_size = get_network_input_size(net)*net.batch;
+    int y_size = get_network_output_size(net)*net.batch;
+    // The last layer's `truths` count, when set, overrides the output size for y.
+    if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch;
+    // Device buffers are created on the first call and only re-filled afterwards.
+    // NOTE(review): they keep the x_size / y_size of that first call.
+    if(!*net.input_gpu){
+        *net.input_gpu = cuda_make_array(x, x_size);
+        *net.truth_gpu = cuda_make_array(y, y_size);
+    }else{
+        cuda_push_array(*net.input_gpu, x, x_size);
+        cuda_push_array(*net.truth_gpu, y, y_size);
+    }
+    state.input = *net.input_gpu;
+    state.delta = 0;
+    if (net.adversarial) {
+        // Buffer receiving the gradient w.r.t. the input; released after the backward pass.
+        state.delta = cuda_make_array(NULL, x_size);
+    }
+    state.truth = *net.truth_gpu;
+    state.train = 1;
+#if defined(CUDNN_HALF) && defined(CUDNN)
+    // Refresh the FP16 copies of the weights from the FP32 weights before the passes.
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        layer l = net.layers[i];
+        if (net.cudnn_half){
+            if (l.type == CONVOLUTIONAL && l.weights_gpu && l.weights_gpu16) {
+                assert((l.nweights) > 0);
+                cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
+            }
+            // NOTE(review): only input_layer's buffers are checked here, but
+            // self_layer and output_layer are converted as well.
+            else if (l.type == CRNN && l.input_layer->weights_gpu && l.input_layer->weights_gpu16) {
+                assert((l.input_layer->c*l.input_layer->n*l.input_layer->size*l.input_layer->size) > 0);
+                cuda_convert_f32_to_f16(l.input_layer->weights_gpu, l.input_layer->nweights, l.input_layer->weights_gpu16);
+                cuda_convert_f32_to_f16(l.self_layer->weights_gpu, l.self_layer->nweights, l.self_layer->weights_gpu16);
+                cuda_convert_f32_to_f16(l.output_layer->weights_gpu, l.output_layer->nweights, l.output_layer->weights_gpu16);
+            }
+            // NOTE(review): only wf's buffers are checked; the other sub-layers are assumed valid.
+            else if (l.type == CONV_LSTM && l.wf->weights_gpu && l.wf->weights_gpu16) {
+                assert((l.wf->c * l.wf->n * l.wf->size * l.wf->size) > 0);
+                if (l.peephole) {
+                    cuda_convert_f32_to_f16(l.vf->weights_gpu, l.vf->nweights, l.vf->weights_gpu16);
+                    cuda_convert_f32_to_f16(l.vi->weights_gpu, l.vi->nweights, l.vi->weights_gpu16);
+                    cuda_convert_f32_to_f16(l.vo->weights_gpu, l.vo->nweights, l.vo->weights_gpu16);
+                }
+                cuda_convert_f32_to_f16(l.wf->weights_gpu, l.wf->nweights, l.wf->weights_gpu16);
+                if (!l.bottleneck) {
+                    cuda_convert_f32_to_f16(l.wi->weights_gpu, l.wi->nweights, l.wi->weights_gpu16);
+                    cuda_convert_f32_to_f16(l.wg->weights_gpu, l.wg->nweights, l.wg->weights_gpu16);
+                    cuda_convert_f32_to_f16(l.wo->weights_gpu, l.wo->nweights, l.wo->weights_gpu16);
+                }
+                cuda_convert_f32_to_f16(l.uf->weights_gpu, l.uf->nweights, l.uf->weights_gpu16);
+                cuda_convert_f32_to_f16(l.ui->weights_gpu, l.ui->nweights, l.ui->weights_gpu16);
+                cuda_convert_f32_to_f16(l.ug->weights_gpu, l.ug->nweights, l.ug->weights_gpu16);
+                cuda_convert_f32_to_f16(l.uo->weights_gpu, l.uo->nweights, l.uo->weights_gpu16);
+            }
+        }
+    }
+#endif
+    forward_network_gpu(net, state);
+    //cudaStreamSynchronize(get_cuda_stream());
+    backward_network_gpu(net, state);
+
+    if (net.adversarial) {
+        cuda_free(state.delta);
+        // Hand the input as it now stands on the GPU back to the caller's buffer.
+        cuda_pull_array(*net.input_gpu, x, x_size);
+    }
+    // Report the rewritten-box percentage only when at least one box was counted.
+    if(*(state.net.total_bbox) > 0)
+        fprintf(stderr, " total_bbox = %d, rewritten_bbox = %f %% \n", *(state.net.total_bbox), 100 * (float)*(state.net.rewritten_bbox) / *(state.net.total_bbox));
+}
+
+// Train on one batch (inputs x, truths y) on the GPU and return the network cost.
+// Advances *net.seen by net.batch. When net.adversarial_lr is set, the random
+// draw succeeds and burn-in is over, an extra adversarial pass first rewrites
+// x in place before the regular forward/backward pass.
+// The weight update itself is not performed here (the calls are commented out).
+float train_network_datum_gpu(network net, float *x, float *y)
+{
+    *net.seen += net.batch;
+    if (net.adversarial_lr && rand_int(0, 1) == 1 && get_current_iteration(net) > net.burn_in) {
+        // `net` is a by-value copy, so these field changes stay local to this call.
+        net.adversarial = 1;
+        float lr_old = net.learning_rate;
+        // The adversarial step scales with training progress (iteration / max_batches).
+        float scale = (get_current_iteration(net) / ((float)net.max_batches));
+        //scale = sin(scale * M_PI);
+        net.learning_rate = net.adversarial_lr * scale;
+        //layer l = net.layers[net.n - 1];
+        int y_size = get_network_output_size(net)*net.batch;
+        if (net.layers[net.n - 1].truths) y_size = net.layers[net.n - 1].truths*net.batch;
+        // Truth buffer for the adversarial pass; presumably zero-filled by xcalloc -- TODO confirm.
+        float *truth_cpu = (float *)xcalloc(y_size, sizeof(float));
+
+        // Keep a copy of the original batch so selected images can be restored below.
+        const int img_size = net.w*net.h*net.c;
+        float *old_input = (float *)xcalloc(img_size*net.batch, sizeof(float));
+        memcpy(old_input, x, img_size*net.batch * sizeof(float));
+
+        printf("\n adversarial training, adversarial_lr = %f \n", net.adversarial_lr * scale);
+
+        forward_backward_network_gpu(net, x, truth_cpu);
+
+        int b;
+        // With net.contrastive, every odd-indexed image gets its original pixels back.
+        for (b = 0; b < net.batch; ++b) {
+            if (b % 2 == 1 && net.contrastive) {
+                //printf(" b = %d old img, ", b);
+                memcpy(x + img_size*b, old_input + img_size*b, img_size * sizeof(float));
+            }
+        }
+
+        // Wrap x (without copying) in an image header for display.
+        image im;
+        im.w = net.w;
+        im.h = net.h;
+        im.c = net.c;
+        im.data = x;
+        show_image(im, "adversarial data augmentation");
+        resize_window_cv("adversarial data augmentation", 500, 500);
+        wait_key_cv(1);
+
+        free(old_input);
+        free(truth_cpu);
+        net.learning_rate = lr_old;
+        net.adversarial = 0;
+    }
+    forward_backward_network_gpu(net, x, y);
+    float error = get_network_cost(net);
+    //if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net);
+    // NOTE(review): `sequence` is only referenced by the commented-out line below.
+    const int sequence = get_sequence_value(net);
+    //if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net);
+
+    return error;
+}
+
+// Arguments handed to train_thread; filled in by train_network_in_thread.
+typedef struct {
+    network net;   // network to train (stored by value)
+    data d;        // data passed on to train_network
+    float *err;    // where train_thread stores train_network's return value
+} train_args;
+
+// pthread entry point: trains one network on its data.
+// `ptr` is a heap-allocated train_args that this function takes ownership of
+// and frees; the training result is written through args.err.
+void *train_thread(void *ptr)
+{
+    train_args *heap_args = (train_args *)ptr;
+    train_args job = *heap_args;   // copy out before releasing the heap block
+    free(heap_args);
+
+    cuda_set_device(job.net.gpu_index);
+    const float loss = train_network(job.net, job.d);
+    *job.err = loss;
+    return NULL;
+}
+
+// Start train_thread on a new pthread for (net, d) and return the thread handle.
+// The result of the training call is written to *err by the thread; the caller
+// is expected to pthread_join the returned handle before reading it.
+// The argument block is heap-allocated here and freed by train_thread.
+pthread_t train_network_in_thread(network net, data d, float *err)
+{
+    pthread_t thread;
+    // Use the project's checked allocator (already used elsewhere in this file)
+    // instead of a bare calloc, whose NULL result would be dereferenced below.
+    train_args *ptr = (train_args *)xcalloc(1, sizeof(train_args));
+    ptr->net = net;
+    ptr->d = d;
+    ptr->err = err;
+    if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed", DARKNET_LOC);
+    return thread;
+}
+
+// Copy a layer's accumulated parameter updates with cuda_pull_array
+// (GPU buffer -> host array, going by the argument names).
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void pull_updates(layer l)
+{
+    switch (l.type) {
+    case CONVOLUTIONAL:
+        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+        // scale updates are transferred only when the host array exists
+        if (l.scale_updates) {
+            cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
+        }
+        break;
+    case CONNECTED:
+        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+        break;
+    default:
+        break;
+    }
+}
+
+// Copy a layer's parameter updates with cuda_push_array
+// (host array -> GPU buffer, going by the argument names).
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void push_updates(layer l)
+{
+    if (l.type == CONNECTED) {
+        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+        return;
+    }
+    if (l.type != CONVOLUTIONAL) return;
+
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    // scale updates are transferred only when the host array exists
+    if (l.scale_updates) {
+        cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
+    }
+}
+
+// Run the GPU update callback of a single layer, using the network's current
+// rate, momentum, decay and loss scale. Does nothing when the layer has no
+// update_gpu callback.
+void update_layer(layer l, network net)
+{
+    const int samples_per_update = net.batch*net.subdivisions;
+    const float current_rate = get_current_rate(net);
+    // `l` is a by-value copy; t is set on the copy that is passed to update_gpu.
+    l.t = get_current_batch(net);
+
+    if (!l.update_gpu) return;
+    l.update_gpu(l, samples_per_update, current_rate, net.momentum, net.decay, net.loss_scale);
+}
+
+// Accumulate the host-side parameters of `l` into `base` via axpy_cpu with a
+// factor of 1 (biases, weights and, for convolutional layers that have them, scales).
+// Layer types other than CONVOLUTIONAL and CONNECTED are left untouched.
+void merge_weights(layer l, layer base)
+{
+    switch (l.type) {
+    case CONVOLUTIONAL:
+        axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1);
+        axpy_cpu(l.nweights, 1, l.weights, 1, base.weights, 1);
+        if (l.scales) axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1);
+        break;
+    case CONNECTED:
+        axpy_cpu(l.outputs, 1, l.biases, 1, base.biases, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, base.weights, 1);
+        break;
+    default:
+        break;
+    }
+}
+
+// Multiply the host-side parameters of `l` by `s` via scal_cpu (biases,
+// weights and, for convolutional layers that have them, scales).
+// Layer types other than CONVOLUTIONAL and CONNECTED are left untouched.
+void scale_weights(layer l, float s)
+{
+    if (l.type == CONNECTED) {
+        scal_cpu(l.outputs, s, l.biases, 1);
+        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
+        return;
+    }
+    if (l.type != CONVOLUTIONAL) return;
+
+    scal_cpu(l.n, s, l.biases, 1);
+    scal_cpu(l.nweights, s, l.weights, 1);
+    if (l.scales) scal_cpu(l.n, s, l.scales, 1);
+}
+
+
+// Copy a layer's parameters with cuda_pull_array
+// (GPU buffer -> host array, going by the argument names).
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void pull_weights(layer l)
+{
+    if (l.type == CONNECTED) {
+        cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
+        cuda_pull_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
+        return;
+    }
+    if (l.type != CONVOLUTIONAL) return;
+
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
+    // scales are transferred only when the host array exists
+    if (l.scales) {
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+    }
+}
+
+// Copy a layer's parameters with cuda_push_array
+// (host array -> GPU buffer, going by the argument names).
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void push_weights(layer l)
+{
+    switch (l.type) {
+    case CONVOLUTIONAL:
+        cuda_push_array(l.biases_gpu, l.biases, l.n);
+        cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+        // scales are transferred only when the host array exists
+        if (l.scales) {
+            cuda_push_array(l.scales_gpu, l.scales, l.n);
+        }
+        break;
+    case CONNECTED:
+        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
+        break;
+    default:
+        break;
+    }
+}
+
+// Push the host-side parameters of `base` into the GPU buffers of `l`.
+// Element counts come from `l`; scales are pushed only when `base` has them.
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void distribute_weights(layer l, layer base)
+{
+    switch (l.type) {
+    case CONVOLUTIONAL:
+        cuda_push_array(l.biases_gpu, base.biases, l.n);
+        cuda_push_array(l.weights_gpu, base.weights, l.nweights);
+        if (base.scales) {
+            cuda_push_array(l.scales_gpu, base.scales, l.n);
+        }
+        break;
+    case CONNECTED:
+        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
+        break;
+    default:
+        break;
+    }
+}
+
+
+// Accumulate the host-side parameter updates of `l` into `base` via axpy_cpu
+// with a factor of 1 (bias, weight and, when present, scale updates).
+// Layer types other than CONVOLUTIONAL and CONNECTED are left untouched.
+void merge_updates(layer l, layer base)
+{
+    switch (l.type) {
+    case CONVOLUTIONAL:
+        axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
+        axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1);
+        if (l.scale_updates) axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
+        break;
+    case CONNECTED:
+        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
+        break;
+    default:
+        break;
+    }
+}
+
+// Push the host-side parameter updates of `base` into the GPU buffers of `l`.
+// Element counts come from `l`; scale updates are pushed only when `base` has them.
+// Only CONVOLUTIONAL and CONNECTED layers are handled; other types are a no-op.
+void distribute_updates(layer l, layer base)
+{
+    if (l.type == CONNECTED) {
+        cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
+        cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
+        return;
+    }
+    if (l.type != CONVOLUTIONAL) return;
+
+    cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
+    cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights);
+    if (base.scale_updates) {
+        cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
+    }
+}
+
+// Average layer `j` across the `n` networks in `nets`.
+// The parameters of every network are pulled to the host, summed into
+// nets[0]'s layer, scaled by 1/n, and the result is pushed back to all
+// networks (including nets[0]).
+void sync_layer(network *nets, int n, int j)
+{
+    network first = nets[0];
+    layer base = first.layers[j];
+    int k;
+
+    cuda_set_device(first.gpu_index);
+    pull_weights(base);
+
+    // Sum the other networks' parameters into base.
+    for (k = 1; k < n; ++k) {
+        network *peer = &nets[k];
+        cuda_set_device(peer->gpu_index);
+        layer peer_layer = peer->layers[j];
+        pull_weights(peer_layer);
+        merge_weights(peer_layer, base);
+    }
+
+    scale_weights(base, 1./n);
+
+    // Hand the averaged parameters back to every device.
+    for (k = 0; k < n; ++k) {
+        network *peer = &nets[k];
+        cuda_set_device(peer->gpu_index);
+        layer peer_layer = peer->layers[j];
+        distribute_weights(peer_layer, base);
+    }
+}
+
+// Arguments handed to sync_layer_thread; filled in by sync_layer_in_thread.
+typedef struct{
+    network *nets; // array of networks forwarded to sync_layer
+    int n;         // number of networks in `nets`
+    int j;         // index of the layer to synchronize
+} sync_args;
+
+// pthread entry point: synchronizes one layer across networks.
+// `ptr` is a heap-allocated sync_args that this function takes ownership of
+// and frees once sync_layer has returned.
+void *sync_layer_thread(void *ptr)
+{
+    sync_args *heap_args = (sync_args *)ptr;
+    const sync_args job = *heap_args;
+
+    sync_layer(job.nets, job.n, job.j);
+    free(heap_args);
+    return NULL;
+}
+
+// Start sync_layer_thread on a new pthread to synchronize layer `j` across the
+// `n` networks in `nets`, and return the thread handle for the caller to join.
+// The argument block is heap-allocated here and freed by sync_layer_thread.
+pthread_t sync_layer_in_thread(network *nets, int n, int j)
+{
+    pthread_t thread;
+    // Use the project's checked allocator (already used elsewhere in this file)
+    // instead of a bare calloc, whose NULL result would be dereferenced below.
+    sync_args *ptr = (sync_args *)xcalloc(1, sizeof(sync_args));
+    ptr->nets = nets;
+    ptr->n = n;
+    ptr->j = j;
+    if(pthread_create(&thread, 0, sync_layer_thread, ptr)) error("Thread creation failed", DARKNET_LOC);
+    return thread;
+}
+
+// Synchronize every layer of the `n` networks in `nets`, one thread per layer.
+// Before syncing, nets[0]'s `seen` counter is advanced by
+// interval * (n-1) * batch * subdivisions and copied to all networks.
+// Blocks until all per-layer threads have been joined.
+void sync_nets(network *nets, int n, int interval)
+{
+    int j;
+    int layers = nets[0].n;
+    // Checked allocation (xcalloc is already used elsewhere in this file): a
+    // bare calloc returning NULL would be written through in the loop below.
+    pthread_t *threads = (pthread_t *) xcalloc(layers, sizeof(pthread_t));
+
+    *nets[0].seen += interval * (n-1) * nets[0].batch * nets[0].subdivisions;
+    for (j = 0; j < n; ++j){
+        *nets[j].seen = *nets[0].seen;
+    }
+    // Launch all layer syncs first, then wait for them, so they can overlap.
+    for (j = 0; j < layers; ++j) {
+        threads[j] = sync_layer_in_thread(nets, n, j);
+    }
+    for (j = 0; j < layers; ++j) {
+        pthread_join(threads[j], 0);
+    }
+    free(threads);
+}
+
+// Train `n` networks in parallel, one thread per network, each on its own part
+// of `d` (get_data_part(d, i, n)), and return the mean of their errors.
+// Afterwards nets[0]'s iteration counter is advanced by n-1, `seen` is
+// recomputed from it, and the networks are synchronized whenever the current
+// iteration is a multiple of `interval`.
+float train_networks(network *nets, int n, data d, int interval)
+{
+    int i;
+#ifdef _DEBUG
+    int batch = nets[0].batch;
+    int subdivisions = nets[0].subdivisions;
+    assert(batch * subdivisions * n == d.X.rows);
+#endif
+    // Checked allocations (xcalloc is already used elsewhere in this file): the
+    // bare calloc results were indexed without a NULL check.
+    pthread_t *threads = (pthread_t *) xcalloc(n, sizeof(pthread_t));
+    float *errors = (float *) xcalloc(n, sizeof(float));
+
+    float sum = 0;
+    for(i = 0; i < n; ++i){
+        data p = get_data_part(d, i, n);
+        threads[i] = train_network_in_thread(nets[i], p, errors + i);
+    }
+    for(i = 0; i < n; ++i){
+        pthread_join(threads[i], 0);
+        //printf("%f\n", errors[i]);
+        sum += errors[i];
+    }
+    //cudaDeviceSynchronize();
+    *nets[0].cur_iteration += (n - 1);
+    *nets[0].seen = nets[0].batch * nets[0].subdivisions * get_current_iteration(nets[0]); // remove this line, when you will save to weights-file both: seen & cur_iteration
+    // interval == 0 would make the modulo undefined; treat it as "never sync".
+    if (interval != 0 && get_current_iteration(nets[0]) % interval == 0)
+    {
+        printf("Syncing... ");
+        fflush(stdout);
+        sync_nets(nets, n, interval);
+        printf("Done!\n");
+    }
+    //cudaDeviceSynchronize();
+    free(threads);
+    free(errors);
+    return (float)sum/(n);
+}
+
+float *get_network_output_layer_gpu(network net, int i)   // returns layer i's host-side output buffer
+{
+    layer l = net.layers[i];
+    if(l.type != REGION && l.type != YOLO && (*net.cuda_graph_ready) == 0) cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);   // the pull is skipped for REGION/YOLO layers and once the graph-ready flag is set
+    return l.output;
+}
+
+float *get_network_output_gpu(network net)   // output of the last layer that is not a COST layer
+{
+    int i;
+    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;   // walk backwards; ends at index 0 if only COST layers were seen
+    return get_network_output_layer_gpu(net, i);
+}
+
+float *network_predict_gpu(network net, float *input)   // forward pass on the GPU; returns the network's host-side output buffer
+{
+    if (net.gpu_index != cuda_get_device())
+        cuda_set_device(net.gpu_index);
+    int size = get_network_input_size(net) * net.batch;   // number of input floats for the whole batch
+    network_state state;
+    state.index = 0;
+    state.net = net;
+    //state.input = cuda_make_array(input, size);   // memory will be allocated in the parse_network_cfg_custom()
+    state.input = net.input_state_gpu;
+    memcpy(net.input_pinned_cpu, input, size * sizeof(float));   // stage the caller's input in the network's host buffer
+    state.truth = 0;
+    state.train = 0;
+    state.delta = 0;
+    // NOTE(review): `instance` is a function-local static, so it is shared by every network passed to this function,
+    //cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
+    static cudaGraphExec_t instance;
+    // while the ready flag tested below is read through the network (*net.cuda_graph_ready).
+    if ((*net.cuda_graph_ready) == 0) {
+        static cudaGraph_t graph;
+        if (net.use_cuda_graph == 1) {
+            int i;
+            for (i = 0; i < 16; ++i) switch_stream(i);
+            // Begin capturing on stream 0; the push and forward pass below are issued while capture is active.
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaDeviceSynchronize());
+            printf("Try to capture graph... \n");
+            //cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
+            CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
+        }
+
+        cuda_push_array(state.input, net.input_pinned_cpu, size);
+        forward_network_gpu(net, state);
+
+        if (net.use_cuda_graph == 1) {
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
+            CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
+            (*net.cuda_graph_ready) = 1;   // later calls take the replay branch below
+            printf(" graph is captured... \n");
+            CHECK_CUDA(cudaDeviceSynchronize());
+        }
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+    }
+    else {
+        cudaStream_t stream0 = switch_stream(0);   // replay the instantiated graph instead of issuing the work again
+        //printf(" cudaGraphLaunch \n");
+        CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
+        CHECK_CUDA( cudaStreamSynchronize(stream0) );
+        //printf(" ~cudaGraphLaunch \n");
+    }
+
+    float *out = get_network_output_gpu(net);
+    reset_wait_stream_events();
+    //cuda_free(state.input);   // will be freed in the free_network()
+    return out;
+}
+
+#ifdef CUDA_OPENGL_INTEGRATION
+float *network_predict_gpu_gl_texture(network net, uint32_t texture_id)   // as network_predict_gpu, but the input is copied from an OpenGL texture
+{
+    if (net.gpu_index != cuda_get_device())
+        cuda_set_device(net.gpu_index);
+    int size = get_network_input_size(net) * net.batch;   // only referenced by the disabled push further down
+
+    // Map the OpenGL texture resource so CUDA can access it.
+    cudaGraphicsResource_t graphics_resource = NULL;
+    unsigned int flags = cudaGraphicsRegisterFlagsReadOnly;
+    CHECK_CUDA(cudaGraphicsGLRegisterImage(&graphics_resource, texture_id, GL_TEXTURE_2D, flags));
+    CHECK_CUDA(cudaGraphicsMapResources(1, &graphics_resource, 0));
+
+    //void* dev_ptr = NULL;
+    cudaArray_t dev_array = NULL;
+    CHECK_CUDA(cudaGraphicsSubResourceGetMappedArray(&dev_array, graphics_resource, 0, 0));
+    // The copy below treats the texture as net.h * net.c rows of net.w floats.
+    size_t width = net.w;
+    size_t height = net.h;
+    size_t pitch = width * sizeof(float);
+
+    CHECK_CUDA(cudaMemcpy2DFromArray(
+            net.input_state_gpu,     // dst
+            pitch,                   // dst_pitch
+            dev_array,               // src
+            0,                       // width offset
+            0,                       // height offset
+            width * sizeof(float),   // width (in bytes)
+            height * net.c,          // height (in rows)
+            cudaMemcpyDeviceToDevice // Transfer type
+    ));
+
+    network_state state;
+    state.index = 0;
+    state.net = net;
+    state.input = net.input_state_gpu;
+    state.truth = 0;
+    state.train = 0;
+    state.delta = 0;
+    // NOTE(review): this function-local static is separate from the `instance` declared in network_predict_gpu,
+    //cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
+    static cudaGraphExec_t instance;
+    // yet both functions test and set the same *net.cuda_graph_ready flag.
+    if ((*net.cuda_graph_ready) == 0) {
+        static cudaGraph_t graph;
+        if (net.use_cuda_graph == 1) {
+            int i;
+            for (i = 0; i < 16; ++i) switch_stream(i);
+
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaDeviceSynchronize());
+            printf("Try to capture graph... \n");
+            //cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
+            CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
+        }
+
+        // cuda_push_array(state.input, net.input_pinned_cpu, size);
+        forward_network_gpu(net, state);   // the input is already on the device, so no host push is issued here
+
+        if (net.use_cuda_graph == 1) {
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
+            CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
+            (*net.cuda_graph_ready) = 1;
+            printf(" graph is captured... \n");
+            CHECK_CUDA(cudaDeviceSynchronize());
+        }
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+    }
+    else {
+        cudaStream_t stream0 = switch_stream(0);
+        //printf(" cudaGraphLaunch \n");
+        CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
+        CHECK_CUDA( cudaStreamSynchronize(stream0) );
+        //printf(" ~cudaGraphLaunch \n");
+    }
+
+    float *out = get_network_output_gpu(net);
+    reset_wait_stream_events();
+    //cuda_free(state.input);   // will be freed in the free_network()
+
+    // Unmap the OpenGL texture.
+    cudaGraphicsUnmapResources(1, &graphics_resource, 0);   // return value is not checked, unlike the calls above
+    cudaGraphicsUnregisterResource(graphics_resource);       // the texture is registered and unregistered on every call
+
+    return out;
+}
+#endif // CUDA_OPENGL_INTEGRATION
diff --git a/darknet-master/src/nightmare.c b/darknet-master/src/nightmare.c
new file mode 100644
index 0000000..5c1ca04
--- /dev/null
+++ b/darknet-master/src/nightmare.c
@@ -0,0 +1,303 @@
+
+#include "network.h"
+#include "parser.h"
+#include "blas.h"
+#include "utils.h"
+
+// ./darknet nightmare cfg/extractor.recon.cfg ~/trained/yolo-coco.conv frame6.png -reconstruct -iters 500 -i 3 -lambda .1 -rate .01 -smooth 2
+
+float abs_mean(float *x, int n)
+{
+    // Mean absolute value of x[0..n-1], accumulated front to back in float.
+    // The division uses n as given (n == 0 is not special-cased).
+    float total = 0;
+    int k;
+    for (k = 0; k < n; ++k) total += fabs(x[k]);
+    return total/n;
+}
+
+void calculate_loss(float *output, float *delta, int n, float thresh)   // rewrites delta in place from output's statistics
+{
+    int i;
+    float mean = mean_array(output, n);
+    float var = variance_array(output, n);
+    for(i = 0; i < n; ++i){
+        if(delta[i] > mean + thresh*sqrt(var)) delta[i] = output[i];   // the value tested is delta[i]; the value kept is output[i]
+        else delta[i] = 0;   // everything at or below mean + thresh*sqrt(var) is zeroed
+    }
+}
+
+void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
+{
+    // One update step on orig.data: run a randomly shifted/flipped, rescaled copy through the first
+    // max_layer+1 layers, backpropagate the thresholded activations, and add rate * result to orig.
+    net->n = max_layer + 1;   // NOTE: truncates the network in place; not restored in this function
+
+    int dx = rand()%16 - 8;
+    int dy = rand()%16 - 8;
+    int flip = rand()%2;
+
+    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
+    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
+    if(flip) flip_image(im);
+
+    resize_network(net, im.w, im.h);
+    layer last = net->layers[net->n-1];   // by-value copy of the final layer struct, taken after the resize
+    //net->layers[net->n - 1].activation = LINEAR;
+
+    image delta = make_image(im.w, im.h, im.c);
+
+    network_state state = {0};
+
+#ifdef GPU
+    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
+    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);   // NOTE(review): created from im.data, not from the zeroed delta image; confirm intended
+
+    forward_network_gpu(*net, state);
+    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
+    // Round-trip the last layer's delta through host memory so calculate_loss can run on the CPU.
+    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
+    calculate_loss(last.delta, last.delta, last.outputs, thresh);   // both arguments are the same buffer here
+    cuda_push_array(last.delta_gpu, last.delta, last.outputs);
+
+    backward_network_gpu(*net, state);
+
+    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);
+    cuda_free(state.input);
+    cuda_free(state.delta);
+#else
+    state.input = im.data;
+    state.delta = delta.data;
+    forward_network(*net, state);
+    copy_cpu(last.outputs, last.output, 1, last.delta, 1);
+    calculate_loss(last.output, last.delta, last.outputs, thresh);
+    backward_network(*net, state);
+#endif
+
+    if(flip) flip_image(delta);   // undo the flip, then the rescale and the shift, so delta lines up with orig
+    //normalize_array(delta.data, delta.w*delta.h*delta.c);
+    image resized = resize_image(delta, orig.w, orig.h);
+    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);
+
+    /*
+       image g = grayscale_image(out);
+       free_image(out);
+       out = g;
+     */
+
+    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);
+
+    if(norm) normalize_array(out.data, out.w*out.h*out.c);
+    axpy_cpu(orig.w*orig.h*orig.c, rate, out.data, 1, orig.data, 1);   // presumably orig += rate * out (axpy); writes through orig.data
+
+    /*
+       normalize_array(orig.data, orig.w*orig.h*orig.c);
+       scale_image(orig, sqrt(var));
+       translate_image(orig, mean);
+     */
+
+    //translate_image(orig, 1);
+    //scale_image(orig, .5);
+    //normalize_image(orig);
+
+    constrain_image(orig);
+
+    free_image(crop);
+    free_image(im);
+    free_image(delta);
+    free_image(resized);
+    free_image(out);
+
+}
+
+void smooth(image recon, image update, float lambda, int num)   // accumulates a neighbourhood-difference term into update.data
+{
+    int i, j, k;     // i: column, j: row, k: channel of the pixel being updated
+    int ii, jj;      // column / row of the neighbour being visited
+    for(k = 0; k < recon.c; ++k){
+        for(j = 0; j < recon.h; ++j){
+            for(i = 0; i < recon.w; ++i){
+                int out_index = i + recon.w*(j + recon.h*k);
+                for(jj = j-num; jj <= j + num && jj < recon.h; ++jj){   // rows j-num..j+num, cut off at the bottom edge
+                    if (jj < 0) continue;   // rows above the image are skipped
+                    for(ii = i-num; ii <= i + num && ii < recon.w; ++ii){   // columns i-num..i+num, cut off at the right edge
+                        if (ii < 0) continue;   // columns left of the image are skipped
+                        int in_index = ii + recon.w*(jj + recon.h*k);   // neighbour in the same channel k
+                        update.data[out_index] += lambda * (recon.data[in_index] - recon.data[out_index]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
+{
+    int iter = 0;   // each iteration: forward/backward with features as truth, then a momentum-style step on recon
+    for (iter = 0; iter < iters; ++iter) {
+        image delta = make_image(recon.w, recon.h, recon.c);
+
+        network_state state = {0};
+#ifdef GPU
+        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
+        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
+        state.truth = cuda_make_array(features, get_network_output_size(net));
+
+        forward_network_gpu(net, state);
+        backward_network_gpu(net, state);
+
+        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);
+        // The three device buffers are created and released again on every iteration.
+        cuda_free(state.input);
+        cuda_free(state.delta);
+        cuda_free(state.truth);
+#else
+        state.input = recon.data;
+        state.delta = delta.data;
+        state.truth = features;
+
+        forward_network(net, state);
+        backward_network(net, state);
+#endif
+        // update gets delta plus the smoothing term, recon gets rate * update, then update is scaled by momentum.
+        axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
+        smooth(recon, update, lambda, smooth_size);
+
+        axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
+        scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);
+
+        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
+        //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);
+
+        constrain_image(recon);
+        free_image(delta);
+    }
+}
+
+
+void run_nightmare(int argc, char **argv)
+{
+    // Command-line driver: argv[2..5] are cfg, weights, image and layer; the remaining
+    // flags are read with find_*_arg below. Each round runs `iters` update steps on the
+    // image, saves it, then optionally rotates and zooms it before the next round.
+    srand(time(0));
+    if(argc < 6){   // argv[4] and argv[5] are read unconditionally below (the guard used to be argc < 4)
+        fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [layer] [options! (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[2];
+    char *weights = argv[3];
+    char *input = argv[4];
+    int max_layer = atoi(argv[5]);
+
+    int range = find_int_arg(argc, argv, "-range", 1);
+    int norm = find_int_arg(argc, argv, "-norm", 1);
+    int rounds = find_int_arg(argc, argv, "-rounds", 1);
+    int iters = find_int_arg(argc, argv, "-iters", 10);
+    int octaves = find_int_arg(argc, argv, "-octaves", 4);
+    float zoom = find_float_arg(argc, argv, "-zoom", 1.);
+    float rate = find_float_arg(argc, argv, "-rate", .04);
+    float thresh = find_float_arg(argc, argv, "-thresh", 1.);
+    float rotate = find_float_arg(argc, argv, "-rotate", 0);
+    float momentum = find_float_arg(argc, argv, "-momentum", .9);
+    float lambda = find_float_arg(argc, argv, "-lambda", .01);
+    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    int reconstruct = find_arg(argc, argv, "-reconstruct");
+    int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
+
+    network net = parse_network_cfg(cfg);
+    load_weights(&net, weights);
+    char *cfgbase = basecfg(cfg);
+    char *imbase = basecfg(input);
+
+    set_batch_network(&net, 1);
+    image im = load_image_color(input, 0, 0);
+    if(0){
+        float scale = 1;
+        if(im.w > 512 || im.h > 512){
+            if(im.w > im.h) scale = 512.0/im.w;
+            else scale = 512.0/im.h;
+        }
+        image resized = resize_image(im, scale*im.w, scale*im.h);
+        free_image(im);
+        im = resized;
+    }
+
+    float *features = 0;
+    image update;   // only assigned and used on the -reconstruct path
+    if (reconstruct){
+        resize_network(&net, im.w, im.h);
+
+        int zz = 0;
+        network_predict(net, im.data);
+        image out_im = get_network_image(net);
+        image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
+        //flip_image(crop);
+        image f_im = resize_image(crop, out_im.w, out_im.h);
+        free_image(crop);
+        printf("%d features\n", out_im.w*out_im.h*out_im.c);
+        // A same-size resize_image copy of im used to be made here; it was never read before the
+        // free_image(im) below and only leaked the loaded image, so it has been dropped.
+        f_im = resize_image(f_im, f_im.w, f_im.h);   // NOTE(review): the previous f_im buffer is not freed
+        features = f_im.data;
+
+        int i;
+        for(i = 0; i < f_im.w*f_im.h*f_im.c; ++i){   // was a fixed 14*14*512, which can overrun a smaller feature buffer
+            features[i] += rand_uniform(-.19, .19);
+        }
+
+        free_image(im);
+        im = make_random_image(im.w, im.h, im.c);
+        update = make_image(im.w, im.h, im.c);
+
+    }
+
+    int e, n;   // e: round index, n: iteration index within a round
+    for(e = 0; e < rounds; ++e){
+        fprintf(stderr, "Iteration: ");
+        fflush(stderr);
+        for(n = 0; n < iters; ++n){
+            fprintf(stderr, "%d, ", n);
+            fflush(stderr);
+            if(reconstruct){
+                reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1);
+                //if ((n+1)%30 == 0) rate *= .5;
+                show_image(im, "reconstruction");
+#ifdef OPENCV
+                wait_key_cv(10);
+#endif
+            }else{
+                int layer = max_layer + rand()%range - range/2;   // NOTE(review): -range 0 or -octaves 0 would divide by zero
+                int octave = rand()%octaves;
+                optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
+            }
+        }
+        fprintf(stderr, "done\n");
+        if(0){
+            image g = grayscale_image(im);
+            free_image(im);
+            im = g;
+        }
+        char buff[256];
+        if (prefix){
+            snprintf(buff, sizeof(buff), "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e);
+        }else{
+            snprintf(buff, sizeof(buff), "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e);
+        }
+        printf("%d %s\n", e, buff);
+        save_image(im, buff);
+        // The bounded snprintf calls above replace sprintf: prefix and the base names come from
+        // the command line and could overrun the 256-byte buffer.
+
+        if(rotate){
+            image rot = rotate_image(im, rotate);
+            free_image(im);
+            im = rot;
+        }
+        image crop = crop_image(im, im.w * (1. - zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom);
+        image resized = resize_image(crop, im.w, im.h);
+        free_image(im);
+        free_image(crop);
+        im = resized;
+    }
+}
diff --git a/darknet-master/src/normalization_layer.c b/darknet-master/src/normalization_layer.c
new file mode 100644
index 0000000..d6af621
--- /dev/null
+++ b/darknet-master/src/normalization_layer.c
@@ -0,0 +1,151 @@
+#include "normalization_layer.h"
+#include "blas.h"
+#include "utils.h"
+#include <stdio.h>
+
+layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa)   // builds a NORMALIZATION layer; output shape equals input shape
+{
+    fprintf(stderr, "Local Response Normalization Layer: %d x %d x %d image, %d size\n", w,h,c,size);
+    layer layer = { (LAYER_TYPE)0 };   // the local variable deliberately shares its name with the type
+    layer.type = NORMALIZATION;
+    layer.batch = batch;
+    layer.h = layer.out_h = h;
+    layer.w = layer.out_w = w;
+    layer.c = layer.out_c = c;
+    layer.kappa = kappa;
+    layer.size = size;
+    layer.alpha = alpha;
+    layer.beta = beta;
+    layer.output = (float*)xcalloc(h * w * c * batch, sizeof(float));   // four host buffers, each h*w*c*batch floats
+    layer.delta = (float*)xcalloc(h * w * c * batch, sizeof(float));
+    layer.squared = (float*)xcalloc(h * w * c * batch, sizeof(float));
+    layer.norms = (float*)xcalloc(h * w * c * batch, sizeof(float));
+    layer.inputs = w*h*c;   // per-sample element count (no batch factor)
+    layer.outputs = layer.inputs;
+    // Callbacks used for the forward and backward passes.
+    layer.forward = forward_normalization_layer;
+    layer.backward = backward_normalization_layer;
+    #ifdef GPU
+    layer.forward_gpu = forward_normalization_layer_gpu;
+    layer.backward_gpu = backward_normalization_layer_gpu;
+    // One device buffer per host buffer, created from the host buffer and of the same length.
+    layer.output_gpu =  cuda_make_array(layer.output, h * w * c * batch);
+    layer.delta_gpu =   cuda_make_array(layer.delta, h * w * c * batch);
+    layer.squared_gpu = cuda_make_array(layer.squared, h * w * c * batch);
+    layer.norms_gpu =   cuda_make_array(layer.norms, h * w * c * batch);
+    #endif
+    return layer;
+}
+
+void resize_normalization_layer(layer *layer, int w, int h)   // change spatial size to w x h; channels and batch are kept
+{
+    int c = layer->c;
+    int batch = layer->batch;
+    layer->h = h;
+    layer->w = w;
+    layer->out_h = h;
+    layer->out_w = w;
+    layer->inputs = w*h*c;
+    layer->outputs = layer->inputs;
+    layer->output = (float*)xrealloc(layer->output, h * w * c * batch * sizeof(float));   // host buffers are resized in place
+    layer->delta = (float*)xrealloc(layer->delta, h * w * c * batch * sizeof(float));
+    layer->squared = (float*)xrealloc(layer->squared, h * w * c * batch * sizeof(float));
+    layer->norms = (float*)xrealloc(layer->norms, h * w * c * batch * sizeof(float));
+#ifdef GPU
+    cuda_free(layer->output_gpu);   // device buffers are released and then recreated at the new size
+    cuda_free(layer->delta_gpu);
+    cuda_free(layer->squared_gpu);
+    cuda_free(layer->norms_gpu);
+    layer->output_gpu =  cuda_make_array(layer->output, h * w * c * batch);
+    layer->delta_gpu =   cuda_make_array(layer->delta, h * w * c * batch);
+    layer->squared_gpu = cuda_make_array(layer->squared, h * w * c * batch);
+    layer->norms_gpu =   cuda_make_array(layer->norms, h * w * c * batch);
+#endif
+}
+
+void forward_normalization_layer(const layer layer, network_state state)   // CPU forward pass: fills layer.squared, layer.norms and layer.output
+{
+    int k,b;
+    int w = layer.w;
+    int h = layer.h;
+    int c = layer.c;
+    scal_cpu(w*h*c*layer.batch, 0, layer.squared, 1);
+    // Each batch item is handled on its own w*h*c slice of the buffers.
+    for(b = 0; b < layer.batch; ++b){
+        float *squared = layer.squared + w*h*c*b;
+        float *norms   = layer.norms + w*h*c*b;
+        float *input   = state.input + w*h*c*b;
+        pow_cpu(w*h*c, 2, input, 1, squared, 1);   // presumably squared = input^2 element-wise (blas helper)
+        // Channel 0: start from kappa and add alpha * squared for channels 0 .. size/2 - 1.
+        const_cpu(w*h, layer.kappa, norms, 1);
+        for(k = 0; k < layer.size/2; ++k){
+            axpy_cpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
+        }
+        // NOTE(review): channel size/2 is added neither above nor as `next` below (next starts at 1 + size/2); confirm intended.
+        for(k = 1; k < layer.c; ++k){
+            copy_cpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);   // start channel k from channel k-1's sum
+            int prev = k - ((layer.size-1)/2) - 1;   // channel leaving the window
+            int next = k + (layer.size/2);           // channel entering the window
+            if(prev >= 0)      axpy_cpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
+            if(next < layer.c) axpy_cpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
+        }
+    }
+    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, layer.output, 1);   // presumably output = norms^(-beta)
+    mul_cpu(w*h*c*layer.batch, state.input, 1, layer.output, 1);                // presumably output *= input
+}
+
+void backward_normalization_layer(const layer layer, network_state state)   // CPU backward pass: writes state.delta
+{
+    // TODO This is approximate ;-)
+    // Also this should add in to delta instead of overwriting.
+
+    int w = layer.w;
+    int h = layer.h;
+    int c = layer.c;
+    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, state.delta, 1);   // presumably state.delta = norms^(-beta)
+    mul_cpu(w*h*c*layer.batch, layer.delta, 1, state.delta, 1);                // presumably state.delta *= layer.delta
+}
+
+#ifdef GPU
+void forward_normalization_layer_gpu(const layer layer, network_state state)   // same sequence as forward_normalization_layer, using the *_ongpu helpers and *_gpu buffers
+{
+    int k,b;
+    int w = layer.w;
+    int h = layer.h;
+    int c = layer.c;
+    scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
+    // Each batch item is handled on its own w*h*c slice of the device buffers.
+    for(b = 0; b < layer.batch; ++b){
+        float *squared = layer.squared_gpu + w*h*c*b;
+        float *norms   = layer.norms_gpu + w*h*c*b;
+        float *input   = state.input + w*h*c*b;
+        pow_ongpu(w*h*c, 2, input, 1, squared, 1);
+        // Channel 0: start from kappa and add alpha * squared for channels 0 .. size/2 - 1.
+        const_ongpu(w*h, layer.kappa, norms, 1);
+        for(k = 0; k < layer.size/2; ++k){
+            axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
+        }
+        // NOTE(review): channel size/2 is added neither above nor as `next` below (next starts at 1 + size/2); confirm intended.
+        for(k = 1; k < layer.c; ++k){
+            copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);   // start channel k from channel k-1's sum
+            int prev = k - ((layer.size-1)/2) - 1;   // channel leaving the window
+            int next = k + (layer.size/2);           // channel entering the window
+            if(prev >= 0)      axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
+            if(next < layer.c) axpy_ongpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
+        }
+    }
+    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
+    mul_ongpu(w*h*c*layer.batch, state.input, 1, layer.output_gpu, 1);
+}
+
+void backward_normalization_layer_gpu(const layer layer, network_state state)   // GPU backward pass: writes state.delta
+{
+    // TODO This is approximate ;-)
+    // state.delta is overwritten by the two calls below, not accumulated into.
+    int w = layer.w;
+    int h = layer.h;
+    int c = layer.c;
+    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, state.delta, 1);
+    mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/darknet-master/src/normalization_layer.h b/darknet-master/src/normalization_layer.h
new file mode 100644
index 0000000..2ac9b0f
--- /dev/null
+++ b/darknet-master/src/normalization_layer.h
@@ -0,0 +1,25 @@
+#ifndef NORMALIZATION_LAYER_H
+#define NORMALIZATION_LAYER_H
+
+#include "image.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa);
+void resize_normalization_layer(layer *layer, int w, int h);
+void forward_normalization_layer(const layer layer, network_state state);
+void backward_normalization_layer(const layer layer, network_state state);
+void visualize_normalization_layer(layer layer, char *window);
+
+#ifdef GPU
+void forward_normalization_layer_gpu(const layer layer, network_state state);
+void backward_normalization_layer_gpu(const layer layer, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/option_list.c b/darknet-master/src/option_list.c
new file mode 100644
index 0000000..306f0e3
--- /dev/null
+++ b/darknet-master/src/option_list.c
@@ -0,0 +1,152 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "option_list.h"
+#include "utils.h"
+#include "data.h"
+
+list *read_data_cfg(char *filename)
+{
+    // Parse a "key=value" text file into a new option list; blank lines and lines starting with
+    // '#' or ';' are skipped. Accepted lines are kept alive because the entries point into them.
+    FILE *file = fopen(filename, "r");
+    if(file == 0) file_error(filename);
+    char *line;
+    int nu = 0;   // 1-based line counter used in the diagnostic below
+    list *options = make_list();
+    while((line=fgetl(file)) != 0){
+        ++nu;
+        strip(line);
+        switch(line[0]){
+            case '\0': case '#': case ';':
+                free(line);
+                break;
+            default:
+                if(!read_option(line, options)){
+                    fprintf(stderr, "Config file error line %d, could not parse: %s\n", nu, line);   // message used to read "could parse"
+                    free(line);
+                }
+                break;
+        }
+    }
+    fclose(file);
+    return options;
+}
+
+metadata get_metadata(char *file)   // reads `file` with read_data_cfg and returns its class count and label names
+{
+    metadata m = { 0 };
+    list *options = read_data_cfg(file);
+    // The label file path is taken from "names", falling back to "labels".
+    char *name_list = option_find_str(options, "names", 0);
+    if (!name_list) name_list = option_find_str(options, "labels", 0);
+    if (!name_list) {
+        fprintf(stderr, "No names or labels found\n");   // m.names stays 0 in this case
+    }
+    else {
+        m.names = get_labels(name_list);
+    }
+    m.classes = option_find_int(options, "classes", 2);
+    free_list(options);
+    if(name_list) {   // NOTE(review): name_list is read after free_list(options); assumes free_list leaves the value strings alive - TODO confirm
+        printf("Loaded - names_list: %s, classes = %d \n", name_list, m.classes);
+    }
+    return m;
+}
+
+int read_option(char *s, list *options)
+{
+    // Split "key=value" in place at the first '=' and insert the pair into options.
+    // The stored key and value point into s, so s must outlive the list entry.
+    // Returns 1 on success, 0 when s has no '=' or nothing follows the '='.
+    size_t len = strlen(s);
+    char *eq = strchr(s, '=');
+    // The old test (i == len-1) only caught a trailing '='; a line without any '='
+    // was inserted with a NULL value, which option_unused() then passes to "%s".
+    if(!eq) return 0;
+    // As before, the '=' is overwritten even when the line is then rejected for
+    // having an empty value.
+    *eq = '\0';
+    if((size_t)(eq - s) == len-1) return 0;
+    option_insert(options, s, eq+1);
+    return 1;
+}
+
+void option_insert(list *l, char *key, char *val)
+{
+    kvp *entry = (kvp *)xmalloc(sizeof(kvp));   // new record; stores the key/val pointers as given (no copy)
+    entry->used = 0;
+    entry->val = val;
+    entry->key = key;
+    list_insert(l, entry);
+}
+
+void option_unused(list *l)
+{
+    // Report on stderr every entry whose `used` flag is still 0,
+    // walking the list from front to back.
+    node *cur;
+    for(cur = l->front; cur; cur = cur->next){
+        kvp *entry = (kvp *)cur->val;
+        if(entry->used) continue;
+        fprintf(stderr, "Unused field: '%s = %s'\n", entry->key, entry->val);
+    }
+}
+
+char *option_find(list *l, char *key)
+{
+    // Linear scan for the first entry whose key equals `key`; a hit is
+    // flagged as used and its value returned. Returns 0 when nothing matches.
+    node *cur;
+    for(cur = l->front; cur; cur = cur->next){
+        kvp *entry = (kvp *)cur->val;
+        if(strcmp(entry->key, key) != 0) continue;
+        entry->used = 1;
+        return entry->val;
+    }
+    return 0;
+}
+char *option_find_str(list *l, char *key, char *def)
+{
+    // String value of `key`, or `def` when absent (announced on stderr only if def is non-null).
+    char *found = option_find(l, key);
+    if(!found && def) fprintf(stderr, "%s: Using default '%s'\n", key, def);
+    return found ? found : def;
+}
+
+char *option_find_str_quiet(list *l, char *key, char *def)
+{
+    // String value of `key`, or `def` when absent; prints nothing.
+    char *found = option_find(l, key);
+    return found ? found : def;
+}
+
+int option_find_int(list *l, char *key, int def)
+{
+    // Integer value of `key` (converted with atoi); logs to stderr and returns `def` when absent.
+    char *text = option_find(l, key);
+    if(!text) fprintf(stderr, "%s: Using default '%d'\n", key, def);
+    return text ? atoi(text) : def;
+}
+
+int option_find_int_quiet(list *l, char *key, int def)
+{
+    // Integer value of `key` (converted with atoi), or `def` when absent; prints nothing.
+    char *text = option_find(l, key);
+    return text ? atoi(text) : def;
+}
+
+float option_find_float_quiet(list *l, char *key, float def)
+{
+    // Float value of `key` (converted with atof), or `def` when absent; prints nothing.
+    char *text = option_find(l, key);
+    return text ? atof(text) : def;
+}
+
+float option_find_float(list *l, char *key, float def)
+{
+    // Float value of `key` (converted with atof); logs to stderr and returns `def` when absent.
+    char *text = option_find(l, key);
+    if(!text) fprintf(stderr, "%s: Using default '%lf'\n", key, def);
+    return text ? atof(text) : def;
+}
diff --git a/darknet-master/src/option_list.h b/darknet-master/src/option_list.h
new file mode 100644
index 0000000..3dd66e4
--- /dev/null
+++ b/darknet-master/src/option_list.h
@@ -0,0 +1,38 @@
+#ifndef OPTION_LIST_H
+#define OPTION_LIST_H
+#include "darknet.h"
+#include "list.h"
+
+typedef struct{   // one key/value option entry stored in an option list
+    char *key;    // option name
+    char *val;    // option value as text
+    int used;     // 0 when inserted; set to 1 once a lookup has matched this entry
+} kvp;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+list *read_data_cfg(char *filename);
+int read_option(char *s, list *options);
+void option_insert(list *l, char *key, char *val);
+char *option_find(list *l, char *key);
+char *option_find_str(list *l, char *key, char *def);
+char *option_find_str_quiet(list *l, char *key, char *def);
+int option_find_int(list *l, char *key, int def);
+int option_find_int_quiet(list *l, char *key, int def);
+float option_find_float(list *l, char *key, float def);
+float option_find_float_quiet(list *l, char *key, float def);
+void option_unused(list *l);
+
+//typedef struct {
+//    int classes;
+//    char **names;
+//} metadata;
+
+//LIB_API metadata get_metadata(char *file);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/parser.c b/darknet-master/src/parser.c
new file mode 100644
index 0000000..65606de
--- /dev/null
+++ b/darknet-master/src/parser.c
@@ -0,0 +1,2395 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "activation_layer.h"
+#include "activations.h"
+#include "assert.h"
+#include "avgpool_layer.h"
+#include "batchnorm_layer.h"
+#include "blas.h"
+#include "connected_layer.h"
+#include "convolutional_layer.h"
+#include "cost_layer.h"
+#include "crnn_layer.h"
+#include "crop_layer.h"
+#include "detection_layer.h"
+#include "dropout_layer.h"
+#include "gru_layer.h"
+#include "list.h"
+#include "local_layer.h"
+#include "lstm_layer.h"
+#include "conv_lstm_layer.h"
+#include "maxpool_layer.h"
+#include "normalization_layer.h"
+#include "option_list.h"
+#include "parser.h"
+#include "region_layer.h"
+#include "reorg_layer.h"
+#include "reorg_old_layer.h"
+#include "rnn_layer.h"
+#include "route_layer.h"
+#include "shortcut_layer.h"
+#include "scale_channels_layer.h"
+#include "sam_layer.h"
+#include "softmax_layer.h"
+#include "utils.h"
+#include "upsample_layer.h"
+#include "version.h"
+#include "yolo_layer.h"
+#include "gaussian_yolo_layer.h"
+#include "representation_layer.h"
+
+/* No-op: takes a dropout layer and a network state and does nothing (its only statement is commented out). */
+void empty_func(dropout_layer l, network_state state) {
+    //l.output_gpu = state.input;
+}
+
+/* One section of a parsed cfg file. */
+typedef struct{
+    char *type;      /* section header text including brackets, e.g. "[convolutional]" (see string_to_layer_type) */
+    list *options;   /* list whose nodes hold kvp entries (see free_section) */
+}section;
+
+/* Forward declaration; defined later in this file. */
+list *read_cfg(char *filename);
+
+/*
+ * Map a cfg section header (brackets included, e.g. "[conv]") to its
+ * LAYER_TYPE. Several headers have a short and a long spelling that map to
+ * the same type. Matching is exact and case-sensitive; any unrecognised
+ * header yields BLANK.
+ */
+LAYER_TYPE string_to_layer_type(char * type)
+{
+    static const struct {
+        const char *header;
+        LAYER_TYPE kind;
+    } known[] = {
+        { "[shortcut]",       SHORTCUT },
+        { "[scale_channels]", SCALE_CHANNELS },
+        { "[sam]",            SAM },
+        { "[crop]",           CROP },
+        { "[cost]",           COST },
+        { "[detection]",      DETECTION },
+        { "[region]",         REGION },
+        { "[yolo]",           YOLO },
+        { "[Gaussian_yolo]",  GAUSSIAN_YOLO },
+        { "[local]",          LOCAL },
+        { "[conv]",           CONVOLUTIONAL },
+        { "[convolutional]",  CONVOLUTIONAL },
+        { "[activation]",     ACTIVE },
+        { "[net]",            NETWORK },
+        { "[network]",        NETWORK },
+        { "[crnn]",           CRNN },
+        { "[gru]",            GRU },
+        { "[lstm]",           LSTM },
+        { "[conv_lstm]",      CONV_LSTM },
+        { "[history]",        HISTORY },
+        { "[rnn]",            RNN },
+        { "[conn]",           CONNECTED },
+        { "[connected]",      CONNECTED },
+        { "[max]",            MAXPOOL },
+        { "[maxpool]",        MAXPOOL },
+        { "[local_avg]",      LOCAL_AVGPOOL },
+        { "[local_avgpool]",  LOCAL_AVGPOOL },
+        { "[reorg3d]",        REORG },
+        { "[reorg]",          REORG_OLD },
+        { "[avg]",            AVGPOOL },
+        { "[avgpool]",        AVGPOOL },
+        { "[dropout]",        DROPOUT },
+        { "[lrn]",            NORMALIZATION },
+        { "[normalization]",  NORMALIZATION },
+        { "[batchnorm]",      BATCHNORM },
+        { "[soft]",           SOFTMAX },
+        { "[softmax]",        SOFTMAX },
+        { "[contrastive]",    CONTRASTIVE },
+        { "[route]",          ROUTE },
+        { "[upsample]",       UPSAMPLE },
+        { "[empty]",          EMPTY },
+        { "[silence]",        EMPTY },
+        { "[implicit]",       IMPLICIT },
+    };
+    size_t k;
+
+    for (k = 0; k < sizeof(known) / sizeof(known[0]); ++k) {
+        if (strcmp(type, known[k].header) == 0) return known[k].kind;
+    }
+    return BLANK;
+}
+
+/*
+ * Release a section: its type string, every option node together with the
+ * kvp it holds, the option list itself and finally the section.
+ * Only kvp->key is freed for each entry; kvp->val is not freed separately
+ * (presumably it points into the same allocation as key -- TODO confirm).
+ */
+void free_section(section *s)
+{
+    node *cur;
+    node *following;
+
+    free(s->type);
+    for (cur = s->options->front; cur; cur = following) {
+        kvp *entry = (kvp *)cur->val;
+        following = cur->next;   /* remember the successor before the node goes away */
+        free(entry->key);
+        free(entry);
+        free(cur);
+    }
+    free(s->options);
+    free(s);
+}
+
+/*
+ * Parse up to `n` comma-separated floats from `data` into `a`.
+ *
+ * data: mutable, NUL-terminated text such as "1.5,2,3"; separators are
+ *       overwritten with '\0' while parsing. NULL is accepted and ignored.
+ * a:    destination array with room for at least `n` floats.
+ * n:    maximum number of values to store.
+ *
+ * A token that sscanf() cannot convert (including an empty token) leaves the
+ * corresponding a[i] untouched. Parsing stops after `n` tokens or at the end
+ * of the string, whichever comes first.
+ *
+ * Each token is scanned starting at its first character. The previous
+ * pre-increment scan skipped the first character of the input, which read
+ * past the terminator for an empty string and glued a leading ',' onto the
+ * first token.
+ */
+void parse_data(char *data, float *a, int n)
+{
+    int i;
+    int done = 0;
+    char *curr = data;
+    if(!data) return;
+    for(i = 0; i < n && !done; ++i){
+        char *end = curr;
+        while(*end != '\0' && *end != ',') ++end;
+        if(*end == '\0') done = 1;   /* last token: nothing to terminate */
+        else *end = '\0';            /* cut the token at its separator */
+        sscanf(curr, "%g", &a[i]);
+        curr = end + 1;              /* only used again when a separator was found */
+    }
+}
+
+/* Parameters handed by value to every parse_* layer builder in this file. */
+typedef struct size_params{
+    int batch;        /* batch size forwarded to the make_*_layer constructors */
+    int inputs;       /* flat input count, used by the non-image layers (connected, rnn, softmax, ...) */
+    int h;            /* input height; image layers reject 0 ("must output image") */
+    int w;            /* input width */
+    int c;            /* input channels */
+    int index;        /* index of the layer being parsed; base for negative (relative) layer references */
+    int time_steps;   /* forwarded to the recurrent layer constructors */
+    int train;        /* forwarded to constructors that take a train flag */
+    network net;      /* network under construction; net.layers is indexed for shared/embedding/yolo layer references */
+} size_params;
+
+/*
+ * Build a local layer from the options of a [local] section.
+ * Reads filters, size, stride, pad and activation (each reported on stderr
+ * when defaulted) and requires the previous layer to output an image
+ * (non-zero h, w and c).
+ */
+local_layer parse_local(list *options, size_params params)
+{
+    int filters = option_find_int(options, "filters",1);
+    int ksize = option_find_int(options, "size",1);
+    int stride = option_find_int(options, "stride",1);
+    int pad = option_find_int(options, "pad",0);
+    ACTIVATION activation = get_activation(option_find_str(options, "activation", "logistic"));
+
+    if (params.h == 0 || params.w == 0 || params.c == 0) {
+        error("Layer before local layer must output image.", DARKNET_LOC);
+    }
+
+    return make_local_layer(params.batch, params.h, params.w, params.c, filters, ksize, stride, pad, activation);
+}
+
+/*
+ * Build a convolutional layer from the options of a [convolutional] section.
+ *
+ * options: key/value list of the section.
+ * params:  sizes of the incoming data plus the network under construction.
+ *
+ * Notable option handling:
+ *  - stride_x / stride_y fall back to `stride` when either is missing or < 1.
+ *  - `pad=1` overrides `padding` with size/2.
+ *  - `cbn=1` forces batch_normalize to 2.
+ *  - `share_index` >= 0 is an absolute layer index, a negative value is
+ *    relative to the current layer; the resolved index is now validated
+ *    against the network size before it is used to index net.layers.
+ *  - at most one of sway/rotate/stretch/stretch_sway may be set, and only
+ *    with size != 1.
+ *
+ * Calls error() when the previous layer does not output an image or when the
+ * options are inconsistent.
+ */
+convolutional_layer parse_convolutional(list *options, size_params params)
+{
+    int n = option_find_int(options, "filters",1);
+    int groups = option_find_int_quiet(options, "groups", 1);
+    int size = option_find_int(options, "size",1);
+    int stride = -1;
+    //int stride = option_find_int(options, "stride",1);
+    int stride_x = option_find_int_quiet(options, "stride_x", -1);
+    int stride_y = option_find_int_quiet(options, "stride_y", -1);
+    if (stride_x < 1 || stride_y < 1) {
+        stride = option_find_int(options, "stride", 1);
+        if (stride_x < 1) stride_x = stride;
+        if (stride_y < 1) stride_y = stride;
+    }
+    else {
+        // Both per-axis strides given: still query "stride" (quietly) as the original code did.
+        stride = option_find_int_quiet(options, "stride", 1);
+    }
+    int dilation = option_find_int_quiet(options, "dilation", 1);
+    int antialiasing = option_find_int_quiet(options, "antialiasing", 0);
+    if (size == 1) dilation = 1;
+    int pad = option_find_int_quiet(options, "pad",0);
+    int padding = option_find_int_quiet(options, "padding",0);
+    if(pad) padding = size/2;
+
+    char *activation_s = option_find_str(options, "activation", "logistic");
+    ACTIVATION activation = get_activation(activation_s);
+
+    int assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0);
+
+    // -1000000000 is the "option not present" sentinel for share_index.
+    int share_index = option_find_int_quiet(options, "share_index", -1000000000);
+    convolutional_layer *share_layer = NULL;
+    if (share_index != -1000000000) {
+        int share_abs = (share_index >= 0) ? share_index : params.index + share_index;
+        // Reject references outside the layer array instead of forming a wild pointer.
+        if (share_abs < 0 || share_abs >= params.net.n) {
+            error("Error: share_index in the [convolutional] layer points outside of the network", DARKNET_LOC);
+        }
+        share_layer = &params.net.layers[share_abs];
+    }
+
+    int batch,h,w,c;
+    h = params.h;
+    w = params.w;
+    c = params.c;
+    batch=params.batch;
+    if(!(h && w && c)) error("Layer before convolutional layer must output image.", DARKNET_LOC);
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int cbn = option_find_int_quiet(options, "cbn", 0);
+    if (cbn) batch_normalize = 2;
+    int binary = option_find_int_quiet(options, "binary", 0);
+    int xnor = option_find_int_quiet(options, "xnor", 0);
+    int use_bin_output = option_find_int_quiet(options, "bin_output", 0);
+    int sway = option_find_int_quiet(options, "sway", 0);
+    int rotate = option_find_int_quiet(options, "rotate", 0);
+    int stretch = option_find_int_quiet(options, "stretch", 0);
+    int stretch_sway = option_find_int_quiet(options, "stretch_sway", 0);
+    if ((sway + rotate + stretch + stretch_sway) > 1) {
+        error("Error: should be used only 1 param: sway=1, rotate=1 or stretch=1 in the [convolutional] layer", DARKNET_LOC);
+    }
+    int deform = sway || rotate || stretch || stretch_sway;
+    if (deform && size == 1) {
+        error("Error: params (sway=1, rotate=1 or stretch=1) should be used only with size >=3 in the [convolutional] layer", DARKNET_LOC);
+    }
+
+    convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer, assisted_excitation, deform, params.train);
+    layer.flipped = option_find_int_quiet(options, "flipped", 0);
+    layer.dot = option_find_float_quiet(options, "dot", 0);
+    layer.sway = sway;
+    layer.rotate = rotate;
+    layer.stretch = stretch;
+    layer.stretch_sway = stretch_sway;
+    layer.angle = option_find_float_quiet(options, "angle", 15);
+    layer.grad_centr = option_find_int_quiet(options, "grad_centr", 0);
+    layer.reverse = option_find_float_quiet(options, "reverse", 0);
+    layer.coordconv = option_find_int_quiet(options, "coordconv", 0);
+
+    layer.stream = option_find_int_quiet(options, "stream", -1);
+    layer.wait_stream_id = option_find_int_quiet(options, "wait_stream", -1);
+
+    // Adam hyper-parameters are copied from the network only when Adam is enabled.
+    if(params.net.adam){
+        layer.B1 = params.net.B1;
+        layer.B2 = params.net.B2;
+        layer.eps = params.net.eps;
+    }
+
+    return layer;
+}
+
+/*
+ * Build a convolutional RNN layer from the options of a [crnn] section.
+ * `pad=1` overrides `padding` with size/2. "output" and "hidden" are the only
+ * integer options reported on stderr when defaulted.
+ */
+layer parse_crnn(list *options, size_params params)
+{
+    int ksize = option_find_int_quiet(options, "size", 3);
+    int stride = option_find_int_quiet(options, "stride", 1);
+    int dilation = option_find_int_quiet(options, "dilation", 1);
+    int use_pad = option_find_int_quiet(options, "pad", 0);
+    int padding = option_find_int_quiet(options, "padding", 0);
+    if (use_pad != 0) {
+        padding = ksize / 2;
+    }
+
+    int out_filters = option_find_int(options, "output",1);
+    int hid_filters = option_find_int(options, "hidden",1);
+    int groups = option_find_int_quiet(options, "groups", 1);
+    ACTIVATION activation = get_activation(option_find_str(options, "activation", "logistic"));
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int xnor = option_find_int_quiet(options, "xnor", 0);
+
+    layer result = make_crnn_layer(params.batch, params.h, params.w, params.c, hid_filters, out_filters, groups, params.time_steps, ksize, stride, dilation, padding, activation, batch_normalize, xnor, params.train);
+    result.shortcut = option_find_int_quiet(options, "shortcut", 0);
+    return result;
+}
+
+/*
+ * Build an RNN layer from the options of an [rnn] section
+ * (output, hidden, activation, batch_normalize, logistic, shortcut).
+ */
+layer parse_rnn(list *options, size_params params)
+{
+    int out_count = option_find_int(options, "output",1);
+    int hid_count = option_find_int(options, "hidden",1);
+    ACTIVATION activation = get_activation(option_find_str(options, "activation", "logistic"));
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int logistic = option_find_int_quiet(options, "logistic", 0);
+
+    layer result = make_rnn_layer(params.batch, params.inputs, hid_count, out_count, params.time_steps, activation, batch_normalize, logistic);
+    result.shortcut = option_find_int_quiet(options, "shortcut", 0);
+    return result;
+}
+
+/* Build a GRU layer from the options of a [gru] section (output, batch_normalize). */
+layer parse_gru(list *options, size_params params)
+{
+    int out_count = option_find_int(options, "output",1);
+    int use_bn = option_find_int_quiet(options, "batch_normalize", 0);
+
+    return make_gru_layer(params.batch, params.inputs, out_count, params.time_steps, use_bn);
+}
+
+/* Build an LSTM layer from the options of an [lstm] section (output, batch_normalize). */
+layer parse_lstm(list *options, size_params params)
+{
+    int out_count = option_find_int(options, "output",1);
+    int use_bn = option_find_int_quiet(options, "batch_normalize", 0);
+
+    return make_lstm_layer(params.batch, params.inputs, out_count, params.time_steps, use_bn);
+}
+
+/*
+ * Build a convolutional LSTM layer from the options of a [conv_lstm] section.
+ * `pad=1` overrides `padding` with size/2; state_constrain defaults to
+ * time_steps * 32.
+ */
+layer parse_conv_lstm(list *options, size_params params)
+{
+    // a ConvLSTM with a larger transitional kernel should be able to capture faster motions
+    int ksize = option_find_int_quiet(options, "size", 3);
+    int stride = option_find_int_quiet(options, "stride", 1);
+    int dilation = option_find_int_quiet(options, "dilation", 1);
+    int use_pad = option_find_int_quiet(options, "pad", 0);
+    int padding = option_find_int_quiet(options, "padding", 0);
+    if (use_pad != 0) {
+        padding = ksize / 2;
+    }
+
+    int out_filters = option_find_int(options, "output", 1);
+    int groups = option_find_int_quiet(options, "groups", 1);
+    ACTIVATION activation = get_activation(option_find_str(options, "activation", "linear"));
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int xnor = option_find_int_quiet(options, "xnor", 0);
+    int peephole = option_find_int_quiet(options, "peephole", 0);
+    int bottleneck = option_find_int_quiet(options, "bottleneck", 0);
+
+    layer result = make_conv_lstm_layer(params.batch, params.h, params.w, params.c, out_filters, groups, params.time_steps, ksize, stride, dilation, padding, activation, batch_normalize, peephole, xnor, bottleneck, params.train);
+
+    result.state_constrain = option_find_int_quiet(options, "state_constrain", params.time_steps * 32);
+    result.shortcut = option_find_int_quiet(options, "shortcut", 0);
+    result.lstm_activation = get_activation(option_find_str(options, "lstm_activation", "tanh"));
+    result.time_normalizer = option_find_float_quiet(options, "time_normalizer", 1.0);
+
+    return result;
+}
+
+/* Build a history layer from the options of a [history] section (history_size, default 4). */
+layer parse_history(list *options, size_params params)
+{
+    int depth = option_find_int(options, "history_size", 4);
+    return make_history_layer(params.batch, params.h, params.w, params.c, depth, params.time_steps, params.train);
+}
+
+/*
+ * Build a fully connected layer from the options of a [connected] section
+ * (output, activation, batch_normalize).
+ */
+connected_layer parse_connected(list *options, size_params params)
+{
+    int out_count = option_find_int(options, "output",1);
+    ACTIVATION activation = get_activation(option_find_str(options, "activation", "logistic"));
+    int use_bn = option_find_int_quiet(options, "batch_normalize", 0);
+
+    return make_connected_layer(params.batch, 1, params.inputs, out_count, activation, use_bn);
+}
+
+/*
+ * Build a softmax layer from the options of a [softmax] section.
+ * An optional "tree" file is loaded with read_tree(); the layer's w/h/c are
+ * copied from the incoming sizes.
+ */
+softmax_layer parse_softmax(list *options, size_params params)
+{
+    int groups = option_find_int_quiet(options, "groups", 1);
+    softmax_layer sm = make_softmax_layer(params.batch, params.inputs, groups);
+    char *tree_path;
+
+    sm.temperature = option_find_float_quiet(options, "temperature", 1);
+    tree_path = option_find_str(options, "tree", 0);
+    if (tree_path != 0) {
+        sm.softmax_tree = read_tree(tree_path);
+    }
+    sm.w = params.w;
+    sm.h = params.h;
+    sm.c = params.c;
+    sm.spatial = option_find_float_quiet(options, "spatial", 0);
+    sm.noloss = option_find_int_quiet(options, "noloss", 0);
+    return sm;
+}
+
+/*
+ * Build a contrastive layer from the options of a [contrastive] section.
+ *
+ * `yolo_layer` optionally names the [yolo] layer this layer is attached to:
+ * a negative value is relative to the current layer, 0 (the default) means
+ * "none" and leaves the pointer NULL. When a layer is named it must be of
+ * type YOLO, otherwise error() is called.
+ *
+ * The type check is now skipped when no layer is referenced; previously the
+ * default configuration dereferenced a NULL pointer.
+ * NOTE(review): NULL is passed through to make_contrastive_layer(), which is
+ * presumably prepared for it -- confirm.
+ */
+contrastive_layer parse_contrastive(list *options, size_params params)
+{
+    int classes = option_find_int(options, "classes", 1000);
+    layer *yolo_layer = NULL;
+    int yolo_layer_id = option_find_int_quiet(options, "yolo_layer", 0);
+    if (yolo_layer_id < 0) yolo_layer_id = params.index + yolo_layer_id;
+    if(yolo_layer_id != 0) yolo_layer = params.net.layers + yolo_layer_id;
+    if (yolo_layer && yolo_layer->type != YOLO) {
+        printf(" Error: [contrastive] layer should point to the [yolo] layer instead of %d layer! \n", yolo_layer_id);
+        error("Error!", DARKNET_LOC);
+    }
+
+    contrastive_layer layer = make_contrastive_layer(params.batch, params.w, params.h, params.c, classes, params.inputs, yolo_layer);
+    layer.temperature = option_find_float_quiet(options, "temperature", 1);
+    layer.steps = params.time_steps;
+    layer.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1);
+    layer.max_delta = option_find_float_quiet(options, "max_delta", FLT_MAX);   // set 10
+    layer.contrastive_neg_max = option_find_int_quiet(options, "contrastive_neg_max", 3);
+    return layer;
+}
+
+/*
+ * Parse a comma-separated list of integers (e.g. "0,1,2") into a newly
+ * allocated array.
+ *
+ * a:   text to parse, or NULL. Counting of values stops at a '#'.
+ * num: receives the number of values; left untouched when `a` is NULL.
+ *
+ * Returns the array allocated with xcalloc() (caller frees), or 0 when `a`
+ * is NULL.
+ *
+ * The cursor is only advanced past a comma that actually exists; the old
+ * code computed `strchr(...) + 1` on a NULL result after the last value.
+ */
+int *parse_yolo_mask(char *a, int *num)
+{
+    int *mask = 0;
+    if (a) {
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for (i = 0; i < len; ++i) {
+            if (a[i] == '#') break;
+            if (a[i] == ',') ++n;
+        }
+        mask = (int*)xcalloc(n, sizeof(int));
+        for (i = 0; i < n; ++i) {
+            char *comma;
+            mask[i] = atoi(a);
+            comma = strchr(a, ',');
+            if (!comma) break;      /* no further values */
+            a = comma + 1;
+        }
+        *num = n;
+    }
+    return mask;
+}
+
+/*
+ * Turn a "counters_per_class" string into per-class loss multipliers.
+ *
+ * cpc:       comma-separated per-class counts, or NULL (returns NULL).
+ * classes:   expected number of entries; a mismatch calls error().
+ * max_delta: upper clamp applied to every multiplier.
+ *
+ * Counts below 1 are raised to 1. Each multiplier is max_count / count,
+ * clamped to max_delta. The resulting values are printed to stdout.
+ * Returns a calloc()ed array of `classes` floats owned by the caller.
+ */
+float *get_classes_multipliers(char *cpc, const int classes, const float max_delta)
+{
+    int count = classes;
+    int *per_class;
+    float *multipliers;
+    float largest = 0;
+    int k;
+
+    if (!cpc) return NULL;
+
+    per_class = parse_yolo_mask(cpc, &count);
+    if (count != classes) {
+        printf(" number of values in counters_per_class = %d doesn't match with classes = %d \n", count, classes);
+        error("Error!", DARKNET_LOC);
+    }
+
+    /* Clamp counts to at least 1 and find the largest one. */
+    for (k = 0; k < count; ++k) {
+        if (per_class[k] < 1) per_class[k] = 1;
+        if (largest < per_class[k]) largest = per_class[k];
+    }
+
+    multipliers = (float *)calloc(count, sizeof(float));
+    for (k = 0; k < count; ++k) {
+        float m = largest / per_class[k];
+        multipliers[k] = (m > max_delta) ? max_delta : m;
+    }
+    free(per_class);
+
+    printf(" classes_multipliers: ");
+    for (k = 0; k < count; ++k) {
+        printf("%.1f, ", multipliers[k]);
+    }
+    printf("\n");
+
+    return multipliers;
+}
+
+/*
+ * Build a YOLO detection layer from the options of a [yolo] section.
+ *
+ * options: key/value list of the section.
+ * params:  sizes of the incoming data plus the network under construction.
+ *
+ * Calls error() when the layer's output count does not match the incoming
+ * input count (filters= of the preceding layer vs classes=/mask=).
+ *
+ * Changes relative to the previous version:
+ *  - `embedding_layer` (absolute, or negative = relative to this layer) is
+ *    validated against the network size before net.layers is indexed.
+ *  - the anchors cursor is only advanced past a comma that exists, instead
+ *    of computing `strchr(...) + 1` on a NULL result after the last value.
+ */
+layer parse_yolo(list *options, size_params params)
+{
+    int classes = option_find_int(options, "classes", 20);
+    int total = option_find_int(options, "num", 1);
+    int num = total;
+    char *a = option_find_str(options, "mask", 0);
+    int *mask = parse_yolo_mask(a, &num);
+    int max_boxes = option_find_int_quiet(options, "max", 200);
+    layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes);
+    if (l.outputs != params.inputs) {
+        error("Error: l.outputs == params.inputs, filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [yolo]-layer", DARKNET_LOC);
+    }
+    l.show_details = option_find_int_quiet(options, "show_details", 1);
+    l.max_delta = option_find_float_quiet(options, "max_delta", FLT_MAX);   // set 10
+    char *cpc = option_find_str(options, "counters_per_class", 0);
+    l.classes_multipliers = get_classes_multipliers(cpc, classes, l.max_delta);
+
+    l.label_smooth_eps = option_find_float_quiet(options, "label_smooth_eps", 0.0f);
+    l.scale_x_y = option_find_float_quiet(options, "scale_x_y", 1);
+    l.objectness_smooth = option_find_int_quiet(options, "objectness_smooth", 0);
+    l.new_coords = option_find_int_quiet(options, "new_coords", 0);
+    l.iou_normalizer = option_find_float_quiet(options, "iou_normalizer", 0.75);
+    l.obj_normalizer = option_find_float_quiet(options, "obj_normalizer", 1);
+    l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1);
+    l.delta_normalizer = option_find_float_quiet(options, "delta_normalizer", 1);
+    char *iou_loss = option_find_str_quiet(options, "iou_loss", "mse");   //  "iou");
+
+    // Any unrecognised iou_loss name falls back to IOU.
+    if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE;
+    else if (strcmp(iou_loss, "giou") == 0) l.iou_loss = GIOU;
+    else if (strcmp(iou_loss, "diou") == 0) l.iou_loss = DIOU;
+    else if (strcmp(iou_loss, "ciou") == 0) l.iou_loss = CIOU;
+    else l.iou_loss = IOU;
+    fprintf(stderr, "[yolo] params: iou loss: %s (%d), iou_norm: %2.2f, obj_norm: %2.2f, cls_norm: %2.2f, delta_norm: %2.2f, scale_x_y: %2.2f\n",
+        iou_loss, l.iou_loss, l.iou_normalizer, l.obj_normalizer, l.cls_normalizer, l.delta_normalizer, l.scale_x_y);
+
+    char *iou_thresh_kind_str = option_find_str_quiet(options, "iou_thresh_kind", "iou");
+    if (strcmp(iou_thresh_kind_str, "iou") == 0) l.iou_thresh_kind = IOU;
+    else if (strcmp(iou_thresh_kind_str, "giou") == 0) l.iou_thresh_kind = GIOU;
+    else if (strcmp(iou_thresh_kind_str, "diou") == 0) l.iou_thresh_kind = DIOU;
+    else if (strcmp(iou_thresh_kind_str, "ciou") == 0) l.iou_thresh_kind = CIOU;
+    else {
+        fprintf(stderr, " Wrong iou_thresh_kind = %s \n", iou_thresh_kind_str);
+        l.iou_thresh_kind = IOU;
+    }
+
+    l.beta_nms = option_find_float_quiet(options, "beta_nms", 0.6);
+    char *nms_kind = option_find_str_quiet(options, "nms_kind", "default");
+    if (strcmp(nms_kind, "default") == 0) l.nms_kind = DEFAULT_NMS;
+    else {
+        if (strcmp(nms_kind, "greedynms") == 0) l.nms_kind = GREEDY_NMS;
+        else if (strcmp(nms_kind, "diounms") == 0) l.nms_kind = DIOU_NMS;
+        else l.nms_kind = DEFAULT_NMS;
+        printf("nms_kind: %s (%d), beta = %f \n", nms_kind, l.nms_kind, l.beta_nms);
+    }
+
+    l.jitter = option_find_float(options, "jitter", .2);
+    l.resize = option_find_float_quiet(options, "resize", 1.0);
+    l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
+
+    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
+    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
+    l.iou_thresh = option_find_float_quiet(options, "iou_thresh", 1); // recommended to use iou_thresh=0.213 in [yolo]
+    l.random = option_find_float_quiet(options, "random", 0);
+
+    l.track_history_size = option_find_int_quiet(options, "track_history_size", 5);
+    l.sim_thresh = option_find_float_quiet(options, "sim_thresh", 0.8);
+    l.dets_for_track = option_find_int_quiet(options, "dets_for_track", 1);
+    l.dets_for_show = option_find_int_quiet(options, "dets_for_show", 1);
+    l.track_ciou_norm = option_find_float_quiet(options, "track_ciou_norm", 0.01);
+    // 999999 is the "option not present" sentinel for embedding_layer.
+    int embedding_layer_id = option_find_int_quiet(options, "embedding_layer", 999999);
+    if (embedding_layer_id < 0) embedding_layer_id = params.index + embedding_layer_id;
+    if (embedding_layer_id != 999999) {
+        // Reject references outside the layer array before indexing it.
+        if (embedding_layer_id < 0 || embedding_layer_id >= params.net.n) {
+            error("Error: embedding_layer in the [yolo] layer points outside of the network", DARKNET_LOC);
+        }
+        printf(" embedding_layer_id = %d, ", embedding_layer_id);
+        layer le = params.net.layers[embedding_layer_id];
+        l.embedding_layer_id = embedding_layer_id;
+        l.embedding_output = (float*)xcalloc(le.batch * le.outputs, sizeof(float));
+        l.embedding_size = le.n / l.n;
+        printf(" embedding_size = %d \n", l.embedding_size);
+        if (le.n % l.n != 0) {
+            printf(" Warning: filters=%d number in embedding_layer=%d isn't divisable by number of anchors %d \n", le.n, embedding_layer_id, l.n);
+        }
+    }
+
+    char *map_file = option_find_str(options, "map", 0);
+    if (map_file) l.map = read_map(map_file);
+
+    a = option_find_str(options, "anchors", 0);
+    if (a) {
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for (i = 0; i < len; ++i) {
+            if (a[i] == '#') break;
+            if (a[i] == ',') ++n;
+        }
+        // At most total*2 values are stored in l.biases.
+        for (i = 0; i < n && i < total*2; ++i) {
+            char *comma;
+            l.biases[i] = atof(a);
+            comma = strchr(a, ',');
+            if (!comma) break;      // no further values
+            a = comma + 1;
+        }
+    }
+    return l;
+}
+
+
+/*
+ * Parse a comma-separated list of integers (e.g. "0,1,2") into a newly
+ * allocated array.
+ *
+ * a:   text to parse, or NULL. Counting of values stops at a '#'.
+ * num: receives the number of values; left untouched when `a` is NULL.
+ *
+ * Returns a calloc()ed array owned by the caller, or 0 when `a` is NULL.
+ * Terminates the process with a message on stderr if the allocation fails
+ * (previously the NULL result was written through).
+ *
+ * The cursor is only advanced past a comma that actually exists; the old
+ * code computed `strchr(...) + 1` on a NULL result after the last value.
+ */
+int *parse_gaussian_yolo_mask(char *a, int *num) // Gaussian_YOLOv3
+{
+    int *mask = 0;
+    if (a) {
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for (i = 0; i < len; ++i) {
+            if (a[i] == '#') break;
+            if (a[i] == ',') ++n;
+        }
+        mask = (int *)calloc(n, sizeof(int));
+        if (!mask) {
+            fprintf(stderr, "parse_gaussian_yolo_mask: calloc failed\n");
+            exit(EXIT_FAILURE);
+        }
+        for (i = 0; i < n; ++i) {
+            char *comma;
+            mask[i] = atoi(a);
+            comma = strchr(a, ',');
+            if (!comma) break;      /* no further values */
+            a = comma + 1;
+        }
+        *num = n;
+    }
+    return mask;
+}
+
+
+/*
+ * Build a Gaussian YOLO layer from the options of a [Gaussian_yolo] section.
+ *
+ * options: key/value list of the section.
+ * params:  sizes of the incoming data.
+ *
+ * Calls error() when the layer's output count does not match the incoming
+ * input count (filters= of the preceding layer vs classes=/mask=).
+ *
+ * The anchors loop now mirrors parse_yolo(): counting stops at '#', at most
+ * total*2 values are written to l.biases (previously unbounded, so a long
+ * anchors= line could write past the array), and the cursor is only advanced
+ * past a comma that exists.
+ * NOTE(review): the total*2 capacity of l.biases is taken from parse_yolo's
+ * bound -- confirm against make_gaussian_yolo_layer().
+ */
+layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3
+{
+    int classes = option_find_int(options, "classes", 20);
+    int max_boxes = option_find_int_quiet(options, "max", 200);
+    int total = option_find_int(options, "num", 1);
+    int num = total;
+
+    char *a = option_find_str(options, "mask", 0);
+    int *mask = parse_gaussian_yolo_mask(a, &num);
+    layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes);
+    if (l.outputs != params.inputs) {
+        error("Error: l.outputs == params.inputs, filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [Gaussian_yolo]-layer", DARKNET_LOC);
+    }
+    l.max_delta = option_find_float_quiet(options, "max_delta", FLT_MAX);   // set 10
+    char *cpc = option_find_str(options, "counters_per_class", 0);
+    l.classes_multipliers = get_classes_multipliers(cpc, classes, l.max_delta);
+
+    l.label_smooth_eps = option_find_float_quiet(options, "label_smooth_eps", 0.0f);
+    l.scale_x_y = option_find_float_quiet(options, "scale_x_y", 1);
+    l.objectness_smooth = option_find_int_quiet(options, "objectness_smooth", 0);
+    l.uc_normalizer = option_find_float_quiet(options, "uc_normalizer", 1.0);
+    l.iou_normalizer = option_find_float_quiet(options, "iou_normalizer", 0.75);
+    l.obj_normalizer = option_find_float_quiet(options, "obj_normalizer", 1.0);
+    l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1);
+    l.delta_normalizer = option_find_float_quiet(options, "delta_normalizer", 1);
+    char *iou_loss = option_find_str_quiet(options, "iou_loss", "mse");   //  "iou");
+
+    // Any unrecognised iou_loss name falls back to IOU.
+    if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE;
+    else if (strcmp(iou_loss, "giou") == 0) l.iou_loss = GIOU;
+    else if (strcmp(iou_loss, "diou") == 0) l.iou_loss = DIOU;
+    else if (strcmp(iou_loss, "ciou") == 0) l.iou_loss = CIOU;
+    else l.iou_loss = IOU;
+
+    char *iou_thresh_kind_str = option_find_str_quiet(options, "iou_thresh_kind", "iou");
+    if (strcmp(iou_thresh_kind_str, "iou") == 0) l.iou_thresh_kind = IOU;
+    else if (strcmp(iou_thresh_kind_str, "giou") == 0) l.iou_thresh_kind = GIOU;
+    else if (strcmp(iou_thresh_kind_str, "diou") == 0) l.iou_thresh_kind = DIOU;
+    else if (strcmp(iou_thresh_kind_str, "ciou") == 0) l.iou_thresh_kind = CIOU;
+    else {
+        fprintf(stderr, " Wrong iou_thresh_kind = %s \n", iou_thresh_kind_str);
+        l.iou_thresh_kind = IOU;
+    }
+
+    l.beta_nms = option_find_float_quiet(options, "beta_nms", 0.6);
+    char *nms_kind = option_find_str_quiet(options, "nms_kind", "default");
+    if (strcmp(nms_kind, "default") == 0) l.nms_kind = DEFAULT_NMS;
+    else {
+        if (strcmp(nms_kind, "greedynms") == 0) l.nms_kind = GREEDY_NMS;
+        else if (strcmp(nms_kind, "diounms") == 0) l.nms_kind = DIOU_NMS;
+        else if (strcmp(nms_kind, "cornersnms") == 0) l.nms_kind = CORNERS_NMS;
+        else l.nms_kind = DEFAULT_NMS;
+        printf("nms_kind: %s (%d), beta = %f \n", nms_kind, l.nms_kind, l.beta_nms);
+    }
+
+    char *yolo_point = option_find_str_quiet(options, "yolo_point", "center");
+    if (strcmp(yolo_point, "left_top") == 0) l.yolo_point = YOLO_LEFT_TOP;
+    else if (strcmp(yolo_point, "right_bottom") == 0) l.yolo_point = YOLO_RIGHT_BOTTOM;
+    else l.yolo_point = YOLO_CENTER;
+
+    fprintf(stderr, "[Gaussian_yolo] iou loss: %s (%d), iou_norm: %2.2f, obj_norm: %2.2f, cls_norm: %2.2f, delta_norm: %2.2f, scale: %2.2f, point: %d\n",
+        iou_loss, l.iou_loss, l.iou_normalizer, l.obj_normalizer, l.cls_normalizer, l.delta_normalizer, l.scale_x_y, l.yolo_point);
+
+    l.jitter = option_find_float(options, "jitter", .2);
+    l.resize = option_find_float_quiet(options, "resize", 1.0);
+
+    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
+    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
+    l.iou_thresh = option_find_float_quiet(options, "iou_thresh", 1); // recommended to use iou_thresh=0.213 in [yolo]
+    l.random = option_find_float_quiet(options, "random", 0);
+
+    char *map_file = option_find_str(options, "map", 0);
+    if (map_file) l.map = read_map(map_file);
+
+    a = option_find_str(options, "anchors", 0);
+    if (a) {
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for (i = 0; i < len; ++i) {
+            if (a[i] == '#') break;
+            if (a[i] == ',') ++n;
+        }
+        // Never store more than total*2 values in l.biases.
+        for (i = 0; i < n && i < total*2; ++i) {
+            char *comma;
+            l.biases[i] = atof(a);
+            comma = strchr(a, ',');
+            if (!comma) break;      // no further values
+            a = comma + 1;
+        }
+    }
+    return l;
+}
+
+/*
+ * Build a region layer from the options of a [region] section.
+ *
+ * options: key/value list of the section.
+ * params:  sizes of the incoming data.
+ *
+ * Calls error() when the layer's output count does not match the incoming
+ * input count (filters= of the preceding layer vs classes=/num=).
+ * Optional "tree" and "map" files are loaded with read_tree()/read_map().
+ * At most num*2 anchor values are stored in l.biases.
+ *
+ * The anchors cursor is only advanced past a comma that exists; the old code
+ * computed `strchr(...) + 1` on a NULL result after the last value.
+ */
+layer parse_region(list *options, size_params params)
+{
+    int coords = option_find_int(options, "coords", 4);
+    int classes = option_find_int(options, "classes", 20);
+    int num = option_find_int(options, "num", 1);
+    int max_boxes = option_find_int_quiet(options, "max", 200);
+
+    layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords, max_boxes);
+    if (l.outputs != params.inputs) {
+        error("Error: l.outputs == params.inputs, filters= in the [convolutional]-layer doesn't correspond to classes= or num= in [region]-layer", DARKNET_LOC);
+    }
+    //assert(l.outputs == params.inputs);
+
+    l.log = option_find_int_quiet(options, "log", 0);
+    l.sqrt = option_find_int_quiet(options, "sqrt", 0);
+
+    l.softmax = option_find_int(options, "softmax", 0);
+    l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
+    //l.max_boxes = option_find_int_quiet(options, "max",30);
+    l.jitter = option_find_float(options, "jitter", .2);
+    l.resize = option_find_float_quiet(options, "resize", 1.0);
+    l.rescore = option_find_int_quiet(options, "rescore",0);
+
+    l.thresh = option_find_float(options, "thresh", .5);
+    l.classfix = option_find_int_quiet(options, "classfix", 0);
+    l.absolute = option_find_int_quiet(options, "absolute", 0);
+    l.random = option_find_float_quiet(options, "random", 0);
+
+    l.coord_scale = option_find_float(options, "coord_scale", 1);
+    l.object_scale = option_find_float(options, "object_scale", 1);
+    l.noobject_scale = option_find_float(options, "noobject_scale", 1);
+    l.mask_scale = option_find_float(options, "mask_scale", 1);
+    l.class_scale = option_find_float(options, "class_scale", 1);
+    l.bias_match = option_find_int_quiet(options, "bias_match",0);
+
+    char *tree_file = option_find_str(options, "tree", 0);
+    if (tree_file) l.softmax_tree = read_tree(tree_file);
+    char *map_file = option_find_str(options, "map", 0);
+    if (map_file) l.map = read_map(map_file);
+
+    char *a = option_find_str(options, "anchors", 0);
+    if(a){
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (a[i] == ',') ++n;
+        }
+        for(i = 0; i < n && i < num*2; ++i){
+            char *comma;
+            l.biases[i] = atof(a);
+            comma = strchr(a, ',');
+            if (!comma) break;      // no further values
+            a = comma + 1;
+        }
+    }
+    return l;
+}
+/*
+ * Build a detection layer from the options of a [detection] section.
+ * coords, classes, rescore, num and side feed the constructor; the remaining
+ * options are copied onto the resulting layer.
+ */
+detection_layer parse_detection(list *options, size_params params)
+{
+    int coords = option_find_int(options, "coords", 1);
+    int classes = option_find_int(options, "classes", 1);
+    int rescore = option_find_int(options, "rescore", 0);
+    int num = option_find_int(options, "num", 1);
+    int side = option_find_int(options, "side", 7);
+    detection_layer det = make_detection_layer(params.batch, params.inputs, num, side, classes, coords, rescore);
+
+    det.softmax = option_find_int(options, "softmax", 0);
+    det.sqrt = option_find_int(options, "sqrt", 0);
+
+    det.max_boxes = option_find_int_quiet(options, "max",200);
+    det.coord_scale = option_find_float(options, "coord_scale", 1);
+    det.forced = option_find_int(options, "forced", 0);
+    det.object_scale = option_find_float(options, "object_scale", 1);
+    det.noobject_scale = option_find_float(options, "noobject_scale", 1);
+    det.class_scale = option_find_float(options, "class_scale", 1);
+    det.jitter = option_find_float(options, "jitter", .2);
+    det.resize = option_find_float_quiet(options, "resize", 1.0);
+    det.random = option_find_float_quiet(options, "random", 0);
+    det.reorg = option_find_int_quiet(options, "reorg", 0);
+    return det;
+}
+
+/*
+ * Build a cost layer from its cfg options.
+ *
+ * "type" (default "sse") is translated with get_cost_type(); "scale"
+ * (default 1) is passed to the constructor and "ratio" (default 0) is
+ * stored on the resulting layer.
+ */
+cost_layer parse_cost(list *options, size_params params)
+{
+    char *cost_name = option_find_str(options, "type", "sse");
+    COST_TYPE cost_kind = get_cost_type(cost_name);
+    float cost_scale = option_find_float_quiet(options, "scale",1);
+
+    cost_layer result = make_cost_layer(params.batch, params.inputs, cost_kind, cost_scale);
+    result.ratio = option_find_float_quiet(options, "ratio",0);
+    return result;
+}
+
+/*
+ * Build a crop layer from its cfg options.
+ *
+ * Calls error() when the incoming params do not describe an image
+ * (any of h, w, c is zero).
+ */
+crop_layer parse_crop(list *options, size_params params)
+{
+    // Lookup order is kept as-is.
+    const int   target_h = option_find_int(options, "crop_height",1);
+    const int   target_w = option_find_int(options, "crop_width",1);
+    const int   do_flip  = option_find_int(options, "flip",0);
+    const float rot      = option_find_float(options, "angle",0);
+    const float sat      = option_find_float(options, "saturation",1);
+    const float expo     = option_find_float(options, "exposure",1);
+
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before crop layer must output image.", DARKNET_LOC);
+    }
+
+    const int skip_adjust = option_find_int_quiet(options, "noadjust",0);
+
+    crop_layer cropped = make_crop_layer(params.batch, params.h, params.w, params.c,
+                                         target_h, target_w, do_flip, rot, sat, expo);
+    cropped.shift = option_find_float(options, "shift", 0);
+    cropped.noadjust = skip_adjust;
+    return cropped;
+}
+
+/*
+ * Build a reorg layer from its cfg options ("stride", default 1, and
+ * "reverse", default 0).
+ *
+ * Calls error() when the incoming params do not describe an image.
+ */
+layer parse_reorg(list *options, size_params params)
+{
+    const int step = option_find_int(options, "stride",1);
+    const int backwards = option_find_int_quiet(options, "reverse",0);
+
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before reorg layer must output image.", DARKNET_LOC);
+    }
+
+    // Note the (w, h) argument order expected by make_reorg_layer().
+    return make_reorg_layer(params.batch, params.w, params.h, params.c, step, backwards);
+}
+
+/*
+ * Build a reorg layer through make_reorg_old_layer().
+ *
+ * Prints a " reorg_old " marker to stdout first, then reads "stride"
+ * (default 1) and "reverse" (default 0). Calls error() when the incoming
+ * params do not describe an image.
+ */
+layer parse_reorg_old(list *options, size_params params)
+{
+    printf("\n reorg_old \n");
+
+    const int step = option_find_int(options, "stride", 1);
+    const int backwards = option_find_int_quiet(options, "reverse", 0);
+
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before reorg layer must output image.", DARKNET_LOC);
+    }
+
+    // Note the (w, h) argument order expected by make_reorg_old_layer().
+    return make_reorg_old_layer(params.batch, params.w, params.h, params.c, step, backwards);
+}
+
+/*
+ * Build a local average-pooling layer.
+ *
+ * Implemented on top of make_maxpool_layer() with the avgpool flag set to 1;
+ * maxpool_depth, out_channels and antialiasing are fixed here (0, 1, 0)
+ * instead of being read from the cfg.
+ *
+ * Defaults: stride = 1, stride_x = stride_y = stride, size = stride,
+ * padding = size - 1. Calls error() when the incoming params do not
+ * describe an image.
+ */
+maxpool_layer parse_local_avgpool(list *options, size_params params)
+{
+    const int step    = option_find_int(options, "stride", 1);
+    const int step_x  = option_find_int_quiet(options, "stride_x", step);
+    const int step_y  = option_find_int_quiet(options, "stride_y", step);
+    const int kernel  = option_find_int(options, "size", step);
+    const int pad     = option_find_int_quiet(options, "padding", kernel - 1);
+
+    // Fixed constructor arguments for this layer type.
+    const int depth_pooling = 0;
+    const int channels_out  = 1;
+    const int anti_alias    = 0;
+    const int use_average   = 1;
+
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before [local_avgpool] layer must output image.", DARKNET_LOC);
+    }
+
+    return make_maxpool_layer(params.batch, params.h, params.w, params.c,
+                              kernel, step_x, step_y, pad,
+                              depth_pooling, channels_out, anti_alias, use_average, params.train);
+}
+
+/*
+ * Build a max-pooling layer from its cfg options.
+ *
+ * Defaults: stride = 1, stride_x = stride_y = stride, size = stride,
+ * padding = size - 1, maxpool_depth = 0, out_channels = 1,
+ * antialiasing = 0. The avgpool flag of make_maxpool_layer() is 0 here.
+ * "maxpool_zero_nonmax" (default 0) is stored on the resulting layer.
+ *
+ * Calls error() when the incoming params do not describe an image.
+ */
+maxpool_layer parse_maxpool(list *options, size_params params)
+{
+    const int step          = option_find_int(options, "stride",1);
+    const int step_x        = option_find_int_quiet(options, "stride_x", step);
+    const int step_y        = option_find_int_quiet(options, "stride_y", step);
+    const int kernel        = option_find_int(options, "size",step);
+    const int pad           = option_find_int_quiet(options, "padding", kernel-1);
+    const int depth_pooling = option_find_int_quiet(options, "maxpool_depth", 0);
+    const int channels_out  = option_find_int_quiet(options, "out_channels", 1);
+    const int anti_alias    = option_find_int_quiet(options, "antialiasing", 0);
+    const int use_average   = 0;
+
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before [maxpool] layer must output image.", DARKNET_LOC);
+    }
+
+    maxpool_layer pool = make_maxpool_layer(params.batch, params.h, params.w, params.c,
+                                            kernel, step_x, step_y, pad,
+                                            depth_pooling, channels_out, anti_alias, use_average, params.train);
+    pool.maxpool_zero_nonmax = option_find_int_quiet(options, "maxpool_zero_nonmax", 0);
+    return pool;
+}
+
+/*
+ * Build an average-pooling layer sized from the incoming params.
+ *
+ * No cfg option is read. Calls error() when the incoming params do not
+ * describe an image.
+ */
+avgpool_layer parse_avgpool(list *options, size_params params)
+{
+    if (!params.h || !params.w || !params.c) {
+        error("Layer before avgpool layer must output image.", DARKNET_LOC);
+    }
+
+    // Note the (w, h) argument order expected by make_avgpool_layer().
+    return make_avgpool_layer(params.batch, params.w, params.h, params.c);
+}
+
+/*
+ * Build a dropout layer (optionally in dropblock mode) from its cfg options.
+ *
+ * Block-size resolution, applied in this order:
+ *   1. an absolute size larger than the layer width or height is replaced
+ *      by min(w, h);
+ *   2. with dropblock enabled and neither size given, the absolute size
+ *      becomes 7;
+ *   3. when both sizes are set, the relative one is reset to 0 so that only
+ *      the absolute size is used.
+ * Each adjustment is reported on stdout. The output dimensions of the layer
+ * are copied from params.
+ */
+dropout_layer parse_dropout(list *options, size_params params)
+{
+    float drop_prob = option_find_float(options, "probability", .2);
+    int use_dropblock = option_find_int_quiet(options, "dropblock", 0);
+    float block_rel = option_find_float_quiet(options, "dropblock_size_rel", 0);
+    // The option is read as a float and truncated to int.
+    int block_abs = (int)option_find_float_quiet(options, "dropblock_size_abs", 0);
+
+    const int too_wide = block_abs > params.w;
+    const int too_tall = block_abs > params.h;
+    if (too_wide || too_tall) {
+        printf(" [dropout] - dropblock_size_abs = %d that is bigger than layer size %d x %d \n", block_abs, params.w, params.h);
+        block_abs = min_val_cmp(params.w, params.h);
+    }
+
+    if (use_dropblock && block_rel == 0 && block_abs == 0) {
+        printf(" [dropout] - None of the parameters (dropblock_size_rel or dropblock_size_abs) are set, will be used: dropblock_size_abs = 7 \n");
+        block_abs = 7;
+    }
+
+    if (block_rel != 0 && block_abs != 0) {
+        printf(" [dropout] - Both parameters are set, only the parameter will be used: dropblock_size_abs = %d \n", block_abs);
+        block_rel = 0;
+    }
+
+    dropout_layer drop = make_dropout_layer(params.batch, params.inputs, drop_prob, use_dropblock,
+                                            block_rel, block_abs, params.w, params.h, params.c);
+    drop.out_c = params.c;
+    drop.out_h = params.h;
+    drop.out_w = params.w;
+    return drop;
+}
+
+/*
+ * Build a normalization layer from its cfg options.
+ *
+ * Defaults: alpha = .0001, beta = .75, kappa = 1, size = 5.
+ */
+layer parse_normalization(list *options, size_params params)
+{
+    const float a = option_find_float(options, "alpha", .0001);
+    const float b = option_find_float(options, "beta" , .75);
+    const float k = option_find_float(options, "kappa", 1);
+    const int window = option_find_int(options, "size", 5);
+
+    return make_normalization_layer(params.batch, params.w, params.h, params.c, window, a, b, k);
+}
+
+/*
+ * Build a batch-normalization layer sized from the incoming params.
+ * No cfg option is read; `options` is unused.
+ */
+layer parse_batchnorm(list *options, size_params params)
+{
+    return make_batchnorm_layer(params.batch, params.w, params.h, params.c, params.train);
+}
+
+/*
+ * Build a shortcut layer from its cfg options.
+ *
+ * Options read:
+ *   activation            - activation name, default "linear"
+ *   weights_type          - "none" (default), "per_feature" / "per_layer",
+ *                           or "per_channel"
+ *   weights_normalization - "none" (default), "relu" / "avg_relu",
+ *                           or "softmax"
+ *   from                  - comma-separated list of source layer indices;
+ *                           negative values are relative to params.index
+ *
+ * Calls error() when "from" is missing, when weights_type or
+ * weights_normalization holds an unknown value, or when a source index
+ * falls outside the network's layer array.
+ *
+ * After the layer is built, every source layer whose output shape differs
+ * from params.w/h/c is reported on stderr.
+ */
+layer parse_shortcut(list *options, size_params params, network net)
+{
+    char *activation_s = option_find_str(options, "activation", "linear");
+    ACTIVATION activation = get_activation(activation_s);
+
+    char *weights_type_str = option_find_str_quiet(options, "weights_type", "none");
+    WEIGHTS_TYPE_T weights_type = NO_WEIGHTS;
+    if(strcmp(weights_type_str, "per_feature") == 0 || strcmp(weights_type_str, "per_layer") == 0) weights_type = PER_FEATURE;
+    else if (strcmp(weights_type_str, "per_channel") == 0) weights_type = PER_CHANNEL;
+    else if (strcmp(weights_type_str, "none") != 0) {
+        printf("Error: Incorrect weights_type = %s \n Use one of: none, per_feature, per_channel \n", weights_type_str);
+        error("Error!", DARKNET_LOC);
+    }
+
+    char *weights_normalization_str = option_find_str_quiet(options, "weights_normalization", "none");
+    WEIGHTS_NORMALIZATION_T weights_normalization = NO_NORMALIZATION;
+    if (strcmp(weights_normalization_str, "relu") == 0 || strcmp(weights_normalization_str, "avg_relu") == 0) weights_normalization = RELU_NORMALIZATION;
+    else if (strcmp(weights_normalization_str, "softmax") == 0) weights_normalization = SOFTMAX_NORMALIZATION;
+    // Validate the normalization string itself: "none" is legal whatever
+    // weights_type is, and any other unknown value is an error.
+    else if (strcmp(weights_normalization_str, "none") != 0) {
+        printf("Error: Incorrect weights_normalization = %s \n Use one of: none, relu, softmax \n", weights_normalization_str);
+        error("Error!", DARKNET_LOC);
+    }
+
+    char *l = option_find(options, "from");
+    // The NULL check must come before strlen(), which dereferences l.
+    if (!l) error("Shortcut Layer must specify input layers: from = ...", DARKNET_LOC);
+    int len = strlen(l);
+    int n = 1;
+    int i;
+    for (i = 0; i < len; ++i) {
+        if (l[i] == ',') ++n;
+    }
+
+    // xcalloc is the allocator parse_route uses for the same kind of arrays.
+    int* layers = (int*)xcalloc(n, sizeof(int));
+    int* sizes = (int*)xcalloc(n, sizeof(int));
+    float **layers_output = (float **)xcalloc(n, sizeof(float *));
+    float **layers_delta = (float **)xcalloc(n, sizeof(float *));
+    float **layers_output_gpu = (float **)xcalloc(n, sizeof(float *));
+    float **layers_delta_gpu = (float **)xcalloc(n, sizeof(float *));
+
+    for (i = 0; i < n; ++i) {
+        int index = atoi(l);
+        // Advance to the next token only when there is one; strchr() returns
+        // NULL after the last entry and NULL + 1 is not a valid pointer.
+        char *comma = strchr(l, ',');
+        if (comma) l = comma + 1;
+        if (index < 0) index = params.index + index;
+        // net.n is treated as the layer count elsewhere in this file
+        // (set_train_only_bn); reject anything outside that range.
+        if (index < 0 || index >= params.net.n) {
+            error("Shortcut Layer: 'from' index is out of range", DARKNET_LOC);
+        }
+        layers[i] = index;
+        sizes[i] = params.net.layers[index].outputs;
+        layers_output[i] = params.net.layers[index].output;
+        layers_delta[i] = params.net.layers[index].delta;
+    }
+
+#ifdef GPU
+    for (i = 0; i < n; ++i) {
+        layers_output_gpu[i] = params.net.layers[layers[i]].output_gpu;
+        layers_delta_gpu[i] = params.net.layers[layers[i]].delta_gpu;
+    }
+#endif// GPU
+
+    layer s = make_shortcut_layer(params.batch, n, layers, sizes, params.w, params.h, params.c, layers_output, layers_delta,
+        layers_output_gpu, layers_delta_gpu, weights_type, weights_normalization, activation, params.train);
+
+    // Only the two GPU pointer tables are released here; the other arrays
+    // were handed to make_shortcut_layer() and are left untouched.
+    free(layers_output_gpu);
+    free(layers_delta_gpu);
+
+    for (i = 0; i < n; ++i) {
+        int index = layers[i];
+        assert(params.w == net.layers[index].out_w && params.h == net.layers[index].out_h);
+
+        if (params.w != net.layers[index].out_w || params.h != net.layers[index].out_h || params.c != net.layers[index].out_c)
+            fprintf(stderr, " (%4d x%4d x%4d) + (%4d x%4d x%4d) \n",
+                params.w, params.h, params.c, net.layers[index].out_w, net.layers[index].out_h, params.net.layers[index].out_c);
+    }
+
+    return s;
+}
+
+
+/*
+ * Build a scale_channels layer from its cfg options.
+ *
+ * Options read:
+ *   from       - index of the source layer; a negative value is relative
+ *                to params.index
+ *   scale_wh   - default 0, forwarded to make_scale_channels_layer()
+ *   activation - default "linear"; a warning is printed for SWISH / MISH
+ *
+ * Calls error() when "from" is missing or points outside the network's
+ * layer array.
+ */
+layer parse_scale_channels(list *options, size_params params, network net)
+{
+    char *l = option_find(options, "from");
+    // atoi(NULL) would dereference a null pointer.
+    if (!l) error("Scale_channels Layer must specify input layer: from = ...", DARKNET_LOC);
+    int index = atoi(l);
+    if (index < 0) index = params.index + index;
+    // net.n is treated as the layer count elsewhere in this file
+    // (set_train_only_bn); reject anything outside that range.
+    if (index < 0 || index >= net.n) {
+        error("Scale_channels Layer: 'from' index is out of range", DARKNET_LOC);
+    }
+    int scale_wh = option_find_int_quiet(options, "scale_wh", 0);
+
+    int batch = params.batch;
+    layer from = net.layers[index];
+
+    layer s = make_scale_channels_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c, scale_wh);
+
+    char *activation_s = option_find_str_quiet(options, "activation", "linear");
+    ACTIVATION activation = get_activation(activation_s);
+    s.activation = activation;
+    if (activation == SWISH || activation == MISH) {
+        printf(" [scale_channels] layer doesn't support SWISH or MISH activations \n");
+    }
+    return s;
+}
+
+/*
+ * Build a sam layer from its cfg options.
+ *
+ * Options read:
+ *   from       - index of the source layer; a negative value is relative
+ *                to params.index
+ *   activation - default "linear"; a warning is printed for SWISH / MISH
+ *
+ * Calls error() when "from" is missing or points outside the network's
+ * layer array.
+ */
+layer parse_sam(list *options, size_params params, network net)
+{
+    char *l = option_find(options, "from");
+    // atoi(NULL) would dereference a null pointer.
+    if (!l) error("Sam Layer must specify input layer: from = ...", DARKNET_LOC);
+    int index = atoi(l);
+    if (index < 0) index = params.index + index;
+    // net.n is treated as the layer count elsewhere in this file
+    // (set_train_only_bn); reject anything outside that range.
+    if (index < 0 || index >= net.n) {
+        error("Sam Layer: 'from' index is out of range", DARKNET_LOC);
+    }
+
+    int batch = params.batch;
+    layer from = net.layers[index];
+
+    layer s = make_sam_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);
+
+    char *activation_s = option_find_str_quiet(options, "activation", "linear");
+    ACTIVATION activation = get_activation(activation_s);
+    s.activation = activation;
+    if (activation == SWISH || activation == MISH) {
+        printf(" [sam] layer doesn't support SWISH or MISH activations \n");
+    }
+    return s;
+}
+
+/*
+ * Build an implicit layer from its cfg options.
+ *
+ * Defaults: mean = 0.0, std = 0.2, filters = 128, atoms = 1.
+ * The `net` argument is not used.
+ */
+layer parse_implicit(list *options, size_params params, network net)
+{
+    const float init_mean = option_find_float(options, "mean", 0.0);
+    const float init_std = option_find_float(options, "std", 0.2);
+    const int n_filters = option_find_int(options, "filters", 128);
+    const int n_atoms = option_find_int_quiet(options, "atoms", 1);
+
+    return make_implicit_layer(params.batch, params.index, init_mean, init_std, n_filters, n_atoms);
+}
+
+/*
+ * Build a stand-alone activation layer.
+ *
+ * "activation" defaults to "linear". Input and output dimensions of the
+ * resulting layer are both set to params.w / params.h / params.c.
+ */
+layer parse_activation(list *options, size_params params)
+{
+    char *act_name = option_find_str(options, "activation", "linear");
+    ACTIVATION act = get_activation(act_name);
+
+    layer result = make_activation_layer(params.batch, params.inputs, act);
+
+    // Shape passes through unchanged.
+    result.w = result.out_w = params.w;
+    result.h = result.out_h = params.h;
+    result.c = result.out_c = params.c;
+
+    return result;
+}
+
+/*
+ * Build an upsample layer from its cfg options.
+ *
+ * "stride" defaults to 2; "scale" (default 1) is stored on the layer.
+ * The `net` argument is not used.
+ */
+layer parse_upsample(list *options, size_params params, network net)
+{
+    const int factor = option_find_int(options, "stride", 2);
+
+    layer up = make_upsample_layer(params.batch, params.w, params.h, params.c, factor);
+    up.scale = option_find_float_quiet(options, "scale", 1);
+    return up;
+}
+
+/*
+ * Build a route layer from its cfg options.
+ *
+ * Options read:
+ *   layers      - comma-separated list of source layer indices; negative
+ *                 values are relative to params.index (required)
+ *   groups      - default 1
+ *   group_id    - default 0
+ *   stream      - default -1
+ *   wait_stream - default -1
+ *
+ * Output width/height come from the first source layer. Channels of every
+ * further source with the same width/height are added; a source with a
+ * different width/height triggers a message on stderr and zeroes the
+ * output shape. The channel total is finally divided by the group count.
+ * A summary line is written to stderr.
+ *
+ * Calls error() when "layers" is missing or when an index falls outside
+ * the network's layer array.
+ */
+route_layer parse_route(list *options, size_params params)
+{
+    char *l = option_find(options, "layers");
+    if(!l) error("Route Layer must specify input layers", DARKNET_LOC);
+    int len = strlen(l);
+    int n = 1;
+    int i;
+    for(i = 0; i < len; ++i){
+        if (l[i] == ',') ++n;
+    }
+
+    int* layers = (int*)xcalloc(n, sizeof(int));
+    int* sizes = (int*)xcalloc(n, sizeof(int));
+    for(i = 0; i < n; ++i){
+        int index = atoi(l);
+        // Advance to the next token only when there is one; strchr() returns
+        // NULL after the last entry and NULL + 1 is not a valid pointer.
+        char *comma = strchr(l, ',');
+        if (comma) l = comma + 1;
+        if(index < 0) index = params.index + index;
+        // net.n is treated as the layer count elsewhere in this file
+        // (set_train_only_bn); reject anything outside that range.
+        if (index < 0 || index >= params.net.n) {
+            error("Route Layer: input layer index is out of range", DARKNET_LOC);
+        }
+        layers[i] = index;
+        sizes[i] = params.net.layers[index].outputs;
+    }
+    int batch = params.batch;
+
+    int groups = option_find_int_quiet(options, "groups", 1);
+    int group_id = option_find_int_quiet(options, "group_id", 0);
+
+    route_layer layer = make_route_layer(batch, n, layers, sizes, groups, group_id);
+
+    convolutional_layer first = params.net.layers[layers[0]];
+    layer.out_w = first.out_w;
+    layer.out_h = first.out_h;
+    layer.out_c = first.out_c;
+    for(i = 1; i < n; ++i){
+        int index = layers[i];
+        convolutional_layer next = params.net.layers[index];
+        if(next.out_w == first.out_w && next.out_h == first.out_h){
+            layer.out_c += next.out_c;
+        }else{
+            fprintf(stderr, " The width and height of the input layers are different. \n");
+            layer.out_h = layer.out_w = layer.out_c = 0;
+        }
+    }
+    layer.out_c = layer.out_c / layer.groups;
+
+    layer.w = first.w;
+    layer.h = first.h;
+    layer.c = layer.out_c;
+
+    layer.stream = option_find_int_quiet(options, "stream", -1);
+    layer.wait_stream_id = option_find_int_quiet(options, "wait_stream", -1);
+
+    // Padding before the summary depends on how many sources were listed.
+    if (n > 3) fprintf(stderr, " \t    ");
+    else if (n > 1) fprintf(stderr, " \t            ");
+    else fprintf(stderr, " \t\t            ");
+
+    fprintf(stderr, "           ");
+    if (layer.groups > 1) fprintf(stderr, "%d/%d", layer.group_id, layer.groups);
+    else fprintf(stderr, "   ");
+    fprintf(stderr, " -> %4d x%4d x%4d \n", layer.out_w, layer.out_h, layer.out_c);
+
+    return layer;
+}
+
+/*
+ * Map a learning-rate policy name from the cfg to its enum value.
+ *
+ * Recognised names: random, poly, constant, step, exp, sigmoid, steps, sgdr.
+ * Any other string is reported on stderr and CONSTANT is returned.
+ */
+learning_rate_policy get_policy(char *s)
+{
+    static const struct {
+        const char *name;
+        learning_rate_policy value;
+    } known[] = {
+        { "random",   RANDOM   },
+        { "poly",     POLY     },
+        { "constant", CONSTANT },
+        { "step",     STEP     },
+        { "exp",      EXP      },
+        { "sigmoid",  SIG      },
+        { "steps",    STEPS    },
+        { "sgdr",     SGDR     },
+    };
+    const size_t count = sizeof(known) / sizeof(known[0]);
+    size_t k;
+
+    for (k = 0; k < count; ++k) {
+        if (strcmp(s, known[k].name) == 0) return known[k].value;
+    }
+
+    fprintf(stderr, "Couldn't find policy %s, going with constant\n", s);
+    return CONSTANT;
+}
+
+/*
+ * Fill *net with the settings found in the first cfg section.
+ *
+ * Covers batch geometry (batch, subdivisions, time_steps), optimizer
+ * settings (learning rate, momentum, decay, optional Adam constants),
+ * input size, data-augmentation switches and the learning-rate policy with
+ * its policy-specific parameters.
+ *
+ * Side effects visible here:
+ *   - the counters behind net->seen, net->cur_iteration,
+ *     net->cuda_graph_ready and the badlabels / delta_rolling pointers are
+ *     reset to 0 (those pointers must already be valid);
+ *   - net->batch ends up as (batch / subdivisions) * time_steps;
+ *   - for the STEPS / SGDR policies, net->steps, net->scales and
+ *     net->seq_scales are newly allocated arrays of net->num_steps entries.
+ *
+ * Calls error() when no input size is given, when the STEPS policy lacks
+ * "steps" or "scales", or when contrastive is enabled with a mini-batch
+ * smaller than 2.
+ */
+void parse_net_options(list *options, network *net)
+{
+    net->max_batches = option_find_int(options, "max_batches", 0);
+    net->batch = option_find_int(options, "batch",1);
+    net->learning_rate = option_find_float(options, "learning_rate", .001);
+    net->learning_rate_min = option_find_float_quiet(options, "learning_rate_min", .00001);
+    net->batches_per_cycle = option_find_int_quiet(options, "sgdr_cycle", net->max_batches);
+    net->batches_cycle_mult = option_find_int_quiet(options, "sgdr_mult", 2);
+    net->momentum = option_find_float(options, "momentum", .9);
+    net->decay = option_find_float(options, "decay", .0001);
+    int subdivs = option_find_int(options, "subdivisions",1);
+    net->time_steps = option_find_int_quiet(options, "time_steps",1);
+    net->track = option_find_int_quiet(options, "track", 0);
+    net->augment_speed = option_find_int_quiet(options, "augment_speed", 2);
+    net->init_sequential_subdivisions = net->sequential_subdivisions = option_find_int_quiet(options, "sequential_subdivisions", subdivs);
+    // sequential_subdivisions is capped at subdivisions.
+    if (net->sequential_subdivisions > subdivs) net->init_sequential_subdivisions = net->sequential_subdivisions = subdivs;
+    net->try_fix_nan = option_find_int_quiet(options, "try_fix_nan", 0);
+    net->batch /= subdivs;          // mini_batch
+    const int mini_batch = net->batch;
+    net->batch *= net->time_steps;  // mini_batch * time_steps
+    net->subdivisions = subdivs;    // number of mini_batches
+
+    net->weights_reject_freq = option_find_int_quiet(options, "weights_reject_freq", 0);
+    net->equidistant_point = option_find_int_quiet(options, "equidistant_point", 0);
+    net->badlabels_rejection_percentage = option_find_float_quiet(options, "badlabels_rejection_percentage", 0);
+    net->num_sigmas_reject_badlabels = option_find_float_quiet(options, "num_sigmas_reject_badlabels", 0);
+    net->ema_alpha = option_find_float_quiet(options, "ema_alpha", 0);
+    // Reset the counters stored behind pointers in the network struct.
+    *net->badlabels_reject_threshold = 0;
+    *net->delta_rolling_max = 0;
+    *net->delta_rolling_avg = 0;
+    *net->delta_rolling_std = 0;
+    *net->seen = 0;
+    *net->cur_iteration = 0;
+    *net->cuda_graph_ready = 0;
+    net->use_cuda_graph = option_find_int_quiet(options, "use_cuda_graph", 0);
+    net->loss_scale = option_find_float_quiet(options, "loss_scale", 1);
+    net->dynamic_minibatch = option_find_int_quiet(options, "dynamic_minibatch", 0);
+    net->optimized_memory = option_find_int_quiet(options, "optimized_memory", 0);
+    net->workspace_size_limit = (size_t)1024*1024 * option_find_float_quiet(options, "workspace_size_limit_MB", 1024);  // 1024 MB by default
+
+
+    net->adam = option_find_int_quiet(options, "adam", 0);
+    if(net->adam){
+        net->B1 = option_find_float(options, "B1", .9);
+        net->B2 = option_find_float(options, "B2", .999);
+        net->eps = option_find_float(options, "eps", .000001);
+    }
+
+    net->h = option_find_int_quiet(options, "height",0);
+    net->w = option_find_int_quiet(options, "width",0);
+    net->c = option_find_int_quiet(options, "channels",0);
+    net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c);
+    net->max_crop = option_find_int_quiet(options, "max_crop",net->w*2);
+    net->min_crop = option_find_int_quiet(options, "min_crop",net->w);
+    net->flip = option_find_int_quiet(options, "flip", 1);
+    net->blur = option_find_int_quiet(options, "blur", 0);
+    net->gaussian_noise = option_find_int_quiet(options, "gaussian_noise", 0);
+    net->mixup = option_find_int_quiet(options, "mixup", 0);
+    int cutmix = option_find_int_quiet(options, "cutmix", 0);
+    int mosaic = option_find_int_quiet(options, "mosaic", 0);
+    // cutmix / mosaic override the mixup value: 2 = cutmix, 3 = mosaic, 4 = both.
+    if (mosaic && cutmix) net->mixup = 4;
+    else if (cutmix) net->mixup = 2;
+    else if (mosaic) net->mixup = 3;
+    net->letter_box = option_find_int_quiet(options, "letter_box", 0);
+    net->mosaic_bound = option_find_int_quiet(options, "mosaic_bound", 0);
+    net->contrastive = option_find_int_quiet(options, "contrastive", 0);
+    net->contrastive_jit_flip = option_find_int_quiet(options, "contrastive_jit_flip", 0);
+    net->contrastive_color = option_find_int_quiet(options, "contrastive_color", 0);
+    net->unsupervised = option_find_int_quiet(options, "unsupervised", 0);
+    if (net->contrastive && mini_batch < 2) {
+        error("Error: mini_batch size (batch/subdivisions) should be higher than 1 for Contrastive loss!", DARKNET_LOC);
+    }
+    net->label_smooth_eps = option_find_float_quiet(options, "label_smooth_eps", 0.0f);
+    net->resize_step = option_find_float_quiet(options, "resize_step", 32);
+    net->attention = option_find_int_quiet(options, "attention", 0);
+    net->adversarial_lr = option_find_float_quiet(options, "adversarial_lr", 0);
+    net->max_chart_loss = option_find_float_quiet(options, "max_chart_loss", 20.0);
+
+    net->angle = option_find_float_quiet(options, "angle", 0);
+    net->aspect = option_find_float_quiet(options, "aspect", 1);
+    net->saturation = option_find_float_quiet(options, "saturation", 1);
+    net->exposure = option_find_float_quiet(options, "exposure", 1);
+    net->hue = option_find_float_quiet(options, "hue", 0);
+    net->power = option_find_float_quiet(options, "power", 4);
+
+    if(!net->inputs && !(net->h && net->w && net->c)) error("No input parameters supplied", DARKNET_LOC);
+
+    char *policy_s = option_find_str(options, "policy", "constant");
+    net->policy = get_policy(policy_s);
+    net->burn_in = option_find_int_quiet(options, "burn_in", 0);
+#ifdef GPU
+    if (net->gpu_index >= 0) {
+        char device_name[1024];
+        int compute_capability = get_gpu_compute_capability(net->gpu_index, device_name);
+#ifdef CUDNN_HALF
+        if (compute_capability >= 700) net->cudnn_half = 1;
+        else net->cudnn_half = 0;
+#endif// CUDNN_HALF
+        fprintf(stderr, " %d : compute_capability = %d, cudnn_half = %d, GPU: %s \n", net->gpu_index, compute_capability, net->cudnn_half, device_name);
+    }
+    else fprintf(stderr, " GPU isn't used \n");
+#endif// GPU
+    if(net->policy == STEP){
+        net->step = option_find_int(options, "step", 1);
+        net->scale = option_find_float(options, "scale", 1);
+    } else if (net->policy == STEPS || net->policy == SGDR){
+        char *l = option_find(options, "steps");
+        char *p = option_find(options, "scales");
+        char *s = option_find(options, "seq_scales");
+        if(net->policy == STEPS && (!l || !p)) error("STEPS policy must have steps and scales in cfg file", DARKNET_LOC);
+
+        if (l) {
+            int len = strlen(l);
+            int n = 1;
+            int i;
+            // Count the entries of "steps", stopping at a trailing '#' comment.
+            for (i = 0; i < len; ++i) {
+                if (l[i] == '#') break;
+                if (l[i] == ',') ++n;
+            }
+            int* steps = (int*)xcalloc(n, sizeof(int));
+            float* scales = (float*)xcalloc(n, sizeof(float));
+            float* seq_scales = (float*)xcalloc(n, sizeof(float));
+            for (i = 0; i < n; ++i) {
+                float scale = 1.0;
+                if (p) {
+                    scale = atof(p);
+                    // When "scales" has fewer entries than "steps", strchr()
+                    // returns NULL; keep p NULL so the remaining steps use the
+                    // 1.0 default instead of reading through an invalid pointer.
+                    p = strchr(p, ',');
+                    if (p) ++p;
+                }
+                float sequence_scale = 1.0;
+                if (s) {
+                    sequence_scale = atof(s);
+                    // Same handling for a short "seq_scales" list.
+                    s = strchr(s, ',');
+                    if (s) ++s;
+                }
+                int step = atoi(l);
+                // n was derived from the commas in l, so a missing comma can
+                // only happen on the last entry; just avoid NULL arithmetic.
+                char *next_step = strchr(l, ',');
+                if (next_step) l = next_step + 1;
+                steps[i] = step;
+                scales[i] = scale;
+                seq_scales[i] = sequence_scale;
+            }
+            net->scales = scales;
+            net->steps = steps;
+            net->seq_scales = seq_scales;
+            net->num_steps = n;
+        }
+    } else if (net->policy == EXP){
+        net->gamma = option_find_float(options, "gamma", 1);
+    } else if (net->policy == SIG){
+        net->gamma = option_find_float(options, "gamma", 1);
+        net->step = option_find_int(options, "step", 1);
+    } else if (net->policy == POLY || net->policy == RANDOM){
+        // Nothing extra to read: "power" was already loaded above.
+        //net->power = option_find_float(options, "power", 1);
+    }
+
+}
+
+/*
+ * Return 1 when the section type is "[net]" or "[network]", 0 otherwise.
+ */
+int is_network(section *s)
+{
+    if (strcmp(s->type, "[net]") == 0) return 1;
+    return strcmp(s->type, "[network]") == 0;
+}
+
+/*
+ * Propagate the train_only_bn flag backwards through the network.
+ *
+ * Layers are visited from last to first. Once a layer with a non-zero
+ * train_only_bn is met, that value is written to it and to every earlier
+ * layer (a later non-zero value found on the way replaces the carried one).
+ * For CONV_LSTM and CRNN layers the flag is also copied into their
+ * sub-layers.
+ */
+void set_train_only_bn(network net)
+{
+    int carried = 0;
+    int idx = net.n;
+
+    while (--idx >= 0) {
+        layer *cur = &net.layers[idx];
+
+        // Pick up the flag from this layer; it then applies to all previous layers.
+        if (cur->train_only_bn) carried = cur->train_only_bn;
+        if (!carried) continue;
+
+        cur->train_only_bn = carried;
+
+        switch (cur->type) {
+        case CONV_LSTM:
+            cur->wf->train_only_bn = carried;
+            cur->wi->train_only_bn = carried;
+            cur->wg->train_only_bn = carried;
+            cur->wo->train_only_bn = carried;
+            cur->uf->train_only_bn = carried;
+            cur->ui->train_only_bn = carried;
+            cur->ug->train_only_bn = carried;
+            cur->uo->train_only_bn = carried;
+            // The v* sub-layers are only touched when peephole is enabled.
+            if (cur->peephole) {
+                cur->vf->train_only_bn = carried;
+                cur->vi->train_only_bn = carried;
+                cur->vo->train_only_bn = carried;
+            }
+            break;
+        case CRNN:
+            cur->input_layer->train_only_bn = carried;
+            cur->self_layer->train_only_bn = carried;
+            cur->output_layer->train_only_bn = carried;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+/*
+ * Parse a cfg file with no batch / time_steps override.
+ *
+ * Forwards to parse_network_cfg_custom() with both overrides set to 0,
+ * which that function treats as "not given" (it only applies values > 0).
+ */
+network parse_network_cfg(char *filename)
+{
+    const int no_batch_override = 0;
+    const int no_time_steps_override = 0;
+    return parse_network_cfg_custom(filename, no_batch_override, no_time_steps_override);
+}
+
+network parse_network_cfg_custom(char *filename, int batch, int time_steps)
+{
+    list *sections = read_cfg(filename);
+    node *n = sections->front;
+    if(!n) error("Config file has no sections", DARKNET_LOC);
+    network net = make_network(sections->size - 1);
+    net.gpu_index = gpu_index;
+    size_params params;
+
+    if (batch > 0) params.train = 0;    // allocates memory for Inference only
+    else params.train = 1;              // allocates memory for Inference & Training
+
+    section *s = (section *)n->val;
+    list *options = s->options;
+    if(!is_network(s)) error("First section must be [net] or [network]", DARKNET_LOC);
+    parse_net_options(options, &net);
+
+#ifdef GPU
+    printf("net.optimized_memory = %d \n", net.optimized_memory);
+    if (net.optimized_memory >= 2 && params.train) {
+        pre_allocate_pinned_memory((size_t)1024 * 1024 * 1024 * 8);   // pre-allocate 8 GB CPU-RAM for pinned memory
+    }
+#endif  // GPU
+
+    params.h = net.h;
+    params.w = net.w;
+    params.c = net.c;
+    params.inputs = net.inputs;
+    if (batch > 0) net.batch = batch;
+    if (time_steps > 0) net.time_steps = time_steps;
+    if (net.batch < 1) net.batch = 1;
+    if (net.time_steps < 1) net.time_steps = 1;
+    if (net.batch < net.time_steps) net.batch = net.time_steps;
+    params.batch = net.batch;
+    params.time_steps = net.time_steps;
+    params.net = net;
+    printf("mini_batch = %d, batch = %d, time_steps = %d, train = %d \n", net.batch, net.batch * net.subdivisions, net.time_steps, params.train);
+
+    int last_stop_backward = -1;
+    int avg_outputs = 0;
+    int avg_counter = 0;
+    float bflops = 0;
+    size_t workspace_size = 0;
+    size_t max_inputs = 0;
+    size_t max_outputs = 0;
+    int receptive_w = 1, receptive_h = 1;
+    int receptive_w_scale = 1, receptive_h_scale = 1;
+    const int show_receptive_field = option_find_float_quiet(options, "show_receptive_field", 0);
+
+    n = n->next;
+    int count = 0;
+    free_section(s);
+
+    // find l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
+    node *n_tmp = n;
+    int count_tmp = 0;
+    if (params.train == 1) {
+        while (n_tmp) {
+            s = (section *)n_tmp->val;
+            options = s->options;
+            int stopbackward = option_find_int_quiet(options, "stopbackward", 0);
+            if (stopbackward == 1) {
+                last_stop_backward = count_tmp;
+                printf("last_stop_backward = %d \n", last_stop_backward);
+            }
+            n_tmp = n_tmp->next;
+            ++count_tmp;
+        }
+    }
+
+    int old_params_train = params.train;
+
+    fprintf(stderr, "   layer   filters  size/strd(dil)      input                output\n");
+    while(n){
+
+        params.train = old_params_train;
+        if (count < last_stop_backward) params.train = 0;
+
+        params.index = count;
+        fprintf(stderr, "%4d ", count);
+        s = (section *)n->val;
+        options = s->options;
+        layer l = { (LAYER_TYPE)0 };
+        LAYER_TYPE lt = string_to_layer_type(s->type);
+        if(lt == CONVOLUTIONAL){
+            l = parse_convolutional(options, params);
+        }else if(lt == LOCAL){
+            l = parse_local(options, params);
+        }else if(lt == ACTIVE){
+            l = parse_activation(options, params);
+        }else if(lt == RNN){
+            l = parse_rnn(options, params);
+        }else if(lt == GRU){
+            l = parse_gru(options, params);
+        }else if(lt == LSTM){
+            l = parse_lstm(options, params);
+        }else if (lt == CONV_LSTM) {
+            l = parse_conv_lstm(options, params);
+        }else if (lt == HISTORY) {
+            l = parse_history(options, params);
+        }else if(lt == CRNN){
+            l = parse_crnn(options, params);
+        }else if(lt == CONNECTED){
+            l = parse_connected(options, params);
+        }else if(lt == CROP){
+            l = parse_crop(options, params);
+        }else if(lt == COST){
+            l = parse_cost(options, params);
+            l.keep_delta_gpu = 1;
+        }else if(lt == REGION){
+            l = parse_region(options, params);
+            l.keep_delta_gpu = 1;
+        }else if (lt == YOLO) {
+            l = parse_yolo(options, params);
+            l.keep_delta_gpu = 1;
+        }else if (lt == GAUSSIAN_YOLO) {
+            l = parse_gaussian_yolo(options, params);
+            l.keep_delta_gpu = 1;
+        }else if(lt == DETECTION){
+            l = parse_detection(options, params);
+        }else if(lt == SOFTMAX){
+            l = parse_softmax(options, params);
+            net.hierarchy = l.softmax_tree;
+            l.keep_delta_gpu = 1;
+        }else if (lt == CONTRASTIVE) {
+            l = parse_contrastive(options, params);
+            l.keep_delta_gpu = 1;
+        }else if(lt == NORMALIZATION){
+            l = parse_normalization(options, params);
+        }else if(lt == BATCHNORM){
+            l = parse_batchnorm(options, params);
+        }else if(lt == MAXPOOL){
+            l = parse_maxpool(options, params);
+        }else if (lt == LOCAL_AVGPOOL) {
+            l = parse_local_avgpool(options, params);
+        }else if(lt == REORG){
+            l = parse_reorg(options, params);        }
+        else if (lt == REORG_OLD) {
+            l = parse_reorg_old(options, params);
+        }else if(lt == AVGPOOL){
+            l = parse_avgpool(options, params);
+        }else if(lt == ROUTE){
+            l = parse_route(options, params);
+            int k;
+            for (k = 0; k < l.n; ++k) {
+                net.layers[l.input_layers[k]].use_bin_output = 0;
+                if (count >= last_stop_backward)
+                    net.layers[l.input_layers[k]].keep_delta_gpu = 1;
+            }
+        }else if (lt == UPSAMPLE) {
+            l = parse_upsample(options, params, net);
+        }else if(lt == SHORTCUT){
+            l = parse_shortcut(options, params, net);
+            net.layers[count - 1].use_bin_output = 0;
+            net.layers[l.index].use_bin_output = 0;
+            if (count >= last_stop_backward)
+                net.layers[l.index].keep_delta_gpu = 1;
+        }else if (lt == SCALE_CHANNELS) {
+            l = parse_scale_channels(options, params, net);
+            net.layers[count - 1].use_bin_output = 0;
+            net.layers[l.index].use_bin_output = 0;
+            net.layers[l.index].keep_delta_gpu = 1;
+        }
+        else if (lt == SAM) {
+            l = parse_sam(options, params, net);
+            net.layers[count - 1].use_bin_output = 0;
+            net.layers[l.index].use_bin_output = 0;
+            net.layers[l.index].keep_delta_gpu = 1;
+        } else if (lt == IMPLICIT) {
+            l = parse_implicit(options, params, net);
+        }else if(lt == DROPOUT){
+            l = parse_dropout(options, params);
+            l.output = net.layers[count-1].output;
+            l.delta = net.layers[count-1].delta;
+#ifdef GPU
+            l.output_gpu = net.layers[count-1].output_gpu;
+            l.delta_gpu = net.layers[count-1].delta_gpu;
+            l.keep_delta_gpu = 1;
+#endif
+        }
+        else if (lt == EMPTY) {
+            layer empty_layer = {(LAYER_TYPE)0};
+            l = empty_layer;
+            l.type = EMPTY;
+            l.w = l.out_w = params.w;
+            l.h = l.out_h = params.h;
+            l.c = l.out_c = params.c;
+            l.batch = params.batch;
+            l.inputs = l.outputs = params.inputs;
+            l.output = net.layers[count - 1].output;
+            l.delta = net.layers[count - 1].delta;
+            l.forward = empty_func;
+            l.backward = empty_func;
+#ifdef GPU
+            l.output_gpu = net.layers[count - 1].output_gpu;
+            l.delta_gpu = net.layers[count - 1].delta_gpu;
+            l.keep_delta_gpu = 1;
+            l.forward_gpu = empty_func;
+            l.backward_gpu = empty_func;
+#endif
+            fprintf(stderr, "empty \n");
+        }else{
+            fprintf(stderr, "Type not recognized: %s\n", s->type);
+        }
+
+        // calculate receptive field
+        if(show_receptive_field)
+        {
+            int dilation = max_val_cmp(1, l.dilation);
+            int stride = max_val_cmp(1, l.stride);
+            int size = max_val_cmp(1, l.size);
+
+            if (l.type == UPSAMPLE || (l.type == REORG))
+            {
+
+                l.receptive_w = receptive_w;
+                l.receptive_h = receptive_h;
+                l.receptive_w_scale = receptive_w_scale = receptive_w_scale / stride;
+                l.receptive_h_scale = receptive_h_scale = receptive_h_scale / stride;
+
+            }
+            else {
+                if (l.type == ROUTE) {
+                    receptive_w = receptive_h = receptive_w_scale = receptive_h_scale = 0;
+                    int k;
+                    for (k = 0; k < l.n; ++k) {
+                        layer route_l = net.layers[l.input_layers[k]];
+                        receptive_w = max_val_cmp(receptive_w, route_l.receptive_w);
+                        receptive_h = max_val_cmp(receptive_h, route_l.receptive_h);
+                        receptive_w_scale = max_val_cmp(receptive_w_scale, route_l.receptive_w_scale);
+                        receptive_h_scale = max_val_cmp(receptive_h_scale, route_l.receptive_h_scale);
+                    }
+                }
+                else
+                {
+                    int increase_receptive = size + (dilation - 1) * 2 - 1;// stride;
+                    increase_receptive = max_val_cmp(0, increase_receptive);
+
+                    receptive_w += increase_receptive * receptive_w_scale;
+                    receptive_h += increase_receptive * receptive_h_scale;
+                    receptive_w_scale *= stride;
+                    receptive_h_scale *= stride;
+                }
+
+                l.receptive_w = receptive_w;
+                l.receptive_h = receptive_h;
+                l.receptive_w_scale = receptive_w_scale;
+                l.receptive_h_scale = receptive_h_scale;
+            }
+            //printf(" size = %d, dilation = %d, stride = %d, receptive_w = %d, receptive_w_scale = %d - ", size, dilation, stride, receptive_w, receptive_w_scale);
+
+            int cur_receptive_w = receptive_w;
+            int cur_receptive_h = receptive_h;
+
+            fprintf(stderr, "%4d - receptive field: %d x %d \n", count, cur_receptive_w, cur_receptive_h);
+        }
+
+#ifdef GPU
+        // futher GPU-memory optimization: net.optimized_memory == 2
+        l.optimized_memory = net.optimized_memory;
+        if (net.optimized_memory == 1 && params.train && l.type != DROPOUT) {
+            if (l.delta_gpu) {
+                cuda_free(l.delta_gpu);
+                l.delta_gpu = NULL;
+            }
+        } else if (net.optimized_memory >= 2 && params.train && l.type != DROPOUT)
+        {
+            if (l.output_gpu) {
+                cuda_free(l.output_gpu);
+                //l.output_gpu = cuda_make_array_pinned(l.output, l.batch*l.outputs); // l.steps
+                l.output_gpu = cuda_make_array_pinned_preallocated(NULL, l.batch*l.outputs); // l.steps
+            }
+            if (l.activation_input_gpu) {
+                cuda_free(l.activation_input_gpu);
+                l.activation_input_gpu = cuda_make_array_pinned_preallocated(NULL, l.batch*l.outputs); // l.steps
+            }
+
+            if (l.x_gpu) {
+                cuda_free(l.x_gpu);
+                l.x_gpu = cuda_make_array_pinned_preallocated(NULL, l.batch*l.outputs); // l.steps
+            }
+
+            // maximum optimization
+            if (net.optimized_memory >= 3 && l.type != DROPOUT) {
+                if (l.delta_gpu) {
+                    cuda_free(l.delta_gpu);
+                    //l.delta_gpu = cuda_make_array_pinned_preallocated(NULL, l.batch*l.outputs); // l.steps
+                    //printf("\n\n PINNED DELTA GPU = %d \n", l.batch*l.outputs);
+                }
+            }
+
+            if (l.type == CONVOLUTIONAL) {
+                set_specified_workspace_limit(&l, net.workspace_size_limit);   // workspace size limit 1 GB
+            }
+        }
+#endif // GPU
+
+        l.clip = option_find_float_quiet(options, "clip", 0);
+        l.dynamic_minibatch = net.dynamic_minibatch;
+        l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.dont_update = option_find_int_quiet(options, "dont_update", 0);
+        l.burnin_update = option_find_int_quiet(options, "burnin_update", 0);
+        l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
+        l.train_only_bn = option_find_int_quiet(options, "train_only_bn", 0);
+        l.dontload = option_find_int_quiet(options, "dontload", 0);
+        l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
+        l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1);
+        option_unused(options);
+
+        if (l.stopbackward == 1) printf(" ------- previous layers are frozen ------- \n");
+
+        net.layers[count] = l;
+        if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
+        if (l.inputs > max_inputs) max_inputs = l.inputs;
+        if (l.outputs > max_outputs) max_outputs = l.outputs;
+        free_section(s);
+        n = n->next;
+        ++count;
+        if(n){
+            if (l.antialiasing) {
+                params.h = l.input_layer->out_h;
+                params.w = l.input_layer->out_w;
+                params.c = l.input_layer->out_c;
+                params.inputs = l.input_layer->outputs;
+            }
+            else {
+                params.h = l.out_h;
+                params.w = l.out_w;
+                params.c = l.out_c;
+                params.inputs = l.outputs;
+            }
+        }
+        if (l.bflops > 0) bflops += l.bflops;
+
+        if (l.w > 1 && l.h > 1) {
+            avg_outputs += l.outputs;
+            avg_counter++;
+        }
+    }
+
+    if (last_stop_backward > -1) {
+        int k;
+        for (k = 0; k < last_stop_backward; ++k) {
+            layer l = net.layers[k];
+            if (l.keep_delta_gpu) {
+                if (!l.delta) {
+                    net.layers[k].delta = (float*)xcalloc(l.outputs*l.batch, sizeof(float));
+                }
+#ifdef GPU
+                if (!l.delta_gpu) {
+                    net.layers[k].delta_gpu = (float *)cuda_make_array(NULL, l.outputs*l.batch);
+                }
+#endif
+            }
+
+            net.layers[k].onlyforward = 1;
+            net.layers[k].train = 0;
+        }
+    }
+
+    free_list(sections);
+
+#ifdef GPU
+    if (net.optimized_memory && params.train)
+    {
+        int k;
+        for (k = 0; k < net.n; ++k) {
+            layer l = net.layers[k];
+            // delta GPU-memory optimization: net.optimized_memory == 1
+            if (!l.keep_delta_gpu) {
+                const size_t delta_size = l.outputs*l.batch; // l.steps
+                if (net.max_delta_gpu_size < delta_size) {
+                    net.max_delta_gpu_size = delta_size;
+                    if (net.global_delta_gpu) cuda_free(net.global_delta_gpu);
+                    if (net.state_delta_gpu) cuda_free(net.state_delta_gpu);
+                    assert(net.max_delta_gpu_size > 0);
+                    net.global_delta_gpu = (float *)cuda_make_array(NULL, net.max_delta_gpu_size);
+                    net.state_delta_gpu = (float *)cuda_make_array(NULL, net.max_delta_gpu_size);
+                }
+                if (l.delta_gpu) {
+                    if (net.optimized_memory >= 3) {}
+                    else cuda_free(l.delta_gpu);
+                }
+                l.delta_gpu = net.global_delta_gpu;
+            }
+            else {
+                if (!l.delta_gpu) l.delta_gpu = (float *)cuda_make_array(NULL, l.outputs*l.batch);
+            }
+
+            // maximum optimization
+            if (net.optimized_memory >= 3 && l.type != DROPOUT) {
+                if (l.delta_gpu && l.keep_delta_gpu) {
+                    //cuda_free(l.delta_gpu);   // already called above
+                    l.delta_gpu = cuda_make_array_pinned_preallocated(NULL, l.batch*l.outputs); // l.steps
+                    //printf("\n\n PINNED DELTA GPU = %d \n", l.batch*l.outputs);
+                }
+            }
+
+            net.layers[k] = l;
+        }
+    }
+#endif
+
+    set_train_only_bn(net); // set l.train_only_bn for all required layers
+
+    net.outputs = get_network_output_size(net);
+    net.output = get_network_output(net);
+    avg_outputs = avg_outputs / avg_counter;
+    fprintf(stderr, "Total BFLOPS %5.3f \n", bflops);
+    fprintf(stderr, "avg_outputs = %d \n", avg_outputs);
+#ifdef GPU
+    get_cuda_stream();
+    //get_cuda_memcpy_stream();
+    if (gpu_index >= 0)
+    {
+        int size = get_network_input_size(net) * net.batch;
+        net.input_state_gpu = cuda_make_array(0, size);
+        if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
+        else {
+            cudaGetLastError(); // reset CUDA-error
+            net.input_pinned_cpu = (float*)xcalloc(size, sizeof(float));
+        }
+
+        // pre-allocate memory for inference on Tensor Cores (fp16)
+        *net.max_input16_size = 0;
+        *net.max_output16_size = 0;
+        if (net.cudnn_half) {
+            *net.max_input16_size = max_inputs;
+            CHECK_CUDA(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
+            *net.max_output16_size = max_outputs;
+            CHECK_CUDA(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
+        }
+        if (workspace_size) {
+            fprintf(stderr, " Allocate additional workspace_size = %1.2f MB \n", (float)workspace_size/1000000);
+            net.workspace = cuda_make_array(0, workspace_size / sizeof(float) + 1);
+        }
+        else {
+            net.workspace = (float*)xcalloc(1, workspace_size);
+        }
+    }
+#else
+        if (workspace_size) {
+            net.workspace = (float*)xcalloc(1, workspace_size);
+        }
+#endif
+
+    LAYER_TYPE lt = net.layers[net.n - 1].type;
+    if ((net.w % 32 != 0 || net.h % 32 != 0) && (lt == YOLO || lt == REGION || lt == DETECTION)) {
+        printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
+            net.w, net.h);
+    }
+    return net;
+}
+
+
+
+// Read a cfg file into a list of sections.
+//
+// Each line is stripped and classified by its first character:
+//   '['            starts a new section; the line buffer becomes section->type
+//                  (it is not freed here).
+//   '\0', '#', ';' empty line or comment; the line buffer is freed.
+//   anything else  handed to read_option() for the current section.
+//
+// filename: path of the cfg file; file_error() is called if it cannot be opened.
+// Returns the list of parsed sections, in file order.
+list *read_cfg(char *filename)
+{
+    FILE *file = fopen(filename, "r");
+    if(file == 0) file_error(filename);
+    char *line;
+    int nu = 0;   // 1-based line number, used only in diagnostics
+    list *sections = make_list();
+    section *current = 0;
+    while((line=fgetl(file)) != 0){
+        ++ nu;
+        strip(line);
+        switch(line[0]){
+            case '[':
+                current = (section*)xmalloc(sizeof(section));
+                list_insert(sections, current);
+                current->options = make_list();
+                current->type = line;
+                break;
+            case '\0':
+            case '#':
+            case ';':
+                free(line);
+                break;
+            default:
+                if(!current){
+                    // An option line before the first [section] has no list to
+                    // go into; report it instead of dereferencing a NULL section.
+                    fprintf(stderr, "Config file error line %d, option before any [section]: %s\n", nu, line);
+                    free(line);
+                }
+                else if(!read_option(line, current->options)){
+                    fprintf(stderr, "Config file error line %d, couldn't parse: %s\n", nu, line);
+                    free(line);
+                }
+                // On success the line is not freed here; presumably read_option
+                // keeps pointers into it -- TODO confirm against read_option.
+                break;
+        }
+    }
+    fclose(file);
+    return sections;
+}
+
+// Write a convolutional layer in the packed 1-bit-per-weight format.
+//
+// Layout written to fp: biases[l.n]; if l.batch_normalize, scales, rolling_mean
+// and rolling_variance (l.n floats each); then, per filter, one float holding
+// |binary_weights[first weight of the filter]| followed by size/8 bytes in
+// which bit k of byte j is set when weight j*8+k of that filter is positive.
+// Only size/8 whole bytes are emitted, so when size is not a multiple of 8 the
+// last size%8 weights of each filter are not written.
+void save_convolutional_weights_binary(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_convolutional_layer(l);
+#endif
+    // number of weights in one filter
+    const int per_filter = (l.c/l.groups)*l.size*l.size;
+    binarize_weights(l.weights, l.n, per_filter, l.binary_weights);
+
+    fwrite(l.biases, sizeof(float), l.n, fp);
+    if (l.batch_normalize) {
+        fwrite(l.scales, sizeof(float), l.n, fp);
+        fwrite(l.rolling_mean, sizeof(float), l.n, fp);
+        fwrite(l.rolling_variance, sizeof(float), l.n, fp);
+    }
+
+    const int whole_bytes = per_filter/8;
+    int filter, chunk, bit;
+    for (filter = 0; filter < l.n; ++filter) {
+        const int filter_base = filter*per_filter;
+
+        // magnitude shared by every weight of this filter
+        float magnitude = l.binary_weights[filter_base];
+        if (magnitude < 0) magnitude = -magnitude;
+        fwrite(&magnitude, sizeof(float), 1, fp);
+
+        for (chunk = 0; chunk < whole_bytes; ++chunk) {
+            const int first = chunk*8;
+            unsigned char packed = 0;
+            for (bit = 0; bit < 8 && first + bit < per_filter; ++bit) {
+                if (l.binary_weights[filter_base + first + bit] > 0) {
+                    packed = (packed | 1<<bit);
+                }
+            }
+            fwrite(&packed, sizeof(char), 1, fp);
+        }
+    }
+}
+
+// Write the l.nweights float weights of a shortcut layer to fp.
+// Every weight is also echoed to stdout, followed by the weight count.
+void save_shortcut_weights(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) {
+        pull_shortcut_layer(l);
+        printf("\n pull_shortcut_layer \n");
+    }
+#endif
+    const int count = l.nweights;
+
+    // dump the weights for inspection
+    int idx = 0;
+    while (idx < l.nweights) {
+        printf(" %f, ", l.weights[idx]);
+        ++idx;
+    }
+    printf(" l.nweights = %d \n\n", l.nweights);
+
+    fwrite(l.weights, sizeof(float), count, fp);
+}
+
+// Write the l.nweights float weights of an implicit layer to fp.
+void save_implicit_weights(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_implicit_layer(l);
+#endif
+    const int count = l.nweights;
+    fwrite(l.weights, sizeof(float), count, fp);
+}
+
+// Write a convolutional layer to fp as raw floats.
+//
+// Order: biases[l.n]; if l.batch_normalize, scales, rolling_mean and
+// rolling_variance (l.n floats each); finally weights[l.nweights].
+// l.binary is not consulted: the packed binary writer is not called from here.
+void save_convolutional_weights(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_convolutional_layer(l);
+#endif
+    const int weight_count = l.nweights;
+
+    fwrite(l.biases, sizeof(float), l.n, fp);
+    if (l.batch_normalize) {
+        fwrite(l.scales, sizeof(float), l.n, fp);
+        fwrite(l.rolling_mean, sizeof(float), l.n, fp);
+        fwrite(l.rolling_variance, sizeof(float), l.n, fp);
+    }
+    fwrite(l.weights, sizeof(float), weight_count, fp);
+}
+
+// Write a convolutional layer to fp using its *_ema buffers.
+//
+// Same order as save_convolutional_weights, but biases, scales and weights are
+// taken from biases_ema, scales_ema and weights_ema; rolling_mean and
+// rolling_variance are written from the regular buffers.
+// l.binary is not consulted: the packed binary writer is not called from here.
+void save_convolutional_weights_ema(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_convolutional_layer(l);
+#endif
+    const int weight_count = l.nweights;
+
+    fwrite(l.biases_ema, sizeof(float), l.n, fp);
+    if (l.batch_normalize) {
+        fwrite(l.scales_ema, sizeof(float), l.n, fp);
+        fwrite(l.rolling_mean, sizeof(float), l.n, fp);
+        fwrite(l.rolling_variance, sizeof(float), l.n, fp);
+    }
+    fwrite(l.weights_ema, sizeof(float), weight_count, fp);
+}
+
+// Write a batchnorm layer to fp: biases, scales, rolling_mean and
+// rolling_variance, l.c floats each, in that order.
+void save_batchnorm_weights(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_batchnorm_layer(l);
+#endif
+    const size_t elem = sizeof(float);
+
+    fwrite(l.biases,           elem, l.c, fp);
+    fwrite(l.scales,           elem, l.c, fp);
+    fwrite(l.rolling_mean,     elem, l.c, fp);
+    fwrite(l.rolling_variance, elem, l.c, fp);
+}
+
+// Write a connected layer to fp: biases[l.outputs], then
+// weights[l.outputs*l.inputs], then -- only if l.batch_normalize -- scales,
+// rolling_mean and rolling_variance (l.outputs floats each).
+void save_connected_weights(layer l, FILE *fp)
+{
+#ifdef GPU
+    if (gpu_index >= 0) pull_connected_layer(l);
+#endif
+    fwrite(l.biases, sizeof(float), l.outputs, fp);
+    fwrite(l.weights, sizeof(float), l.outputs*l.inputs, fp);
+
+    if (!l.batch_normalize) return;
+
+    fwrite(l.scales, sizeof(float), l.outputs, fp);
+    fwrite(l.rolling_mean, sizeof(float), l.outputs, fp);
+    fwrite(l.rolling_variance, sizeof(float), l.outputs, fp);
+}
+
+// Write the weights of the first `cutoff` layers of `net` to `filename`.
+//
+// File layout: three int32 values (MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION),
+// one uint64 "seen" counter, then the per-layer payloads in layer order.
+// Layer types without a branch below contribute nothing to the file.
+//
+// net:      network to save; *net.seen is overwritten before being written.
+// filename: output path, opened with "wb"; file_error() is called on failure.
+// cutoff:   layers with index >= cutoff are not written.
+// save_ema: when non-zero, top-level CONVOLUTIONAL layers are written with
+//           save_convolutional_weights_ema instead of save_convolutional_weights.
+void save_weights_upto(network net, char *filename, int cutoff, int save_ema)
+{
+#ifdef GPU
+    if(net.gpu_index >= 0){
+        cuda_set_device(net.gpu_index);
+    }
+#endif
+    fprintf(stderr, "Saving weights to %s\n", filename);
+    FILE *fp = fopen(filename, "wb");
+    if(!fp) file_error(filename);
+
+    // header: version triple followed by the 64-bit "seen" counter
+    int32_t major = MAJOR_VERSION;
+    int32_t minor = MINOR_VERSION;
+    int32_t revision = PATCH_VERSION;
+    fwrite(&major, sizeof(int32_t), 1, fp);
+    fwrite(&minor, sizeof(int32_t), 1, fp);
+    fwrite(&revision, sizeof(int32_t), 1, fp);
+    (*net.seen) = get_current_iteration(net) * net.batch * net.subdivisions; // remove this line, when you will save to weights-file both: seen & cur_iteration
+    fwrite(net.seen, sizeof(uint64_t), 1, fp);
+
+    int i;
+    for(i = 0; i < net.n && i < cutoff; ++i){
+        layer l = net.layers[i];
+        // convolutional layers that share another layer's weights are skipped
+        if (l.type == CONVOLUTIONAL && l.share_layer == NULL) {
+            if (save_ema) {
+                save_convolutional_weights_ema(l, fp);
+            }
+            else {
+                save_convolutional_weights(l, fp);
+            }
+        } if (l.type == SHORTCUT && l.nweights > 0) {
+            save_shortcut_weights(l, fp);
+        } if (l.type == IMPLICIT) {
+            save_implicit_weights(l, fp);
+        } if(l.type == CONNECTED){
+            save_connected_weights(l, fp);
+        } if(l.type == BATCHNORM){
+            save_batchnorm_weights(l, fp);
+        } if(l.type == RNN){
+            save_connected_weights(*(l.input_layer), fp);
+            save_connected_weights(*(l.self_layer), fp);
+            save_connected_weights(*(l.output_layer), fp);
+        } if(l.type == GRU){
+            save_connected_weights(*(l.input_z_layer), fp);
+            save_connected_weights(*(l.input_r_layer), fp);
+            save_connected_weights(*(l.input_h_layer), fp);
+            save_connected_weights(*(l.state_z_layer), fp);
+            save_connected_weights(*(l.state_r_layer), fp);
+            save_connected_weights(*(l.state_h_layer), fp);
+        } if(l.type == LSTM){
+            save_connected_weights(*(l.wf), fp);
+            save_connected_weights(*(l.wi), fp);
+            save_connected_weights(*(l.wg), fp);
+            save_connected_weights(*(l.wo), fp);
+            save_connected_weights(*(l.uf), fp);
+            save_connected_weights(*(l.ui), fp);
+            save_connected_weights(*(l.ug), fp);
+            save_connected_weights(*(l.uo), fp);
+        } if (l.type == CONV_LSTM) {
+            // sub-layers are always written without the EMA variant;
+            // vf/vi/vo exist in the file only when peephole is set,
+            // wi/wg/wo only when bottleneck is not set
+            if (l.peephole) {
+                save_convolutional_weights(*(l.vf), fp);
+                save_convolutional_weights(*(l.vi), fp);
+                save_convolutional_weights(*(l.vo), fp);
+            }
+            save_convolutional_weights(*(l.wf), fp);
+            if (!l.bottleneck) {
+                save_convolutional_weights(*(l.wi), fp);
+                save_convolutional_weights(*(l.wg), fp);
+                save_convolutional_weights(*(l.wo), fp);
+            }
+            save_convolutional_weights(*(l.uf), fp);
+            save_convolutional_weights(*(l.ui), fp);
+            save_convolutional_weights(*(l.ug), fp);
+            save_convolutional_weights(*(l.uo), fp);
+        } if(l.type == CRNN){
+            save_convolutional_weights(*(l.input_layer), fp);
+            save_convolutional_weights(*(l.self_layer), fp);
+            save_convolutional_weights(*(l.output_layer), fp);
+        } if(l.type == LOCAL){
+#ifdef GPU
+            if(gpu_index >= 0){
+                pull_local_layer(l);
+            }
+#endif
+            // LOCAL layers are written inline: biases[outputs], then one
+            // weight set per output location
+            int locations = l.out_w*l.out_h;
+            int size = l.size*l.size*l.c*l.n*locations;
+            fwrite(l.biases, sizeof(float), l.outputs, fp);
+            fwrite(l.weights, sizeof(float), size, fp);
+        }
+        // flush after every layer
+        fflush(fp);
+    }
+    fclose(fp);
+}
+// Save every layer of `net` to `filename` using the regular (non-EMA) weights.
+void save_weights(network net, char *filename)
+{
+    const int all_layers = net.n;   // cutoff that includes the whole network
+    const int use_ema = 0;
+    save_weights_upto(net, filename, all_layers, use_ema);
+}
+
+// Transpose, in place, the row-major rows x cols matrix stored in `a`.
+// After the call `a` holds the cols x rows transpose, also row-major.
+// A temporary buffer of rows*cols floats is allocated and freed internally.
+void transpose_matrix(float *a, int rows, int cols)
+{
+    float *scratch = (float*)xcalloc(rows * cols, sizeof(float));
+    int r, c;
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            const int src = r*cols + c;   // element (r, c) of the input
+            const int dst = c*rows + r;   // element (c, r) of the transpose
+            scratch[dst] = a[src];
+        }
+    }
+    memcpy(a, scratch, rows*cols*sizeof(float));
+    free(scratch);
+}
+
+// Read a connected layer from fp: biases[l.outputs], weights[l.outputs*l.inputs]
+// and, when l.batch_normalize is set and l.dontloadscales is not, scales,
+// rolling_mean and rolling_variance (l.outputs floats each).
+//
+// transpose: when non-zero the weights are transposed in place after reading,
+//            treating them as an l.inputs x l.outputs matrix.
+// The fread results are not checked.
+void load_connected_weights(layer l, FILE *fp, int transpose)
+{
+    fread(l.biases, sizeof(float), l.outputs, fp);
+    fread(l.weights, sizeof(float), l.outputs*l.inputs, fp);
+    if (transpose) transpose_matrix(l.weights, l.inputs, l.outputs);
+
+    const int with_bn_stats = l.batch_normalize && (!l.dontloadscales);
+    if (with_bn_stats) {
+        fread(l.scales, sizeof(float), l.outputs, fp);
+        fread(l.rolling_mean, sizeof(float), l.outputs, fp);
+        fread(l.rolling_variance, sizeof(float), l.outputs, fp);
+    }
+#ifdef GPU
+    if (gpu_index >= 0) push_connected_layer(l);
+#endif
+}
+
+// Read a batchnorm layer from fp: biases, scales, rolling_mean and
+// rolling_variance, l.c floats each, in that order. The fread results are not
+// checked.
+void load_batchnorm_weights(layer l, FILE *fp)
+{
+    const size_t elem = sizeof(float);
+
+    fread(l.biases,           elem, l.c, fp);
+    fread(l.scales,           elem, l.c, fp);
+    fread(l.rolling_mean,     elem, l.c, fp);
+    fread(l.rolling_variance, elem, l.c, fp);
+#ifdef GPU
+    if (gpu_index >= 0) push_batchnorm_layer(l);
+#endif
+}
+
+// Read a convolutional layer stored in the packed 1-bit-per-weight format.
+//
+// Expects biases[l.n]; if l.batch_normalize is set and l.dontloadscales is not,
+// scales, rolling_mean and rolling_variance (l.n floats each); then, per
+// filter, one float magnitude followed by size/8 bytes. Bit k of byte j selects
+// +magnitude (bit set) or -magnitude (bit clear) for weight j*8+k of the filter.
+// Only size/8 whole bytes are consumed, so when size is not a multiple of 8 the
+// last size%8 weights of each filter are left untouched.
+void load_convolutional_weights_binary(layer l, FILE *fp)
+{
+    fread(l.biases, sizeof(float), l.n, fp);
+    if (l.batch_normalize && (!l.dontloadscales)) {
+        fread(l.scales, sizeof(float), l.n, fp);
+        fread(l.rolling_mean, sizeof(float), l.n, fp);
+        fread(l.rolling_variance, sizeof(float), l.n, fp);
+    }
+
+    // number of weights in one filter
+    const int per_filter = (l.c / l.groups)*l.size*l.size;
+    const int whole_bytes = per_filter/8;
+    int filter, chunk, bit;
+    for (filter = 0; filter < l.n; ++filter) {
+        float magnitude = 0;
+        fread(&magnitude, sizeof(float), 1, fp);
+
+        for (chunk = 0; chunk < whole_bytes; ++chunk) {
+            const int base = filter*per_filter + chunk*8;
+            unsigned char packed = 0;
+            fread(&packed, sizeof(char), 1, fp);
+
+            for (bit = 0; bit < 8 && chunk*8 + bit < per_filter; ++bit) {
+                if (packed & 1<<bit) l.weights[base + bit] = magnitude;
+                else                 l.weights[base + bit] = -magnitude;
+            }
+        }
+    }
+#ifdef GPU
+    if (gpu_index >= 0) push_convolutional_layer(l);
+#endif
+}
+
+// Read a convolutional layer from fp as raw floats.
+//
+// Order: biases[l.n]; if l.batch_normalize is set and l.dontloadscales is not,
+// scales, rolling_mean and rolling_variance (l.n floats each); finally
+// weights[l.nweights]. When l.flipped is set the weights are transposed in
+// place after reading. l.binary is not consulted: the packed binary reader is
+// not called from here.
+//
+// A warning is printed when a read returns some, but fewer than the requested
+// number of, elements. A read that returns nothing is silent here; the caller
+// detects end-of-file itself.
+void load_convolutional_weights(layer l, FILE *fp)
+{
+    int num = l.nweights;
+    int read_count;   // number of floats (not bytes) returned by fread
+
+    read_count = (int)fread(l.biases, sizeof(float), l.n, fp);
+    if (read_count > 0 && read_count < l.n) printf("\n Warning: Unexpected end of wights-file! l.biases - l.index = %d \n", l.index);
+
+    if (l.batch_normalize && (!l.dontloadscales)){
+        read_count = (int)fread(l.scales, sizeof(float), l.n, fp);
+        if (read_count > 0 && read_count < l.n) printf("\n Warning: Unexpected end of wights-file! l.scales - l.index = %d \n", l.index);
+        read_count = (int)fread(l.rolling_mean, sizeof(float), l.n, fp);
+        if (read_count > 0 && read_count < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_mean - l.index = %d \n", l.index);
+        read_count = (int)fread(l.rolling_variance, sizeof(float), l.n, fp);
+        if (read_count > 0 && read_count < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_variance - l.index = %d \n", l.index);
+    }
+
+    read_count = (int)fread(l.weights, sizeof(float), num, fp);
+    // The weights block holds `num` floats, so a short read must be judged
+    // against `num`; comparing against l.n missed almost every truncation.
+    if (read_count > 0 && read_count < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
+
+    if (l.flipped) {
+        transpose_matrix(l.weights, (l.c/l.groups)*l.size*l.size, l.n);
+    }
+#ifdef GPU
+    if(gpu_index >= 0){
+        push_convolutional_layer(l);
+    }
+#endif
+}
+
+// Read the l.nweights float weights of a shortcut layer from fp.
+// Prints a warning when the read returns some, but fewer than l.nweights, floats.
+void load_shortcut_weights(layer l, FILE *fp)
+{
+    int expected = l.nweights;
+    int got = fread(l.weights, sizeof(float), expected, fp);
+    const int truncated = (got > 0) && (got < expected);
+    if (truncated) {
+        printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
+    }
+#ifdef GPU
+    if (gpu_index >= 0) push_shortcut_layer(l);
+#endif
+}
+
+// Read the l.nweights float weights of an implicit layer from fp.
+// Prints a warning when the read returns some, but fewer than l.nweights, floats.
+void load_implicit_weights(layer l, FILE *fp)
+{
+    int expected = l.nweights;
+    int got = fread(l.weights, sizeof(float), expected, fp);
+    const int truncated = (got > 0) && (got < expected);
+    if (truncated) {
+        printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
+    }
+#ifdef GPU
+    if (gpu_index >= 0) push_implicit_layer(l);
+#endif
+}
+
+// Load weights for the first `cutoff` layers of `net` from `filename`.
+//
+// File layout: three int32 values (major, minor, revision), a "seen" counter
+// (uint64 when major*10 + minor >= 2, otherwise uint32), then the per-layer
+// payloads in layer order. Layers with l.dontload set are skipped without
+// consuming anything from the file. Loading stops early once end-of-file has
+// been reached.
+//
+// net:      network to fill; *net->seen and *net->cur_iteration are updated.
+// filename: input path, opened with "rb"; file_error() is called on failure.
+// cutoff:   layers with index >= cutoff are not loaded.
+void load_weights_upto(network *net, char *filename, int cutoff)
+{
+#ifdef GPU
+    if(net->gpu_index >= 0){
+        cuda_set_device(net->gpu_index);
+    }
+#endif
+    fprintf(stderr, "Loading weights from %s...", filename);
+    fflush(stdout);
+    FILE *fp = fopen(filename, "rb");
+    if(!fp) file_error(filename);
+
+    // Zero-initialise the header fields so that a file too short to contain
+    // them yields defined values instead of reading indeterminate ints below.
+    int32_t major = 0;
+    int32_t minor = 0;
+    int32_t revision = 0;
+    size_t header_items = 0;
+    header_items += fread(&major, sizeof(int32_t), 1, fp);
+    header_items += fread(&minor, sizeof(int32_t), 1, fp);
+    header_items += fread(&revision, sizeof(int32_t), 1, fp);
+    if (header_items != 3) {
+        fprintf(stderr, "\n Warning: weights-file %s has a truncated header \n", filename);
+    }
+    if ((major * 10 + minor) >= 2) {
+        printf("\n seen 64");
+        uint64_t iseen = 0;
+        fread(&iseen, sizeof(uint64_t), 1, fp);
+        *net->seen = iseen;
+    }
+    else {
+        printf("\n seen 32");
+        uint32_t iseen = 0;
+        fread(&iseen, sizeof(uint32_t), 1, fp);
+        *net->seen = iseen;
+    }
+    *net->cur_iteration = get_current_batch(*net);
+    printf(", trained: %.0f K-images (%.0f Kilo-batches_64) \n", (float)(*net->seen / 1000), (float)(*net->seen / 64000));
+    // version numbers above 1000 mark files whose connected weights are
+    // transposed after loading
+    int transpose = (major > 1000) || (minor > 1000);
+
+    int i;
+    for(i = 0; i < net->n && i < cutoff; ++i){
+        layer l = net->layers[i];
+        if (l.dontload) continue;
+        // convolutional layers that share another layer's weights have no
+        // payload of their own
+        if(l.type == CONVOLUTIONAL && l.share_layer == NULL){
+            load_convolutional_weights(l, fp);
+        }
+        if (l.type == SHORTCUT && l.nweights > 0) {
+            load_shortcut_weights(l, fp);
+        }
+        if (l.type == IMPLICIT) {
+            load_implicit_weights(l, fp);
+        }
+        if(l.type == CONNECTED){
+            load_connected_weights(l, fp, transpose);
+        }
+        if(l.type == BATCHNORM){
+            load_batchnorm_weights(l, fp);
+        }
+        if(l.type == CRNN){
+            load_convolutional_weights(*(l.input_layer), fp);
+            load_convolutional_weights(*(l.self_layer), fp);
+            load_convolutional_weights(*(l.output_layer), fp);
+        }
+        if(l.type == RNN){
+            load_connected_weights(*(l.input_layer), fp, transpose);
+            load_connected_weights(*(l.self_layer), fp, transpose);
+            load_connected_weights(*(l.output_layer), fp, transpose);
+        }
+        if(l.type == GRU){
+            load_connected_weights(*(l.input_z_layer), fp, transpose);
+            load_connected_weights(*(l.input_r_layer), fp, transpose);
+            load_connected_weights(*(l.input_h_layer), fp, transpose);
+            load_connected_weights(*(l.state_z_layer), fp, transpose);
+            load_connected_weights(*(l.state_r_layer), fp, transpose);
+            load_connected_weights(*(l.state_h_layer), fp, transpose);
+        }
+        if(l.type == LSTM){
+            load_connected_weights(*(l.wf), fp, transpose);
+            load_connected_weights(*(l.wi), fp, transpose);
+            load_connected_weights(*(l.wg), fp, transpose);
+            load_connected_weights(*(l.wo), fp, transpose);
+            load_connected_weights(*(l.uf), fp, transpose);
+            load_connected_weights(*(l.ui), fp, transpose);
+            load_connected_weights(*(l.ug), fp, transpose);
+            load_connected_weights(*(l.uo), fp, transpose);
+        }
+        if (l.type == CONV_LSTM) {
+            // vf/vi/vo are present only when peephole is set,
+            // wi/wg/wo only when bottleneck is not set
+            if (l.peephole) {
+                load_convolutional_weights(*(l.vf), fp);
+                load_convolutional_weights(*(l.vi), fp);
+                load_convolutional_weights(*(l.vo), fp);
+            }
+            load_convolutional_weights(*(l.wf), fp);
+            if (!l.bottleneck) {
+                load_convolutional_weights(*(l.wi), fp);
+                load_convolutional_weights(*(l.wg), fp);
+                load_convolutional_weights(*(l.wo), fp);
+            }
+            load_convolutional_weights(*(l.uf), fp);
+            load_convolutional_weights(*(l.ui), fp);
+            load_convolutional_weights(*(l.ug), fp);
+            load_convolutional_weights(*(l.uo), fp);
+        }
+        if(l.type == LOCAL){
+            // LOCAL layers are read inline: biases[outputs], then one weight
+            // set per output location
+            int locations = l.out_w*l.out_h;
+            int size = l.size*l.size*l.c*l.n*locations;
+            fread(l.biases, sizeof(float), l.outputs, fp);
+            fread(l.weights, sizeof(float), size, fp);
+#ifdef GPU
+            if(gpu_index >= 0){
+                push_local_layer(l);
+            }
+#endif
+        }
+        // stop as soon as a read has hit end-of-file
+        if (feof(fp)) break;
+    }
+    fprintf(stderr, "Done! Loaded %d layers from weights-file \n", i);
+    fclose(fp);
+}
+
+// Load weights for every layer of `net` from `filename`.
+void load_weights(network *net, char *filename)
+{
+    const int all_layers = net->n;   // cutoff that includes the whole network
+    load_weights_upto(net, filename, all_layers);
+}
+
+// Load a network from a cfg file, forcing the batch size.
+//
+// cfg:     path of the cfg file, parsed with parse_network_cfg_custom(cfg, batch, 1).
+// weights: path of the weights file; may be NULL or "" to skip loading weights.
+// clear:   when non-zero, *net->seen and *net->cur_iteration are reset to 0.
+// batch:   batch size passed to the parser.
+//
+// fuse_conv_batchnorm() is applied to the network after the optional weight
+// load. Returns a heap-allocated network (allocated with xcalloc).
+network *load_network_custom(char *cfg, char *weights, int clear, int batch)
+{
+    // `weights` is allowed to be NULL (see the check below); passing NULL to
+    // a %s conversion is undefined behaviour, so print a placeholder instead.
+    printf(" Try to load cfg: %s, weights: %s, clear = %d \n", cfg, weights ? weights : "(null)", clear);
+    network* net = (network*)xcalloc(1, sizeof(network));
+    *net = parse_network_cfg_custom(cfg, batch, 1);
+    if (weights && weights[0] != 0) {
+        printf(" Try to load weights: %s \n", weights);
+        load_weights(net, weights);
+    }
+    fuse_conv_batchnorm(*net);
+    if (clear) {
+        (*net->seen) = 0;
+        (*net->cur_iteration) = 0;
+    }
+    return net;
+}
+
+// Load a network from a cfg file, taking the batch size from the cfg itself.
+//
+// cfg:     path of the cfg file, parsed with parse_network_cfg().
+// weights: path of the weights file; NULL or "" skips loading weights.
+// clear:   when non-zero, *net->seen and *net->cur_iteration are reset to 0.
+//
+// Returns a heap-allocated network (allocated with xcalloc).
+network *load_network(char *cfg, char *weights, int clear)
+{
+    printf(" Try to load cfg: %s, clear = %d \n", cfg, clear);
+
+    network* net = (network*)xcalloc(1, sizeof(network));
+    *net = parse_network_cfg(cfg);
+
+    const int have_weights = (weights != 0) && (weights[0] != 0);
+    if (have_weights) {
+        printf(" Try to load weights: %s \n", weights);
+        load_weights(net, weights);
+    }
+
+    if (!clear) return net;
+
+    // start counting from scratch
+    *net->seen = 0;
+    *net->cur_iteration = 0;
+    return net;
+}
diff --git a/darknet-master/src/parser.h b/darknet-master/src/parser.h
new file mode 100644
index 0000000..0524116
--- /dev/null
+++ b/darknet-master/src/parser.h
@@ -0,0 +1,20 @@
+#ifndef PARSER_H
+#define PARSER_H
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+network parse_network_cfg(char *filename);
+network parse_network_cfg_custom(char *filename, int batch, int time_steps);
+void save_network(network net, char *filename);
+void save_weights(network net, char *filename);
+void save_weights_upto(network net, char *filename, int cutoff, int save_ema);
+void save_weights_double(network net, char *filename);
+void load_weights(network *net, char *filename);
+void load_weights_upto(network *net, char *filename, int cutoff);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/region_layer.c b/darknet-master/src/region_layer.c
new file mode 100644
index 0000000..506cc9f
--- /dev/null
+++ b/darknet-master/src/region_layer.c
@@ -0,0 +1,599 @@
+#include "region_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define DOABS 1
+
+region_layer make_region_layer(int batch, int w, int h, int n, int classes, int coords, int max_boxes)
+{   /* Construct a REGION layer for a w x h grid with n boxes per cell; the layer is returned by value. */
+    region_layer l = { (LAYER_TYPE)0 };
+    l.type = REGION;
+
+    l.n = n;
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = n*(classes + coords + 1);   // per box: coords values, one objectness value, classes scores
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.classes = classes;
+    l.coords = coords;
+    l.cost = (float*)xcalloc(1, sizeof(float));
+    l.biases = (float*)xcalloc(n * 2, sizeof(float));   // two values per box; get_region_box multiplies the predicted w/h by them
+    l.bias_updates = (float*)xcalloc(n * 2, sizeof(float));
+    l.outputs = h*w*n*(classes + coords + 1);
+    l.inputs = l.outputs;   // the layer output has the same element count as its input
+    l.max_boxes = max_boxes;
+    l.truth_size = 4 + 2;   // floats per ground-truth entry; the forward pass reads a box at offset 0 and the class id at offset 4
+    l.truths = max_boxes*l.truth_size;   // ground-truth floats per image
+    l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    int i;
+    for(i = 0; i < n*2; ++i){
+        l.biases[i] = .5;   // default value (presumably replaced later by the cfg parser -- TODO confirm)
+    }
+
+    l.forward = forward_region_layer;
+    l.backward = backward_region_layer;
+#ifdef GPU
+    l.forward_gpu = forward_region_layer_gpu;
+    l.backward_gpu = backward_region_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+
+    fprintf(stderr, "detection\n");
+    srand(time(0));   // NOTE(review): reseeds the global C RNG each time a region layer is built
+
+    return l;
+}
+
+void resize_region_layer(layer *l, int w, int h)
+{   /* Update the layer to a new w x h grid and reallocate its output/delta buffers (CPU and, if built, GPU). */
+#ifdef GPU
+    int old_w = l->w;   // only referenced by the commented-out condition below
+    int old_h = l->h;
+#endif
+    l->w = w;
+    l->h = h;
+
+    l->outputs = h*w*l->n*(l->classes + l->coords + 1);
+    l->inputs = l->outputs;
+
+    l->output = (float*)xrealloc(l->output, l->batch * l->outputs * sizeof(float));   // presumably realloc semantics: grown part is not zeroed -- TODO confirm
+    l->delta = (float*)xrealloc(l->delta, l->batch * l->outputs * sizeof(float));
+
+#ifdef GPU
+    //if (old_w < w || old_h < h)
+    {   // GPU buffers are always freed and recreated, whether the grid grew or shrank
+        cuda_free(l->delta_gpu);
+        cuda_free(l->output_gpu);
+
+        l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
+        l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+    }
+#endif
+}
+
+box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
+{
+    /* Decode the four raw values at x[index..index+3] for box slot n of grid cell (i, j) into a box. */
+    const float *raw = x + index;         /* raw[0], raw[1]: centre offsets; raw[2], raw[3]: log-scale size */
+    const float *prior = biases + 2*n;    /* prior[0] scales the width, prior[1] the height */
+    box decoded;
+    /* centre = cell index + sigmoid(offset), divided by the grid size */
+    decoded.x = (i + logistic_activate(raw[0])) / w;
+    decoded.y = (j + logistic_activate(raw[1])) / h;
+    decoded.w = DOABS ? exp(raw[2]) * prior[0] / w : exp(raw[2]) * prior[0];
+    decoded.h = DOABS ? exp(raw[3]) * prior[1] / h : exp(raw[3]) * prior[1];
+    return decoded;
+}
+
+float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
+{   /* Write the four box-coordinate deltas for the prediction at `index` against `truth`; returns their IoU. */
+    const float *prior = biases + 2*n;
+    box pred = get_region_box(x, biases, n, index, i, j, w, h);
+    float iou = box_iou(pred, truth);
+
+    /* regression targets expressed in the same parameterisation get_region_box decodes */
+    float tx = (truth.x*w - i);
+    float ty = (truth.y*h - j);
+    float tw = DOABS ? log(truth.w*w / prior[0]) : log(truth.w / prior[0]);
+    float th = DOABS ? log(truth.h*h / prior[1]) : log(truth.h / prior[1]);
+
+    float sx = logistic_activate(x[index + 0]);
+    delta[index + 0] = scale * (tx - sx) * logistic_gradient(sx);
+    float sy = logistic_activate(x[index + 1]);
+    delta[index + 1] = scale * (ty - sy) * logistic_gradient(sy);
+    delta[index + 2] = scale * (tw - x[index + 2]);
+    delta[index + 3] = scale * (th - x[index + 3]);
+    return iou;
+}
+
+void delta_region_class(float *output, float *delta, int index, int class_id, int classes, tree *hier, float scale, float *avg_cat, int focal_loss)
+{   /* Write class-score deltas starting at delta[index] for true class `class_id`; adds the predicted probability of the true class to *avg_cat. */
+    int i, n;
+    if(hier){
+        float pred = 1;
+        while(class_id >= 0){   // walk from the labelled class up through its parents until parent[] yields a negative id
+            pred *= output[index + class_id];
+            int g = hier->group[class_id];
+            int offset = hier->group_offset[g];
+            for(i = 0; i < hier->group_size[g]; ++i){
+                delta[index + offset + i] = scale * (0 - output[index + offset + i]);   // every member of the group gets target 0 ...
+            }
+            delta[index + class_id] = scale * (1 - output[index + class_id]);   // ... then the true member is overwritten with target 1
+
+            class_id = hier->parent[class_id];
+        }
+        *avg_cat += pred;   // product of the probabilities along the visited path
+    } else {
+        // Focal loss
+        if (focal_loss) {
+            // Focal Loss
+            float alpha = 0.5;    // 0.25 or 0.5
+            //float gamma = 2;    // hardcoded in many places of the grad-formula
+
+            int ti = index + class_id;
+            float pt = output[ti] + 0.000000000000001F;   // small offset so logf(pt) is not evaluated at exactly 0
+            // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
+            float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832
+            //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);    // https://github.com/unsky/focal-loss
+
+            for (n = 0; n < classes; ++n) {
+                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
+
+                delta[index + n] *= alpha*grad;   // the same focal factor (computed from the true class) scales every class delta
+
+                if (n == class_id) *avg_cat += output[index + n];
+            }
+        }
+        else {
+            // default
+            for (n = 0; n < classes; ++n) {
+                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);   // one-hot target minus prediction
+                if (n == class_id) *avg_cat += output[index + n];
+            }
+        }
+    }
+}
+
+float logit(float x)
+{   /* Log-odds of x: log(x / (1 - x)). The literal 1. is a double, so the arithmetic is done in double and narrowed on return. */
+    return log(x/(1.-x));
+}
+
+float tisnan(float x)
+{   /* Returns 1 when x is NaN (the only value for which x != x holds), otherwise 0; note the result type is float. */
+    return (x != x);
+}
+
+static int entry_index(layer l, int batch, int location, int entry)
+{   /* Offset into l.output of value `entry` for the box at `location` (box slot * w*h + cell) of image `batch`. */
+    const int cells = l.w*l.h;
+    const int slot = location / cells, cell = location % cells;
+    return batch*l.outputs + slot*l.w*l.h*(l.coords + l.classes + 1) + entry*l.w*l.h + cell;
+}
+
+void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
+void forward_region_layer(const region_layer l, network_state state)
+{   // Copy the input into l.output, activate it, and (when state.train is set) fill l.delta and *l.cost from state.truth.
+    int i,j,b,t,n;
+    int size = l.coords + l.classes + 1;   // values per predicted box
+    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    #ifndef GPU
+    flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
+    #endif
+    for (b = 0; b < l.batch; ++b){
+        for(i = 0; i < l.h*l.w*l.n; ++i){
+            int index = size*i + b*l.outputs;
+            l.output[index + 4] = logistic_activate(l.output[index + 4]);   // objectness; offset 4 is hard-coded, i.e. matches l.coords only when coords == 4
+        }
+    }
+
+    // In GPU builds the softmax below is skipped here; forward_region_layer_gpu applies it on the device first.
+#ifndef GPU
+    if (l.softmax_tree){
+        for (b = 0; b < l.batch; ++b){
+            for(i = 0; i < l.h*l.w*l.n; ++i){
+                int index = size*i + b*l.outputs;
+                softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5);
+            }
+        }
+    } else if (l.softmax){
+        for (b = 0; b < l.batch; ++b){
+            for(i = 0; i < l.h*l.w*l.n; ++i){
+                int index = size*i + b*l.outputs;
+                softmax(l.output + index + 5, l.classes, 1, l.output + index + 5, 1);
+            }
+        }
+    }
+#endif
+    if(!state.train) return;   // inference: only the activated output is needed
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    float avg_iou = 0;
+    float recall = 0;
+    float avg_cat = 0;
+    float avg_obj = 0;
+    float avg_anyobj = 0;
+    int count = 0;
+    int class_count = 0;
+    *(l.cost) = 0;
+    for (b = 0; b < l.batch; ++b) {
+        if(l.softmax_tree){
+            int onlyclass_id = 0;
+            for(t = 0; t < l.max_boxes; ++t){
+                box truth = float_to_box(state.truth + t*l.truth_size + b*l.truths);
+                if(!truth.x) break; // continue;
+                int class_id = state.truth[t*l.truth_size + b*l.truths + 4];
+                float maxp = 0;
+                int maxi = 0;
+                if(truth.x > 100000 && truth.y > 100000){   // sentinel coordinates: this truth entry is handled as a class-only label
+                    for(n = 0; n < l.n*l.w*l.h; ++n){
+                        int index = size*n + b*l.outputs + 5;
+                        float scale =  l.output[index-1];   // objectness of box n
+                        float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class_id);
+                        if(p > maxp){
+                            maxp = p;
+                            maxi = n;
+                        }
+                    }
+                    int index = size*maxi + b*l.outputs + 5;
+                    delta_region_class(l.output, l.delta, index, class_id, l.classes, l.softmax_tree, l.class_scale, &avg_cat, l.focal_loss);
+                    ++class_count;
+                    onlyclass_id = 1;
+                    break;
+                }
+            }
+            if(onlyclass_id) continue;   // class-only image: no box/objectness deltas for this batch item
+        }
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
+                    box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                    float best_iou = 0;
+                    int best_class_id = -1;
+                    for(t = 0; t < l.max_boxes; ++t){
+                        box truth = float_to_box(state.truth + t*l.truth_size + b*l.truths);
+                        int class_id = state.truth[t * l.truth_size + b*l.truths + 4];
+                        if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
+                        if(!truth.x) break; // continue;
+                        float iou = box_iou(pred, truth);
+                        if (iou > best_iou) {
+                            best_class_id = state.truth[t*l.truth_size + b*l.truths + 4];
+                            best_iou = iou;
+                        }
+                    }
+                    avg_anyobj += l.output[index + 4];
+                    l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));   // default: push objectness towards 0
+                    if(l.classfix == -1) l.delta[index + 4] = l.noobject_scale * ((best_iou - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
+                    else{
+                        if (best_iou > l.thresh) {
+                            l.delta[index + 4] = 0;   // overlaps a truth well enough: no no-object penalty
+                            if(l.classfix > 0){
+                                delta_region_class(l.output, l.delta, index + 5, best_class_id, l.classes, l.softmax_tree, l.class_scale*(l.classfix == 2 ? l.output[index + 4] : 1), &avg_cat, l.focal_loss);
+                                ++class_count;
+                            }
+                        }
+                    }
+
+                    if(*(state.net.seen) < 12800){   // while the seen counter is below 12800, pull every box towards its prior at the cell centre (scale .01)
+                        box truth = {0};
+                        truth.x = (i + .5)/l.w;
+                        truth.y = (j + .5)/l.h;
+                        truth.w = l.biases[2*n];
+                        truth.h = l.biases[2*n+1];
+                        if(DOABS){
+                            truth.w = l.biases[2*n]/l.w;
+                            truth.h = l.biases[2*n+1]/l.h;
+                        }
+                        delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
+                    }
+                }
+            }
+        }
+        for(t = 0; t < l.max_boxes; ++t){   // second pass: one responsible box per ground-truth entry
+            box truth = float_to_box(state.truth + t*l.truth_size + b*l.truths);
+            int class_id = state.truth[t * l.truth_size + b*l.truths + 4];
+            if (class_id >= l.classes) {
+                printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes-1);
+                continue; // if label contains class_id more than number of classes in the cfg-file
+            }
+
+            if(!truth.x) break; // continue;
+            float best_iou = 0;
+            int best_index = 0;
+            int best_n = 0;
+            i = (truth.x * l.w);   // NOTE(review): truth.x == 1 would give i == l.w, outside the grid -- presumably labels are < 1, confirm
+            j = (truth.y * l.h);
+            //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
+            box truth_shift = truth;
+            truth_shift.x = 0;   // compare shapes only: both boxes are moved to the origin
+            truth_shift.y = 0;
+            //printf("index %d %d\n",i, j);
+            for(n = 0; n < l.n; ++n){
+                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
+                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                if(l.bias_match){   // match against the prior size instead of the predicted size
+                    pred.w = l.biases[2*n];
+                    pred.h = l.biases[2*n+1];
+                    if(DOABS){
+                        pred.w = l.biases[2*n]/l.w;
+                        pred.h = l.biases[2*n+1]/l.h;
+                    }
+                }
+                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
+                pred.x = 0;
+                pred.y = 0;
+                float iou = box_iou(pred, truth_shift);
+                if (iou > best_iou){
+                    best_index = index;
+                    best_iou = iou;
+                    best_n = n;
+                }
+            }
+            //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
+
+            float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
+            if(iou > .5) recall += 1;
+            avg_iou += iou;
+
+            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
+            avg_obj += l.output[best_index + 4];
+            l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+            if (l.rescore) {   // use the achieved IoU, not 1, as the objectness target
+                l.delta[best_index + 4] = l.object_scale * (iou - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+            }
+
+            if (l.map) class_id = l.map[class_id];
+            delta_region_class(l.output, l.delta, best_index + 5, class_id, l.classes, l.softmax_tree, l.class_scale, &avg_cat, l.focal_loss);
+            ++count;
+            ++class_count;
+        }
+    }
+    //printf("\n");
+    #ifndef GPU
+    flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
+    #endif
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);   // square of mag_array(delta) (presumably the L2 norm -- TODO confirm)
+    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);   // NOTE(review): count == 0 makes these float divisions by zero
+}
+
+void backward_region_layer(const region_layer l, network_state state)
+{   /* Pass l.delta to state.delta through axpy_cpu with factor 1 and unit strides (presumably an accumulate, y += a*x -- TODO confirm). */
+    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
+}
+
+void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map)
+{   /* Decode every box of l.output into boxes[] (scaled by w, h) and fill probs[box][class] with thresholded scores. */
+    int i;
+    float *const predictions = l.output;
+    #pragma omp parallel for
+    for (i = 0; i < l.w*l.h; ++i){
+        int j, n;
+        int row = i / l.w;
+        int col = i % l.w;
+        for(n = 0; n < l.n; ++n){
+            int index = i*l.n + n;
+            int p_index = index * (l.classes + 5) + 4;   // stride classes + 5 is hard-coded: 4 coords + objectness + classes
+            float scale = predictions[p_index];   // objectness
+            if(l.classfix == -1 && scale < .5) scale = 0;
+            int box_index = index * (l.classes + 5);
+            boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);
+            boxes[index].x *= w;
+            boxes[index].y *= h;
+            boxes[index].w *= w;
+            boxes[index].h *= h;
+
+            int class_index = index * (l.classes + 5) + 5;
+            if(l.softmax_tree){
+
+                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);
+                int found = 0;
+                if(map){
+                    for(j = 0; j < 200; ++j){   // NOTE(review): 200 is hard-coded; assumes map and probs[index] hold at least 200 entries
+                        float prob = scale*predictions[class_index+map[j]];
+                        probs[index][j] = (prob > thresh) ? prob : 0;
+                    }
+                } else {
+                    for(j = l.classes - 1; j >= 0; --j){   // scanning from the last class: keep the first one above .5, zero all others in place
+                        if(!found && predictions[class_index + j] > .5){
+                            found = 1;
+                        } else {
+                            predictions[class_index + j] = 0;
+                        }
+                        float prob = predictions[class_index+j];
+                        probs[index][j] = (scale > thresh) ? prob : 0;
+                    }
+                }
+            } else {
+                for(j = 0; j < l.classes; ++j){
+                    float prob = scale*predictions[class_index+j];   // objectness times class score
+                    probs[index][j] = (prob > thresh) ? prob : 0;
+                }
+            }
+            if(only_objectness){
+                probs[index][0] = scale;   // slot 0 is overwritten with the raw objectness
+            }
+        }
+    }
+}
+
+#ifdef GPU
+
+void forward_region_layer_gpu(const region_layer l, network_state state)
+{   // GPU entry point: flatten + softmax on the device, then pull the data to the host and run forward_region_layer there.
+    /*
+       if(!state.train){
+       copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
+       return;
+       }
+     */
+    flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu);
+    if(l.softmax_tree){
+        int i;
+        int count = 5;   // class scores start after 4 coords + objectness
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count);
+            count += group_size;
+        }
+    }else if (l.softmax){
+        softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5);
+    }
+
+    float* in_cpu = (float*)xcalloc(l.batch * l.inputs, sizeof(float));   // host copy of the device output, used as the CPU pass input
+    float *truth_cpu = 0;
+    if(state.truth){
+        int num_truth = l.batch*l.truths;
+        truth_cpu = (float*)xcalloc(num_truth, sizeof(float));
+        cuda_pull_array(state.truth, truth_cpu, num_truth);
+    }
+    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
+    //cudaStreamSynchronize(get_cuda_stream());
+    network_state cpu_state = state;
+    cpu_state.train = state.train;
+    cpu_state.truth = truth_cpu;
+    cpu_state.input = in_cpu;
+    forward_region_layer(l, cpu_state);
+    //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
+    free(cpu_state.input);
+    if(cpu_state.truth) free(cpu_state.truth);   // freed before the early return: previously leaked when truth was set but train was 0
+    if(!state.train) return;
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);   // only the deltas go back to the device
+    //cudaStreamSynchronize(get_cuda_stream());
+}
+
+void backward_region_layer_gpu(region_layer l, network_state state)
+{   /* Un-flatten l.delta_gpu into state.delta (flag 0, the opposite of the flag 1 used in forward_region_layer_gpu). */
+    flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta);
+}
+#endif
+
+
+void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
+{
+    /* Map the n boxes in dets from network-input coordinates (netw x neth) back to the w x h image, in place. */
+    int k;
+    int fit_w = 0, fit_h = 0;   /* image size after aspect-preserving scaling into netw x neth */
+    if (((float)netw / w) < ((float)neth / h)) {
+        fit_w = netw;
+        fit_h = (h * netw) / w;
+    }
+    else {
+        fit_h = neth;
+        fit_w = (w * neth) / h;
+    }
+    for (k = 0; k < n; ++k) {
+        box *bb = &dets[k].bbox;
+        /* subtract the padding offset, then stretch the fitted area back to the full 0..1 range */
+        bb->x = (bb->x - (netw - fit_w) / 2. / netw) / ((float)fit_w / netw);
+        bb->y = (bb->y - (neth - fit_h) / 2. / neth) / ((float)fit_h / neth);
+        bb->w *= (float)netw / fit_w;
+        bb->h *= (float)neth / fit_h;
+        if (relative) continue;
+        /* absolute output requested: scale by the image size */
+        bb->x *= w;
+        bb->w *= w;
+        bb->y *= h;
+        bb->h *= h;
+    }
+}
+
+
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets)
+{   /* Fill dets[] (one entry per box of batch item 0) with box, objectness and class probabilities, then map the boxes back to the w x h image. */
+    int i, j, n, z;
+    float *predictions = l.output;
+    if (l.batch == 2) {   // the second batch item is mirrored left-right and averaged into the first (presumably a flipped copy of the image -- TODO confirm)
+        float *flip = l.output + l.outputs;
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w / 2; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    for (z = 0; z < l.classes + l.coords + 1; ++z) {
+                        int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                        int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                        float swap = flip[i1];
+                        flip[i1] = flip[i2];
+                        flip[i2] = swap;
+                        if (z == 0) {   // entry 0 is additionally negated after the swap
+                            flip[i1] = -flip[i1];
+                            flip[i2] = -flip[i2];
+                        }
+                    }
+                }
+            }
+        }
+        for (i = 0; i < l.outputs; ++i) {
+            l.output[i] = (l.output[i] + flip[i]) / 2.;
+        }
+    }
+    for (i = 0; i < l.w*l.h; ++i) {
+        int row = i / l.w;
+        int col = i % l.w;
+        for (n = 0; n < l.n; ++n) {
+            int index = n*l.w*l.h + i;
+            for (j = 0; j < l.classes; ++j) {
+                dets[index].prob[j] = 0;
+            }
+            int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords);
+            int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
+            int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4);
+            float scale = l.background ? 1 : predictions[obj_index];
+            dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);// , l.w*l.h);
+            dets[index].objectness = scale > thresh ? scale : 0;
+            if (dets[index].mask) {
+                for (j = 0; j < l.coords - 4; ++j) {   // copies the l.coords - 4 entries that follow the first four
+                    dets[index].mask[j] = l.output[mask_index + j*l.w*l.h];
+                }
+            }
+
+            int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background);
+            if (l.softmax_tree) {
+
+                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);// , l.w*l.h);
+                if (map) {
+                    for (j = 0; j < 200; ++j) {   // NOTE(review): 200 is hard-coded; assumes map and prob[] hold at least 200 entries
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]);   // shadows the outer class_index
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
+                    }
+                }
+                else {
+                    int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);   // shadows the outer j
+                    dets[index].prob[j] = (scale > thresh) ? scale : 0;
+                }
+            }
+            else {
+                if (dets[index].objectness) {   // class scores are filled only for boxes whose objectness passed thresh
+                    for (j = 0; j < l.classes; ++j) {
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j);
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
+                    }
+                }
+            }
+        }
+    }
+    correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
+}
+
+void zero_objectness(layer l)
+{   /* Set the objectness entry (entry l.coords) of every box of batch item 0 in l.output to 0. */
+    const int cells = l.w*l.h;
+    int slot, cell;
+    for (slot = 0; slot < l.n; ++slot) {
+        for (cell = 0; cell < cells; ++cell) {
+            l.output[entry_index(l, 0, slot*cells + cell, l.coords)] = 0;
+        }
+    }
+}
diff --git a/darknet-master/src/region_layer.h b/darknet-master/src/region_layer.h
new file mode 100644
index 0000000..e616624
--- /dev/null
+++ b/darknet-master/src/region_layer.h
@@ -0,0 +1,29 @@
+#ifndef REGION_LAYER_H
+#define REGION_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+typedef layer region_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+region_layer make_region_layer(int batch, int w, int h, int n, int classes, int coords, int max_boxes);
+void forward_region_layer(const region_layer l, network_state state);
+void backward_region_layer(const region_layer l, network_state state);
+void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map);
+void resize_region_layer(layer *l, int w, int h);
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets);
+void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative);
+void zero_objectness(layer l);
+
+#ifdef GPU
+void forward_region_layer_gpu(const region_layer l, network_state state);
+void backward_region_layer_gpu(region_layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/reorg_layer.c b/darknet-master/src/reorg_layer.c
new file mode 100644
index 0000000..7a4c0ae
--- /dev/null
+++ b/darknet-master/src/reorg_layer.c
@@ -0,0 +1,119 @@
+#include "reorg_layer.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "utils.h"
+#include <stdio.h>
+
+
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
+{   /* Construct a REORG layer that trades spatial size for channels by `stride` (or the opposite when `reverse` is set). */
+    layer l = { (LAYER_TYPE)0 };
+    l.type = REORG;
+    l.batch = batch;
+    l.stride = stride;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    if(reverse){   // reverse: larger spatial size, fewer channels
+        l.out_w = w*stride;
+        l.out_h = h*stride;
+        l.out_c = c/(stride*stride);
+    }else{   // forward: smaller spatial size, more channels
+        l.out_w = w/stride;
+        l.out_h = h/stride;
+        l.out_c = c*(stride*stride);
+    }
+    l.reverse = reverse;
+    fprintf(stderr, "reorg                    /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = h*w*c;
+    int output_size = l.out_h * l.out_w * l.out_c * batch;   // element count for the whole batch
+    l.output = (float*)xcalloc(output_size, sizeof(float));
+    l.delta = (float*)xcalloc(output_size, sizeof(float));
+
+    l.forward = forward_reorg_layer;
+    l.backward = backward_reorg_layer;
+#ifdef GPU
+    l.forward_gpu = forward_reorg_layer_gpu;
+    l.backward_gpu = backward_reorg_layer_gpu;
+
+    l.output_gpu  = cuda_make_array(l.output, output_size);
+    l.delta_gpu   = cuda_make_array(l.delta, output_size);
+#endif
+    return l;
+}
+
+void resize_reorg_layer(layer *l, int w, int h)
+{
+    /* Recompute the output geometry for a new w x h input and reallocate the output/delta buffers. */
+    const int s = l->stride;
+    const int block = s*s;   /* channel factor gained or lost per output position */
+
+    l->h = h;
+    l->w = w;
+    if(!l->reverse){
+        l->out_w = w/s;
+        l->out_h = h/s;
+        l->out_c = l->c*block;
+    }else{
+        l->out_w = w*s;
+        l->out_h = h*s;
+        l->out_c = l->c/block;
+    }
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->outputs;
+    const int total = l->outputs * l->batch;
+    const size_t bytes = total * sizeof(float);
+    l->output = (float*)xrealloc(l->output, bytes);
+    l->delta = (float*)xrealloc(l->delta, bytes);
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, total);
+    l->delta_gpu   = cuda_make_array(l->delta,  total);
+#endif
+}
+
+void forward_reorg_layer(const layer l, network_state state)
+{
+    /* Reorganise state.input into l.output on the CPU.
+     * reorg_cpu is given the layer's output dimensions and a
+     * direction flag: 1 for a reverse layer, 0 otherwise.
+     */
+    const int direction = l.reverse ? 1 : 0;
+    reorg_cpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, l.output);
+}
+
+void backward_reorg_layer(const layer l, network_state state)
+{
+    /* Route l.delta back into state.delta on the CPU.
+     * The direction flag is the opposite of the forward pass:
+     * 0 for a reverse layer, 1 otherwise.
+     */
+    const int direction = l.reverse ? 0 : 1;
+    reorg_cpu(l.delta, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, state.delta);
+}
+
+#ifdef GPU
+void forward_reorg_layer_gpu(layer l, network_state state)
+{
+    /* GPU counterpart of forward_reorg_layer:
+     * reorganise state.input into l.output_gpu, with
+     * direction flag 1 for a reverse layer and 0 otherwise.
+     */
+    const int direction = l.reverse ? 1 : 0;
+    reorg_ongpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, l.output_gpu);
+}
+
+void backward_reorg_layer_gpu(layer l, network_state state)
+{
+    /* GPU counterpart of backward_reorg_layer:
+     * route l.delta_gpu back into state.delta, with
+     * direction flag 0 for a reverse layer and 1 otherwise.
+     */
+    const int direction = l.reverse ? 0 : 1;
+    reorg_ongpu(l.delta_gpu, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, state.delta);
+}
+#endif
diff --git a/darknet-master/src/reorg_layer.h b/darknet-master/src/reorg_layer.h
new file mode 100644
index 0000000..6318568
--- /dev/null
+++ b/darknet-master/src/reorg_layer.h
@@ -0,0 +1,26 @@
+#ifndef REORG_LAYER_H
+#define REORG_LAYER_H
+
+#include "image.h"
+#include "dark_cuda.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse);
+void resize_reorg_layer(layer *l, int w, int h);
+void forward_reorg_layer(const layer l, network_state state);
+void backward_reorg_layer(const layer l, network_state state);
+
+#ifdef GPU
+void forward_reorg_layer_gpu(layer l, network_state state);
+void backward_reorg_layer_gpu(layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/reorg_old_layer.c b/darknet-master/src/reorg_old_layer.c
new file mode 100644
index 0000000..cb715e6
--- /dev/null
+++ b/darknet-master/src/reorg_old_layer.c
@@ -0,0 +1,119 @@
+#include "reorg_old_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <stdio.h>
+
+
+layer make_reorg_old_layer(int batch, int w, int h, int c, int stride, int reverse)
+{   /* Construct a REORG_OLD layer; same geometry as make_reorg_layer, but its forward/backward pass the input dims (w, h, c) to reorg_cpu. */
+    layer l = { (LAYER_TYPE)0 };
+    l.type = REORG_OLD;
+    l.batch = batch;
+    l.stride = stride;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    if(reverse){   // reverse: larger spatial size, fewer channels
+        l.out_w = w*stride;
+        l.out_h = h*stride;
+        l.out_c = c/(stride*stride);
+    }else{   // forward: smaller spatial size, more channels
+        l.out_w = w/stride;
+        l.out_h = h/stride;
+        l.out_c = c*(stride*stride);
+    }
+    l.reverse = reverse;
+    fprintf(stderr, "reorg_old              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = h*w*c;
+    int output_size = l.out_h * l.out_w * l.out_c * batch;   // element count for the whole batch
+    l.output = (float*)xcalloc(output_size, sizeof(float));
+    l.delta = (float*)xcalloc(output_size, sizeof(float));
+
+    l.forward = forward_reorg_old_layer;
+    l.backward = backward_reorg_old_layer;
+#ifdef GPU
+    l.forward_gpu = forward_reorg_old_layer_gpu;
+    l.backward_gpu = backward_reorg_old_layer_gpu;
+
+    l.output_gpu  = cuda_make_array(l.output, output_size);
+    l.delta_gpu   = cuda_make_array(l.delta, output_size);
+#endif
+    return l;
+}
+
+void resize_reorg_old_layer(layer *l, int w, int h)
+{
+    /* Recompute the output geometry for a new w x h input and reallocate the output/delta buffers. */
+    const int s = l->stride;
+    const int block = s*s;   /* channel factor gained or lost per output position */
+
+    l->h = h;
+    l->w = w;
+    if(!l->reverse){
+        l->out_w = w/s;
+        l->out_h = h/s;
+        l->out_c = l->c*block;
+    }else{
+        l->out_w = w*s;
+        l->out_h = h*s;
+        l->out_c = l->c/block;
+    }
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->outputs;
+    const int total = l->outputs * l->batch;
+    const size_t bytes = total * sizeof(float);
+    l->output = (float*)xrealloc(l->output, bytes);
+    l->delta = (float*)xrealloc(l->delta, bytes);
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, total);
+    l->delta_gpu   = cuda_make_array(l->delta,  total);
+#endif
+}
+
+void forward_reorg_old_layer(const layer l, network_state state)
+{
+    /* Reorganise state.input into l.output on the CPU.
+     * reorg_cpu is given the layer's input dimensions (w, h, c)
+     * and a direction flag: 1 for a reverse layer, 0 otherwise.
+     */
+    const int direction = l.reverse ? 1 : 0;
+    reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, direction, l.output);
+}
+
+void backward_reorg_old_layer(const layer l, network_state state)
+{
+    /* Route l.delta back into state.delta on the CPU.
+     * The direction flag is the opposite of the forward pass:
+     * 0 for a reverse layer, 1 otherwise.
+     */
+    const int direction = l.reverse ? 0 : 1;
+    reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, direction, state.delta);
+}
+
+#ifdef GPU
+void forward_reorg_old_layer_gpu(layer l, network_state state)
+{
+    /* GPU counterpart of forward_reorg_old_layer:
+     * reorganise state.input into l.output_gpu, with
+     * direction flag 1 for a reverse layer and 0 otherwise.
+     */
+    const int direction = l.reverse ? 1 : 0;
+    reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, direction, l.output_gpu);
+}
+
+void backward_reorg_old_layer_gpu(layer l, network_state state)
+{
+    /* GPU counterpart of backward_reorg_old_layer:
+     * route l.delta_gpu back into state.delta, with
+     * direction flag 0 for a reverse layer and 1 otherwise.
+     */
+    const int direction = l.reverse ? 0 : 1;
+    reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, direction, state.delta);
+}
+#endif
diff --git a/darknet-master/src/reorg_old_layer.h b/darknet-master/src/reorg_old_layer.h
new file mode 100644
index 0000000..caa8c91
--- /dev/null
+++ b/darknet-master/src/reorg_old_layer.h
@@ -0,0 +1,26 @@
+#ifndef REORG_OLD_LAYER_H  // include guard
+#define REORG_OLD_LAYER_H
+
+#include "image.h"
+#include "dark_cuda.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {  // keep C linkage when included from C++
+#endif
+layer make_reorg_old_layer(int batch, int w, int h, int c, int stride, int reverse);  // layer constructor (defined in reorg_old_layer.c)
+void resize_reorg_old_layer(layer *l, int w, int h);  // recomputes the output dimensions and reallocates l->output / l->delta
+void forward_reorg_old_layer(const layer l, network_state state);  // CPU forward: reorg_cpu from state.input into l.output
+void backward_reorg_old_layer(const layer l, network_state state);  // CPU backward: reorg_cpu from l.delta into state.delta, opposite flag
+
+#ifdef GPU
+void forward_reorg_old_layer_gpu(layer l, network_state state);  // GPU forward: reorg_ongpu into l.output_gpu
+void backward_reorg_old_layer_gpu(layer l, network_state state);  // GPU backward: reorg_ongpu from l.delta_gpu into state.delta
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/representation_layer.c b/darknet-master/src/representation_layer.c
new file mode 100644
index 0000000..3efe74f
--- /dev/null
+++ b/darknet-master/src/representation_layer.c
@@ -0,0 +1,157 @@
+#include "representation_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <stdio.h>
+#include <assert.h>
+
+/*
+ * Build an IMPLICIT layer holding filters*atoms trainable weights.
+ * The declared input shape is 1x1x1 and the output shape is 1 x atoms x filters.
+ * Every weight starts at mean_init + rand_uniform(-std_init, std_init).
+ * Host buffers are zero-initialised; with GPU defined they are mirrored on the device.
+ */
+layer make_implicit_layer(int batch, int index, float mean_init, float std_init, int filters, int atoms)
+{
+    int k;
+    layer l = { (LAYER_TYPE)0 };
+
+    fprintf(stderr,"implicit Layer: %d x %d \t mean=%.2f, std=%.2f \n", filters, atoms, mean_init, std_init);
+
+    l.type = IMPLICIT;
+    l.index = index;
+    l.batch = batch;
+
+    /* declared input shape */
+    l.w = 1;
+    l.h = 1;
+    l.c = 1;
+    l.inputs = 1;
+
+    /* output shape: one value per (filter, atom) pair */
+    l.out_w = 1;
+    l.out_h = atoms;
+    l.out_c = filters;
+    l.outputs = l.out_w*l.out_h*l.out_c;
+    l.nweights = l.out_w * l.out_h * l.out_c;
+
+    l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));
+    l.weights = (float*)xcalloc(l.nweights, sizeof(float));
+    for (k = 0; k < l.nweights; ++k) {
+        l.weights[k] = mean_init + rand_uniform(-std_init, std_init);
+    }
+
+    l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+
+    l.forward = forward_implicit_layer;
+    l.backward = backward_implicit_layer;
+    l.update = update_implicit_layer;
+#ifdef GPU
+    l.forward_gpu = forward_implicit_layer_gpu;
+    l.backward_gpu = backward_implicit_layer_gpu;
+    l.update_gpu = update_implicit_layer_gpu;
+
+    /* device copies of the host buffers allocated above */
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
+    l.weights_gpu = cuda_make_array(l.weights, l.nweights);
+#endif
+    return l;
+}
+
+/* Intentionally a no-op: nothing in the layer is changed on resize. */
+void resize_implicit_layer(layer *l, int w, int h)
+{
+    (void)l;
+    (void)w;
+    (void)h;
+}
+
+/* CPU forward pass: copy the weight vector into the output once per batch item.
+ * state is not read. */
+void forward_implicit_layer(const layer l, network_state state)
+{
+    const int total = l.nweights * l.batch;
+    int idx;
+    #pragma omp parallel for
+    for (idx = 0; idx < total; ++idx) {
+        const int w = idx % l.nweights;
+        l.output[idx] = l.weights[w];
+    }
+}
+
+/* CPU backward pass: accumulate the delta of every batch item into weight_updates.
+ * state is not read. */
+void backward_implicit_layer(const layer l, network_state state)
+{
+    int b, w;
+    /* batch-major walk keeps the per-weight summation order of a flat index loop */
+    for (b = 0; b < l.batch; ++b) {
+        const float *batch_delta = l.delta + b * l.nweights;
+        for (w = 0; w < l.nweights; ++w) {
+            l.weight_updates[w] += batch_delta[w];
+        }
+    }
+}
+
+/*
+ * CPU weight update:
+ *   weight_updates -= decay * batch * weights
+ *   weights        += (learning_rate_init * l.learning_rate_scale / batch) * weight_updates
+ *   weight_updates *= momentum
+ */
+void update_implicit_layer(layer l, int batch, float learning_rate_init, float momentum, float decay)
+{
+    const int n = l.nweights;
+    float learning_rate = learning_rate_init*l.learning_rate_scale;
+
+    axpy_cpu(n, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(n, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(n, momentum, l.weight_updates, 1);
+}
+
+
+#ifdef GPU
+/* GPU forward pass: delegates to forward_implicit_gpu; state is not read. */
+void forward_implicit_layer_gpu(const layer l, network_state state)
+{
+    (void)state;
+    forward_implicit_gpu(l.batch, l.nweights, l.weights_gpu, l.output_gpu);
+}
+
+/* GPU backward pass: delegates to backward_implicit_gpu; state is not read. */
+void backward_implicit_layer_gpu(const layer l, network_state state)
+{
+    (void)state;
+    backward_implicit_gpu(l.batch, l.nweights, l.weight_updates_gpu, l.delta_gpu);
+}
+
+/*
+ * GPU weight update. NaN/Inf values are scrubbed from the gradients and weights
+ * first, then either the Adam step (l.adam) or the SGD-with-momentum step is
+ * applied, and finally the weights are clamped when l.clip is set.
+ */
+void update_implicit_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
+{
+    // Loss scale for Mixed-Precision on Tensor-Cores
+    const float learning_rate = learning_rate_init*l.learning_rate_scale / loss_scale;
+    const int n = l.nweights;
+
+    reset_nan_and_inf(l.weight_updates_gpu, n);
+    fix_nan_and_inf(l.weights_gpu, n);
+
+    if (!l.adam) {
+        // the decay term is multiplied by loss_scale; the learning rate above is divided by it
+        axpy_ongpu(n, -decay*batch*loss_scale, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_ongpu(n, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_ongpu(n, momentum, l.weight_updates_gpu, 1);
+    }
+    else {
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, n, batch, l.t);
+    }
+
+    if (l.clip) constrain_ongpu(n, l.clip, l.weights_gpu, 1);
+}
+
+/* Copy weights and weight updates (plus the Adam buffers when l.adam is set)
+ * from the device to the host, then wait on the CUDA stream. */
+void pull_implicit_layer(layer l)
+{
+    const int n = l.nweights;
+
+    cuda_pull_array_async(l.weights_gpu, l.weights, n);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, n);
+    if (l.adam) {
+        cuda_pull_array_async(l.m_gpu, l.m, n);
+        cuda_pull_array_async(l.v_gpu, l.v, n);
+    }
+
+    CHECK_CUDA(cudaPeekAtLastError());
+    cudaStreamSynchronize(get_cuda_stream());
+}
+
+/* Copy weights from the host to the device; weight updates are pushed only
+ * when l.train is set and the Adam buffers only when l.adam is set. */
+void push_implicit_layer(layer l)
+{
+    const int n = l.nweights;
+
+    cuda_push_array(l.weights_gpu, l.weights, n);
+    if (l.train) cuda_push_array(l.weight_updates_gpu, l.weight_updates, n);
+    if (l.adam) {
+        cuda_push_array(l.m_gpu, l.m, n);
+        cuda_push_array(l.v_gpu, l.v, n);
+    }
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+#endif
diff --git a/darknet-master/src/representation_layer.h b/darknet-master/src/representation_layer.h
new file mode 100644
index 0000000..8b2a9da
--- /dev/null
+++ b/darknet-master/src/representation_layer.h
@@ -0,0 +1,29 @@
+#ifndef REPRESENTATION_LAYER_H  // include guard
+#define REPRESENTATION_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {  // keep C linkage when included from C++
+#endif
+layer make_implicit_layer(int batch, int index, float mean_init, float std_init, int filters, int atoms);  // builds an IMPLICIT layer with filters*atoms weights
+void forward_implicit_layer(const layer l, network_state state);  // copies the weights into the output for every batch item
+void backward_implicit_layer(const layer l, network_state state);  // accumulates l.delta into l.weight_updates
+void update_implicit_layer(layer l, int batch, float learning_rate_init, float momentum, float decay);  // CPU weight update with decay and momentum
+
+void resize_implicit_layer(layer *l, int w, int h);  // currently an empty function
+
+#ifdef GPU
+void forward_implicit_layer_gpu(const layer l, network_state state);  // GPU counterpart of forward_implicit_layer
+void backward_implicit_layer_gpu(const layer l, network_state state);  // GPU counterpart of backward_implicit_layer
+
+void update_implicit_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale);  // GPU weight update; loss_scale divides the learning rate
+void pull_implicit_layer(layer l);  // device -> host copy of weights, weight updates and Adam buffers
+void push_implicit_layer(layer l);  // host -> device copy of weights (updates only when l.train, Adam buffers only when l.adam)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // REPRESENTATION_LAYER_H
diff --git a/darknet-master/src/rnn.c b/darknet-master/src/rnn.c
new file mode 100644
index 0000000..ef2c7cc
--- /dev/null
+++ b/darknet-master/src/rnn.c
@@ -0,0 +1,498 @@
+#include "network.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "blas.h"
+#include "parser.h"
+
+typedef struct {  // pair of buffers returned by get_rnn_data / get_rnn_token_data
+    float *x;  // one-hot encoding of the current symbol for every (step, stream)
+    float *y;  // one-hot encoding of the following symbol, same layout as x
+} float_pair;
+
+/*
+ * Read whitespace-separated integers from `filename`.
+ *
+ * Returns a heap array holding the values and stores their count in *read;
+ * the caller owns the array. Reading stops at the first item that does not
+ * parse as an integer. A file that cannot be opened is reported through
+ * error(); should error() return, *read is set to 0 and NULL is returned.
+ */
+int *read_tokenized_data(char *filename, size_t *read)
+{
+    size_t size = 512;
+    size_t count = 0;
+    FILE *fp = fopen(filename, "r");
+    if(!fp){
+        fprintf(stderr, "Couldn't open file: %s\n", filename);
+        error("Couldn't open tokenized data file", DARKNET_LOC);
+        *read = 0;
+        return 0;
+    }
+    int* d = (int*)xcalloc(size, sizeof(int));
+    int n;
+    while(fscanf(fp, "%d", &n) == 1){
+        if(count == size){
+            // buffer full: double the capacity before storing the next value
+            size = size*2;
+            d = (int*)xrealloc(d, size * sizeof(int));
+        }
+        d[count++] = n;
+    }
+    fclose(fp);
+    // Shrink to fit. Skipped for an empty file: realloc(p, 0) is
+    // implementation-defined and may return NULL, which must not be treated as a failure.
+    if(count > 0) d = (int*)xrealloc(d, count * sizeof(int));
+    *read = count;
+    return d;
+}
+
+/*
+ * Read `filename` line by line with fgetl().
+ *
+ * Returns a heap array of the line pointers handed back by fgetl() and stores
+ * the number of lines in *read; the caller owns the array. A file that cannot
+ * be opened is reported through error(); should error() return, *read is set
+ * to 0 and NULL is returned.
+ */
+char **read_tokens(char *filename, size_t *read)
+{
+    size_t size = 512;
+    size_t count = 0;
+    FILE *fp = fopen(filename, "r");
+    if(!fp){
+        fprintf(stderr, "Couldn't open file: %s\n", filename);
+        error("Couldn't open token file", DARKNET_LOC);
+        *read = 0;
+        return 0;
+    }
+    char** d = (char**)xcalloc(size, sizeof(char*));
+    char *line;
+    while((line=fgetl(fp)) != 0){
+        if(count == size){
+            // buffer full: double the capacity before storing the next line
+            size = size*2;
+            d = (char**)xrealloc(d, size * sizeof(char*));
+        }
+        d[count++] = line;
+    }
+    fclose(fp);
+    // Shrink to fit. Skipped for an empty file: realloc(p, 0) is
+    // implementation-defined and may return NULL, which must not be treated as a failure.
+    if(count > 0) d = (char**)xrealloc(d, count * sizeof(char*));
+    *read = count;
+    return d;
+}
+
+/*
+ * Build one training batch of one-hot vectors from a token stream.
+ *
+ * tokens     token ids, `len` entries, read circularly
+ * offsets    per-stream read position (`batch` entries); advanced by `steps`
+ * characters size of the one-hot vector; every token id must be in [0, characters)
+ * batch      number of parallel streams
+ * steps      number of consecutive symbols taken from each stream
+ *
+ * Returns p.x (current symbol) and p.y (following symbol), each of
+ * batch*steps*characters floats laid out as [step][stream][character].
+ * The caller frees both buffers. An out-of-range token is reported with
+ * error("Bad char") before any buffer is touched.
+ */
+float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps)
+{
+    // size_t arithmetic so that large batch*steps*characters does not overflow int
+    float* x = (float*)xcalloc((size_t)batch * steps * characters, sizeof(float));
+    float* y = (float*)xcalloc((size_t)batch * steps * characters, sizeof(float));
+    int i,j;
+    for(i = 0; i < batch; ++i){
+        for(j = 0; j < steps; ++j){
+            int curr = tokens[(offsets[i])%len];
+            int next = tokens[(offsets[i] + 1)%len];
+
+            // Validate BEFORE indexing: a bad id would otherwise be written out of bounds.
+            // NOTE(review): assumes error() does not return - confirm in utils.
+            if(curr >= characters || curr < 0 || next >= characters || next < 0){
+                error("Bad char", DARKNET_LOC);
+            }
+
+            size_t row = ((size_t)j*batch + i)*characters;
+            x[row + curr] = 1;
+            y[row + next] = 1;
+
+            offsets[i] = (offsets[i] + 1) % len;
+        }
+    }
+    float_pair p;
+    p.x = x;
+    p.y = y;
+    return p;
+}
+
+/*
+ * Build one training batch of one-hot vectors from raw bytes.
+ *
+ * text       input bytes, `len` entries, read circularly
+ * offsets    per-stream read position (`batch` entries); advanced by `steps`
+ * characters size of the one-hot vector; every byte must be in [1, characters)
+ * batch      number of parallel streams
+ * steps      number of consecutive bytes taken from each stream
+ *
+ * Returns p.x (current byte) and p.y (following byte), each of
+ * batch*steps*characters floats laid out as [step][stream][character].
+ * The caller frees both buffers. A zero byte, or a byte that does not fit in
+ * the one-hot vector, is reported with error("Bad char") before any buffer is touched.
+ */
+float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps)
+{
+    // size_t arithmetic so that large batch*steps*characters does not overflow int
+    float* x = (float*)xcalloc((size_t)batch * steps * characters, sizeof(float));
+    float* y = (float*)xcalloc((size_t)batch * steps * characters, sizeof(float));
+    int i,j;
+    for(i = 0; i < batch; ++i){
+        for(j = 0; j < steps; ++j){
+            unsigned char curr = text[(offsets[i])%len];
+            unsigned char next = text[(offsets[i] + 1)%len];
+
+            // Validate BEFORE indexing. An unsigned char can never exceed 255, so the
+            // meaningful limits are "not zero" and "fits in the one-hot vector".
+            // NOTE(review): assumes error() does not return - confirm in utils.
+            if(curr == 0 || next == 0 || curr >= characters || next >= characters){
+                error("Bad char", DARKNET_LOC);
+            }
+
+            size_t row = ((size_t)j*batch + i)*characters;
+            x[row + curr] = 1;
+            y[row + next] = 1;
+
+            offsets[i] = (offsets[i] + 1) % len;
+        }
+    }
+    float_pair p;
+    p.x = x;
+    p.y = y;
+    return p;
+}
+
+/* Zero the GPU state slice of stream `b` in every layer that has a state_gpu
+ * buffer. Does nothing when built without GPU. */
+void reset_rnn_state(network net, int b)
+{
+#ifdef GPU
+    int idx;
+    for (idx = 0; idx < net.n; ++idx) {
+        layer cur = net.layers[idx];
+        if (!cur.state_gpu) continue;
+        fill_ongpu(cur.outputs, 0, cur.state_gpu + cur.outputs*b, 1);
+    }
+#else
+    (void)net;
+    (void)b;
+#endif
+}
+
+/*
+ * Train a character- or token-level RNN.
+ *
+ * cfgfile    network configuration
+ * weightfile optional initial weights (may be NULL)
+ * filename   training data: raw bytes, or whitespace-separated token ids when `tokenized`
+ * clear      when non-zero, reset the seen/iteration counters of the loaded network
+ * tokenized  selects token mode (read_tokenized_data / get_rnn_token_data)
+ *
+ * Weights are written to backup/ every 10 iterations (<base>.backup), every
+ * 1000 iterations (<base>_<i>.weights) and once at the end (<base>_final.weights).
+ * Unreadable or empty training data is reported through error().
+ */
+void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, int tokenized)
+{
+    srand(time(0));
+    unsigned char *text = 0;
+    int *tokens = 0;
+    size_t size = 0;
+    if(tokenized){
+        tokens = read_tokenized_data(filename, &size);
+    } else {
+        FILE *fp = fopen(filename, "rb");
+        if(!fp){
+            fprintf(stderr, "Couldn't open file: %s\n", filename);
+            error("Couldn't open training text", DARKNET_LOC);
+            return;
+        }
+
+        fseek(fp, 0, SEEK_END);
+        long file_len = ftell(fp);
+        fseek(fp, 0, SEEK_SET);
+        if(file_len < 0){
+            // ftell failed; converting -1 to size_t would request a huge allocation
+            fclose(fp);
+            error("Couldn't determine training text size", DARKNET_LOC);
+            return;
+        }
+        size = (size_t)file_len;
+
+        text = (unsigned char *)xcalloc(size + 1, sizeof(char));
+        size_t got = fread(text, 1, size, fp);
+        fclose(fp);
+        if(got != size){
+            free(text);
+            error("Short read on training text", DARKNET_LOC);
+            return;
+        }
+    }
+    if(size == 0){
+        // stream offsets are drawn with rand_size_t()%size, so empty data cannot be used
+        free(text);
+        free(tokens);
+        error("Training data is empty", DARKNET_LOC);
+        return;
+    }
+
+    char* backup_directory = "backup/";
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+
+    int inputs = get_network_input_size(net);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int batch = net.batch;
+    int steps = net.time_steps;
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
+    int i = (*net.seen)/net.batch;
+
+    int streams = batch/steps;
+    // size_t has no portable "%ld" conversion; print it through unsigned long
+    printf("\n batch = %d, steps = %d, streams = %d, subdivisions = %d, text_size = %lu \n", batch, steps, streams, net.subdivisions, (unsigned long)size);
+    printf(" global_batch = %d \n", batch*net.subdivisions);
+    size_t* offsets = (size_t*)xcalloc(streams, sizeof(size_t));
+    int j;
+    for(j = 0; j < streams; ++j){
+        offsets[j] = rand_size_t()%size;
+    }
+
+    char buff[256];
+    clock_t start;
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        start=clock();
+        float_pair p;
+        if(tokenized){
+            p = get_rnn_token_data(tokens, offsets, inputs, size, streams, steps);
+        }else{
+            p = get_rnn_data(text, offsets, inputs, size, streams, steps);
+        }
+
+        float loss = train_network_datum(net, p.x, p.y) / (batch);
+        free(p.x);
+        free(p.y);
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        int chars = get_current_batch(net)*batch;
+        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-start), (float) chars/size);
+
+        // each stream is restarted at a random position with probability 1/10
+        for(j = 0; j < streams; ++j){
+            if(rand()%10 == 0){
+                offsets[j] = rand_size_t()%size;
+                reset_rnn_state(net, j);
+            }
+        }
+
+        // snprintf: `base` comes from the cfg path and must not overflow buff
+        if(i%1000==0){
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%10==0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+    }
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // release the buffers allocated in this function
+    free(offsets);
+    free(text);
+    free(tokens);
+}
+
+/* Print symbol n: as the n-th token followed by a space when a token table is
+ * given, otherwise as a single character. */
+void print_symbol(int n, char **tokens){
+    if(!tokens){
+        printf("%c", n);
+        return;
+    }
+    printf("%s ", tokens[n]);
+}
+
+/*
+ * Generate `num` symbols from a trained RNN.
+ *
+ * cfgfile/weightfile  network to load (weights optional)
+ * seed                text fed to the network before sampling starts
+ * temp                temperature stored in every layer
+ * rseed               seed for srand()
+ * token_file          optional token table used for printing (may be NULL)
+ *
+ * The seed is echoed, then each sampled symbol is printed and fed back in.
+ * Seed bytes that do not fit in the network input are reported through error().
+ */
+void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float temp, int rseed, char *token_file)
+{
+    char **tokens = 0;
+    if(token_file){
+        size_t n;
+        tokens = read_tokens(token_file, &n);
+    }
+
+    srand(rseed);
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg_custom(cfgfile, 1, 1);  // batch=1, time_steps=1
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int inputs = get_network_input_size(net);
+
+    int i, j;
+    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
+    int c = 0;
+    int len = strlen(seed);
+    float* input = (float*)xcalloc(inputs, sizeof(float));
+
+    // Prime the network with every seed symbol except the last one.
+    for(i = 0; i < len-1; ++i){
+        // plain char may be signed: go through unsigned char so the index is never negative
+        c = (unsigned char)seed[i];
+        if(c >= inputs) error("Seed character out of range", DARKNET_LOC);
+        input[c] = 1;
+        network_predict(net, input);
+        input[c] = 0;
+        print_symbol(c, tokens);
+    }
+    if(len) c = (unsigned char)seed[len-1];
+    if(c >= inputs) error("Seed character out of range", DARKNET_LOC);
+    print_symbol(c, tokens);
+
+    // Sampling loop: the last symbol goes in, the next one is drawn from the output.
+    for(i = 0; i < num; ++i){
+        input[c] = 1;
+        float *out = network_predict(net, input);
+        input[c] = 0;
+        // drop very unlikely symbols before sampling
+        for(j = 0; j < inputs; ++j){
+            if (out[j] < .0001) out[j] = 0;
+        }
+        c = sample_array(out, inputs);
+        print_symbol(c, tokens);
+    }
+    printf("\n");
+    free(input);
+}
+
+/*
+ * Continue the text read from stdin with up to `num` sampled symbols.
+ *
+ * cfgfile/weightfile  network to load (weights optional)
+ * temp                temperature stored in every layer
+ * rseed               seed for srand()
+ * token_file          optional token table used for printing (may be NULL)
+ *
+ * All of stdin is fed to the network first. Sampling stops early when a '.'
+ * is followed by a sampled newline. With empty stdin there is no prediction
+ * to continue from, so the function reports that and returns.
+ */
+void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int rseed, char *token_file)
+{
+    char **tokens = 0;
+    if(token_file){
+        size_t n;
+        tokens = read_tokens(token_file, &n);
+    }
+
+    srand(rseed);
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int inputs = get_network_input_size(net);
+
+    int i, j;
+    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
+    int c = 0;
+    int ch;
+    float* input = (float*)xcalloc(inputs, sizeof(float));
+    float *out = 0;
+
+    // Read into `ch` so that `c` still holds the last real character after EOF;
+    // the "'.' then newline" stop test below depends on it.
+    while((ch = getc(stdin)) != EOF){
+        if(ch >= inputs) error("Out of range character", DARKNET_LOC);
+        c = ch;
+        input[c] = 1;
+        out = network_predict(net, input);
+        input[c] = 0;
+    }
+    if(!out){
+        // nothing was read: there is no network output to sample from
+        fprintf(stderr, "No input on stdin, nothing to continue from\n");
+        free(input);
+        return;
+    }
+    for(i = 0; i < num; ++i){
+        // drop very unlikely symbols before sampling
+        for(j = 0; j < inputs; ++j){
+            if (out[j] < .0001) out[j] = 0;
+        }
+        int next = sample_array(out, inputs);
+        if(c == '.' && next == '\n') break;
+        c = next;
+        print_symbol(c, tokens);
+
+        input[c] = 1;
+        out = network_predict(net, input);
+        input[c] = 0;
+    }
+    printf("\n");
+    free(input);
+}
+
+/*
+ * Report running perplexity of the network on the text read from stdin,
+ * counting only characters inside a region that opens with ">>" and closes
+ * with ".\n".
+ *
+ * cfgfile/weightfile  network to load (weights optional)
+ * seed                text fed to the network before evaluation starts
+ *
+ * Characters that do not fit in the network input are reported through error().
+ */
+void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed)
+{
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int inputs = get_network_input_size(net);
+
+    int count = 0;
+    int words = 1;
+    int c;
+    int len = strlen(seed);
+    float* input = (float*)xcalloc(inputs, sizeof(float));
+    int i;
+    for(i = 0; i < len; ++i){
+        // plain char may be signed: go through unsigned char so the index is never negative
+        c = (unsigned char)seed[i];
+        if(c >= inputs) error("Out of range character", DARKNET_LOC);
+        input[c] = 1;
+        network_predict(net, input);
+        input[c] = 0;
+    }
+    float sum = 0;
+    c = getc(stdin);
+    if(c != EOF && c >= inputs) error("Out of range character", DARKNET_LOC);
+    float ln2 = log(2);
+    int in = 0;
+    while(c != EOF){
+        int next = getc(stdin);
+        if(next == EOF) break;
+        // `next` indexes both `out` now and `input` on the following iteration
+        if(next < 0 || next >= 255 || next >= inputs) error("Out of range character", DARKNET_LOC);
+
+        input[c] = 1;
+        float *out = network_predict(net, input);
+        input[c] = 0;
+
+        if(c == '.' && next == '\n') in = 0;
+        if(!in) {
+            if(c == '>' && next == '>'){
+                in = 1;
+                ++words;
+            }
+            c = next;
+            continue;
+        }
+        ++count;
+        sum += log(out[next])/ln2;
+        c = next;
+        printf("%d %d Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, words, pow(2, -sum/count), pow(2, -sum/words));
+    }
+    free(input);
+}
+
+/*
+ * Report running character and word perplexity of the network on the text
+ * read from stdin. A new word is counted at every space, newline or tab.
+ *
+ * cfgfile/weightfile  network to load (weights optional)
+ * seed                text fed to the network before evaluation starts
+ *
+ * Characters that do not fit in the network input are reported through error().
+ */
+void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
+{
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int inputs = get_network_input_size(net);
+
+    int count = 0;
+    int words = 1;
+    int c;
+    int len = strlen(seed);
+    float* input = (float*)xcalloc(inputs, sizeof(float));
+    int i;
+    for(i = 0; i < len; ++i){
+        // plain char may be signed: go through unsigned char so the index is never negative
+        c = (unsigned char)seed[i];
+        if(c >= inputs) error("Out of range character", DARKNET_LOC);
+        input[c] = 1;
+        network_predict(net, input);
+        input[c] = 0;
+    }
+    float sum = 0;
+    c = getc(stdin);
+    if(c != EOF && c >= inputs) error("Out of range character", DARKNET_LOC);
+    float ln2 = log(2);
+    while(c != EOF){
+        int next = getc(stdin);
+        if(next == EOF) break;
+        // `next` indexes both `out` now and `input` on the following iteration
+        if(next < 0 || next >= 255 || next >= inputs) error("Out of range character", DARKNET_LOC);
+        ++count;
+        if(next == ' ' || next == '\n' || next == '\t') ++words;
+        input[c] = 1;
+        float *out = network_predict(net, input);
+        input[c] = 0;
+        sum += log(out[next])/ln2;
+        c = next;
+        printf("%d Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, pow(2, -sum/count), pow(2, -sum/words));
+    }
+    free(input);
+}
+
+/*
+ * For every line on stdin: reset the RNN state, feed the seed, the stripped
+ * line and a trailing space, then print the line followed by the outputs of
+ * layer 0 as comma-separated values.
+ *
+ * cfgfile/weightfile  network to load (weights optional)
+ * seed                text fed to the network before each line
+ *
+ * Characters that do not fit in the network input are reported through error().
+ */
+void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
+{
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    int inputs = get_network_input_size(net);
+
+    int c;
+    int seed_len = strlen(seed);
+    float* input = (float*)xcalloc(inputs, sizeof(float));
+    int i;
+    char *line;
+    while((line=fgetl(stdin)) != 0){
+        reset_rnn_state(net, 0);
+        for(i = 0; i < seed_len; ++i){
+            // plain char may be signed: go through unsigned char so the index is never negative
+            c = (unsigned char)seed[i];
+            if(c >= inputs) error("Out of range character", DARKNET_LOC);
+            input[c] = 1;
+            network_predict(net, input);
+            input[c] = 0;
+        }
+        strip(line);
+        int str_len = strlen(line);
+        for(i = 0; i < str_len; ++i){
+            c = (unsigned char)line[i];
+            if(c >= inputs) error("Out of range character", DARKNET_LOC);
+            input[c] = 1;
+            network_predict(net, input);
+            input[c] = 0;
+        }
+        c = ' ';
+        if(c >= inputs) error("Out of range character", DARKNET_LOC);
+        input[c] = 1;
+        network_predict(net, input);
+        input[c] = 0;
+
+        layer l = net.layers[0];
+        #ifdef GPU
+        cuda_pull_array(l.output_gpu, l.output, l.outputs);
+        #endif
+        printf("%s", line);
+        for(i = 0; i < l.outputs; ++i){
+            printf(",%g", l.output[i]);
+        }
+        printf("\n");
+        // NOTE(review): fgetl() is assumed to return a fresh heap buffer per call
+        // (read_tokens keeps every returned pointer) - confirm in utils before relying on it.
+        free(line);
+    }
+    free(input);
+}
+
+/*
+ * Command-line entry point for the RNN tools.
+ *
+ * argv[2] selects the mode (train, valid, validtactic, vec, generate,
+ * generatetactic), argv[3] is the cfg file and argv[4] the optional weights.
+ * Optional flags: -file, -seed, -len, -temp, -srand, -clear, -tokenized, -tokens.
+ * The usage line is printed when arguments are missing or the mode is unknown.
+ */
+void run_char_rnn(int argc, char **argv)
+{
+    // the list below mirrors the strcmp chain at the end of this function
+    const char *usage = "usage: %s %s [train/valid/validtactic/vec/generate/generatetactic] [cfg] [weights (optional)]\n";
+    if(argc < 4){
+        fprintf(stderr, usage, argv[0], argv[1]);
+        return;
+    }
+    char *filename = find_char_arg(argc, argv, "-file", "data/shakespeare.txt");
+    char *seed = find_char_arg(argc, argv, "-seed", "\n\n");
+    int len = find_int_arg(argc, argv, "-len", 1000);
+    float temp = find_float_arg(argc, argv, "-temp", .7);
+    int rseed = find_int_arg(argc, argv, "-srand", time(0));
+    int clear = find_arg(argc, argv, "-clear");
+    int tokenized = find_arg(argc, argv, "-tokenized");
+    char *tokens = find_char_arg(argc, argv, "-tokens", 0);
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    if(0==strcmp(argv[2], "train")) train_char_rnn(cfg, weights, filename, clear, tokenized);
+    else if(0==strcmp(argv[2], "valid")) valid_char_rnn(cfg, weights, seed);
+    else if(0==strcmp(argv[2], "validtactic")) valid_tactic_rnn(cfg, weights, seed);
+    else if(0==strcmp(argv[2], "vec")) vec_char_rnn(cfg, weights, seed);
+    else if(0==strcmp(argv[2], "generate")) test_char_rnn(cfg, weights, len, seed, temp, rseed, tokens);
+    else if(0==strcmp(argv[2], "generatetactic")) test_tactic_rnn(cfg, weights, len, temp, rseed, tokens);
+    else {
+        // an unrecognised mode used to be ignored without any message
+        fprintf(stderr, "Unknown mode: %s\n", argv[2]);
+        fprintf(stderr, usage, argv[0], argv[1]);
+    }
+}
diff --git a/darknet-master/src/rnn_layer.c b/darknet-master/src/rnn_layer.c
new file mode 100644
index 0000000..98f0d48
--- /dev/null
+++ b/darknet-master/src/rnn_layer.c
@@ -0,0 +1,289 @@
+#include "rnn_layer.h"
+#include "connected_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Shift the per-step buffer pointers of *l by `steps` time steps
+ * (outputs*batch floats per step); a negative `steps` moves backwards. */
+static void increment_layer(layer *l, int steps)
+{
+    const int shift = l->outputs*l->batch*steps;
+
+    l->output = l->output + shift;
+    l->delta = l->delta + shift;
+    l->x = l->x + shift;
+    l->x_norm = l->x_norm + shift;
+
+#ifdef GPU
+    l->output_gpu = l->output_gpu + shift;
+    l->delta_gpu = l->delta_gpu + shift;
+    l->x_gpu = l->x_gpu + shift;
+    l->x_norm_gpu = l->x_norm_gpu + shift;
+#endif
+}
+
+/*
+ * Build an RNN layer out of three connected sub-layers:
+ *   input_layer  : inputs -> hidden
+ *   self_layer   : hidden -> hidden (LOGGY when log == 2, LOGISTIC when log == 1,
+ *                  otherwise `activation`)
+ *   output_layer : hidden -> outputs
+ * `batch` is divided by `steps` before use. The layer's output/delta pointers
+ * alias those of output_layer, and the state buffer holds steps+1 slots of
+ * batch*hidden floats.
+ */
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log)
+{
+    ACTIVATION self_activation = activation;
+    layer l = { (LAYER_TYPE)0 };
+    int state_len;
+
+    fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
+
+    if (log == 2) self_activation = LOGGY;
+    else if (log == 1) self_activation = LOGISTIC;
+
+    batch = batch / steps;
+    state_len = batch * hidden * (steps + 1);
+
+    l.batch = batch;
+    l.type = RNN;
+    l.steps = steps;
+    l.hidden = hidden;
+    l.inputs = inputs;
+    l.out_w = 1;
+    l.out_h = 1;
+    l.out_c = outputs;
+
+    l.state = (float*)xcalloc(state_len, sizeof(float));
+
+    /* each sub-layer is created in turn; l.workspace_size tracks the largest workspace seen */
+    l.input_layer = (layer*)xcalloc(1, sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.input_layer) = make_connected_layer(batch, steps, inputs, hidden, activation, batch_normalize);
+    l.input_layer->batch = batch;
+    if (l.input_layer->workspace_size > l.workspace_size) l.workspace_size = l.input_layer->workspace_size;
+
+    l.self_layer = (layer*)xcalloc(1, sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.self_layer) = make_connected_layer(batch, steps, hidden, hidden, self_activation, batch_normalize);
+    l.self_layer->batch = batch;
+    if (l.self_layer->workspace_size > l.workspace_size) l.workspace_size = l.self_layer->workspace_size;
+
+    l.output_layer = (layer*)xcalloc(1, sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.output_layer) = make_connected_layer(batch, steps, hidden, outputs, activation, batch_normalize);
+    l.output_layer->batch = batch;
+    if (l.output_layer->workspace_size > l.workspace_size) l.workspace_size = l.output_layer->workspace_size;
+
+    l.outputs = outputs;
+    l.output = l.output_layer->output;
+    l.delta = l.output_layer->delta;
+
+    l.forward = forward_rnn_layer;
+    l.backward = backward_rnn_layer;
+    l.update = update_rnn_layer;
+#ifdef GPU
+    l.forward_gpu = forward_rnn_layer_gpu;
+    l.backward_gpu = backward_rnn_layer_gpu;
+    l.update_gpu = update_rnn_layer_gpu;
+    l.state_gpu = cuda_make_array(l.state, state_len);
+    l.output_gpu = l.output_layer->output_gpu;
+    l.delta_gpu = l.output_layer->delta_gpu;
+#endif
+
+    return l;
+}
+
+/* CPU weight update: forwards the call to the input, self and output sub-layers, in that order. */
+void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    layer *parts[3];
+    int k;
+
+    parts[0] = l.input_layer;
+    parts[1] = l.self_layer;
+    parts[2] = l.output_layer;
+    for (k = 0; k < 3; ++k) {
+        update_connected_layer(*parts[k], batch, learning_rate, momentum, decay);
+    }
+}
+
+void forward_rnn_layer(layer l, network_state state)  // CPU forward pass over all l.steps time steps; l and state are by-value copies, so the pointer bumps below stay local
+{
+    network_state s = {0};  // private state handed to the three connected sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_layer = *(l.input_layer);  // local copies: their buffer pointers are advanced one step at a time by increment_layer
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);  // clear the sub-layer deltas of every step
+    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
+    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
+    if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);  // training starts from a zeroed first state slot
+
+    for (i = 0; i < l.steps; ++i) {
+
+        s.input = state.input;  // input_layer reads the current step's slice of the layer input
+        forward_connected_layer(input_layer, s);
+
+        s.input = l.state;  // self_layer reads the current hidden state
+        forward_connected_layer(self_layer, s);
+
+        float *old_state = l.state;
+        if(state.train) l.state += l.hidden*l.batch;  // training moves on to the next state slot; otherwise the same slot is rewritten
+        if(l.shortcut){
+            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);  // shortcut: the new state starts from the previous one
+        }else{
+            fill_cpu(l.hidden * l.batch, 0, l.state, 1);  // no shortcut: the new state starts from zero
+        }
+        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);  // state += input_layer output
+        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);  // state += self_layer output
+
+        s.input = l.state;  // output_layer reads the freshly built state
+        forward_connected_layer(output_layer, s);
+
+        state.input += l.inputs*l.batch;  // advance to the next step's input slice
+        increment_layer(&input_layer, 1);
+        increment_layer(&self_layer, 1);
+        increment_layer(&output_layer, 1);
+    }
+}
+
+void backward_rnn_layer(layer l, network_state state)  // CPU backward pass, walking the time steps from last to first; l and state are by-value copies
+{
+    network_state s = {0};  // private state handed to the three connected sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_layer = *(l.input_layer);  // local copies whose buffer pointers are moved with increment_layer
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+    increment_layer(&input_layer, l.steps-1);  // point the sub-layer buffers at the last time step
+    increment_layer(&self_layer, l.steps-1);
+    increment_layer(&output_layer, l.steps-1);
+
+    l.state += l.hidden*l.batch*l.steps;  // state slot number l.steps (the buffer has steps+1 slots)
+    for (i = l.steps-1; i >= 0; --i) {
+        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);  // rewrite this slot as input_layer output + self_layer output
+        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+
+        s.input = l.state;
+        s.delta = self_layer.delta;  // output_layer's input gradient is routed into self_layer.delta
+        backward_connected_layer(output_layer, s);
+
+        l.state -= l.hidden*l.batch;  // step back to the previous state slot, which was self_layer's input
+        /*
+           if(i > 0){
+           copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           }else{
+           fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+           }
+         */
+
+        s.input = l.state;
+        s.delta = self_layer.delta - l.hidden*l.batch;  // self_layer's input gradient is routed into the previous step's self_layer.delta
+        if (i == 0) s.delta = 0;  // the first step has no previous step to receive it
+        backward_connected_layer(self_layer, s);
+
+        copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);  // input_layer uses the same delta as self_layer
+        if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);  // shortcut: also add this delta to the previous step's
+        s.input = state.input + i*l.inputs*l.batch;  // slice of the layer input for step i
+        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;  // matching slice of the upstream delta, when one is requested
+        else s.delta = 0;
+        backward_connected_layer(input_layer, s);
+
+        increment_layer(&input_layer, -1);  // move the sub-layer buffers back one time step
+        increment_layer(&self_layer, -1);
+        increment_layer(&output_layer, -1);
+    }
+}
+
+#ifdef GPU
+
+/* Calls pull_connected_layer on the input, self and output sub-layers, in that order. */
+void pull_rnn_layer(layer l)
+{
+    layer *parts[3];
+    int k;
+
+    parts[0] = l.input_layer;
+    parts[1] = l.self_layer;
+    parts[2] = l.output_layer;
+    for (k = 0; k < 3; ++k) {
+        pull_connected_layer(*parts[k]);
+    }
+}
+
+/* Calls push_connected_layer on the input, self and output sub-layers, in that order. */
+void push_rnn_layer(layer l)
+{
+    layer *parts[3];
+    int k;
+
+    parts[0] = l.input_layer;
+    parts[1] = l.self_layer;
+    parts[2] = l.output_layer;
+    for (k = 0; k < 3; ++k) {
+        push_connected_layer(*parts[k]);
+    }
+}
+
+/* GPU weight update: forwards the call to the input, self and output sub-layers, in that order. */
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
+{
+    layer *parts[3];
+    int k;
+
+    parts[0] = l.input_layer;
+    parts[1] = l.self_layer;
+    parts[2] = l.output_layer;
+    for (k = 0; k < 3; ++k) {
+        update_connected_layer_gpu(*parts[k], batch, learning_rate, momentum, decay, loss_scale);
+    }
+}
+
+void forward_rnn_layer_gpu(layer l, network_state state)  // GPU forward pass over all l.steps time steps; l and state are by-value copies, so the pointer bumps below stay local
+{
+    network_state s = {0};  // private state handed to the three connected sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_layer = *(l.input_layer);  // local copies: their buffer pointers are advanced one step at a time by increment_layer
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+
+    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);  // clear the sub-layer deltas of every step
+    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+    if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);  // training starts from a zeroed first state slot
+
+    for (i = 0; i < l.steps; ++i) {
+
+        s.input = state.input;  // input_layer reads the current step's slice of the layer input
+        forward_connected_layer_gpu(input_layer, s);
+
+        s.input = l.state_gpu;  // self_layer reads the current hidden state
+        forward_connected_layer_gpu(self_layer, s);
+
+        float *old_state = l.state_gpu;
+        if(state.train) l.state_gpu += l.hidden*l.batch;  // training moves on to the next state slot; otherwise the same slot is rewritten
+        if(l.shortcut){
+            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);  // shortcut: the new state starts from the previous one
+        }else{
+            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);  // no shortcut: the new state starts from zero
+        }
+        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);  // state += input_layer output
+        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);  // state += self_layer output
+
+        s.input = l.state_gpu;  // output_layer reads the freshly built state
+        forward_connected_layer_gpu(output_layer, s);
+
+        state.input += l.inputs*l.batch;  // advance to the next step's input slice
+        increment_layer(&input_layer, 1);
+        increment_layer(&self_layer, 1);
+        increment_layer(&output_layer, 1);
+    }
+}
+
+void backward_rnn_layer_gpu(layer l, network_state state)  // GPU backward pass, walking the time steps from last to first; l and state are by-value copies
+{
+    network_state s = {0};  // private state handed to the three connected sub-layers
+    s.train = state.train;
+    s.workspace = state.workspace;
+    int i;
+    layer input_layer = *(l.input_layer);  // local copies whose buffer pointers are moved with increment_layer
+    layer self_layer = *(l.self_layer);
+    layer output_layer = *(l.output_layer);
+    increment_layer(&input_layer,  l.steps - 1);  // point the sub-layer buffers at the last time step
+    increment_layer(&self_layer,   l.steps - 1);
+    increment_layer(&output_layer, l.steps - 1);
+    l.state_gpu += l.hidden*l.batch*l.steps;  // state slot number l.steps (the buffer has steps+1 slots)
+    for (i = l.steps-1; i >= 0; --i) {
+
+        s.input = l.state_gpu;  // unlike the CPU version, the stored state slot is used as-is
+        s.delta = self_layer.delta_gpu;  // output_layer's input gradient is routed into self_layer.delta_gpu
+        backward_connected_layer_gpu(output_layer, s);
+
+        l.state_gpu -= l.hidden*l.batch;  // step back to the previous state slot, which was self_layer's input
+
+        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);    // the same delta for Input and Self layers
+
+        s.input = l.state_gpu;
+        s.delta = self_layer.delta_gpu - l.hidden*l.batch;  // self_layer's input gradient is routed into the previous step's delta
+        if (i == 0) s.delta = 0;  // the first step has no previous step to receive it
+        backward_connected_layer_gpu(self_layer, s);
+
+        //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
+        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);  // shortcut: also add this delta to the previous step's
+        s.input = state.input + i*l.inputs*l.batch;  // slice of the layer input for step i
+        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;  // matching slice of the upstream delta, when one is requested
+        else s.delta = 0;
+        backward_connected_layer_gpu(input_layer, s);
+
+        increment_layer(&input_layer,  -1);  // move the sub-layer buffers back one time step
+        increment_layer(&self_layer,   -1);
+        increment_layer(&output_layer, -1);
+    }
+}
+#endif
diff --git a/darknet-master/src/rnn_layer.h b/darknet-master/src/rnn_layer.h
new file mode 100644
index 0000000..a2aa0f9
--- /dev/null
+++ b/darknet-master/src/rnn_layer.h
@@ -0,0 +1,31 @@
+
+#ifndef RNN_LAYER_H
+#define RNN_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+// NOTE(review): USET is defined without a value and is not referenced in this header - confirm it is still needed.
+#define USET
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Constructor for the RNN layer.
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log);
+
+// CPU forward, backward and weight-update entry points.
+void forward_rnn_layer(layer l, network_state state);
+void backward_rnn_layer(layer l, network_state state);
+void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+
+#ifdef GPU
+// GPU counterparts; only declared when the build defines GPU.
+void forward_rnn_layer_gpu(layer l, network_state state);
+void backward_rnn_layer_gpu(layer l, network_state state);
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+void push_rnn_layer(layer l);
+void pull_rnn_layer(layer l);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/rnn_vid.c b/darknet-master/src/rnn_vid.c
new file mode 100644
index 0000000..c521c75
--- /dev/null
+++ b/darknet-master/src/rnn_vid.c
@@ -0,0 +1,208 @@
+#include "network.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "blas.h"
+
+#ifdef OPENCV
+void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters);
+
+
+// Pair of float buffers returned by get_rnn_vid_data: x holds the inputs, y the targets.
+// Both pointers refer to the same allocation (y is an offset into x), so only x is freed.
+typedef struct {
+    float *x;
+    float *y;
+} float_pair;
+
+// Builds one batch of feature sequences for training the video RNN.
+//
+// For each of the `batch` samples a random video is picked from `files` (n entries),
+// a random start frame is chosen, net.batch consecutive frames are resized to
+// net.w x net.h and run through `net`. The features are stored step-major:
+// step i of sample b lives at feats[(b + i*batch)*output_size].
+//
+// Returns x = feats and y = feats advanced by one step (batch*output_size floats).
+// Both point into the same allocation, so the caller frees only p.x.
+// Requires net.batch == steps + 1 (asserted).
+// Videos with fewer than steps + 4 frames are skipped and another one is drawn.
+float_pair get_rnn_vid_data(network net, char **files, int n, int batch, int steps)
+{
+    int b;
+    assert(net.batch == steps + 1);
+    image out_im = get_network_image(net);
+    int output_size = out_im.w*out_im.h*out_im.c;
+    printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
+    float* feats = (float*)xcalloc(net.batch * batch * output_size, sizeof(float));
+    for(b = 0; b < batch; ++b){
+        int input_size = net.w*net.h*net.c;
+        char *filename = files[rand()%n];
+        cap_cv *cap = get_capture_video_stream(filename);
+        int frames = get_capture_frame_count_cv(cap);
+        if (frames < (steps + 4)){
+            // Too short: release the capture (previously leaked) and retry this slot.
+            release_capture(cap);
+            --b;
+            continue;
+        }
+
+        // Chosen only after the length check, so the modulus is always positive
+        // (the original computed it first and could divide by zero on short videos).
+        int index = rand() % (frames - steps - 2);
+        float* input = (float*)xcalloc(input_size * net.batch, sizeof(float));
+
+        printf("frames: %d, index: %d\n", frames, index);
+        set_capture_position_frame_cv(cap, index);
+
+        int i;
+        for(i = 0; i < net.batch; ++i){
+            // NOTE(review): `src` is never released here - confirm who owns the frame.
+            mat_cv *src = get_capture_frame_cv(cap);
+            image im = mat_to_image_cv(src);
+            rgbgr_image(im);
+            image re = resize_image(im, net.w, net.h);
+            memcpy(input + i*input_size, re.data, input_size*sizeof(float));
+            free_image(im);
+            free_image(re);
+        }
+        float *output = network_predict(net, input);
+
+        free(input);
+
+        // Scatter the per-frame outputs into the step-major layout.
+        for(i = 0; i < net.batch; ++i){
+            memcpy(feats + (b + i*batch)*output_size, output + i*output_size, output_size*sizeof(float));
+        }
+
+        release_capture(cap);
+    }
+
+    float_pair p = {0};
+    p.x = feats;
+    p.y = feats + output_size*batch; // targets are the inputs shifted by one step
+
+    return p;
+}
+
+
+// Trains the video RNN described by `cfgfile`, optionally starting from `weightfile`.
+//
+// Video paths are read from data/vid/train.txt. Frames are turned into features by a
+// fixed extractor network (cfg/extractor.cfg + trained/yolo-coco.conv) and the RNN is
+// trained on them one batch per iteration. Weights are written to backup/ every 10
+// iterations (<base>.backup), every 100 iterations (<base>_<i>.weights) and at the end
+// (<base>_final.weights).
+void train_vid_rnn(char *cfgfile, char *weightfile)
+{
+    char *train_videos = "data/vid/train.txt";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+
+    list *plist = get_paths(train_videos);
+    int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+    clock_t time;
+    int steps = net.time_steps;
+    int batch = net.batch / net.time_steps;
+
+    network extractor = parse_network_cfg("cfg/extractor.cfg");
+    load_weights(&extractor, "trained/yolo-coco.conv");
+
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps);
+
+        float loss = train_network_datum(net, p.x, p.y) / (net.batch);
+
+
+        // p.y points into the same allocation as p.x, so one free releases both.
+        free(p.x);
+        // The first iteration seeds the running average with the current loss.
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
+        // snprintf bounds the write: `base` comes from the cfg path and may be long.
+        if(i%100==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%10==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+}
+
+
+// Reconstructs a picture from the feature vector `feat` using `net`, saves it under the
+// name "<name><i>" and returns it. The reconstruction starts from a copy of *init when
+// `init` is given, otherwise from a random net.w x net.h x 3 image.
+// The caller owns the returned image.
+image save_reconstruction(network net, image *init, float *feat, char *name, int i)
+{
+    char out_name[256];
+    image result = init ? copy_image(*init) : make_random_image(net.w, net.h, 3);
+    image step_buf = make_image(net.w, net.h, 3);
+
+    reconstruct_picture(net, feat, result, step_buf, .01, .9, .1, 2, 50);
+    free_image(step_buf);
+
+    sprintf(out_name, "%s%d", name, i);
+    save_image(result, out_name);
+    return result;
+}
+
+// Generates frames with the video RNN described by `cfgfile` / `weightfile`.
+//
+// The first 25 frames of a hard-coded validation clip are passed through the extractor
+// network; the RNN predicts the next feature vector from each, and both the extracted
+// ("feat<i>") and predicted ("next<i>") features are reconstructed to images. Then the
+// RNN is fed its own prediction for 30 more steps, each reconstruction ("newimage<i>")
+// starting from the previous image.
+void generate_vid_rnn(char *cfgfile, char *weightfile)
+{
+    // Length of one feature vector as used by every statistic below.
+    const int feat_size = 14*14*512;
+
+    network extractor = parse_network_cfg("cfg/extractor.recon.cfg");
+    load_weights(&extractor, "trained/yolo-coco.conv");
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&extractor, 1);
+    set_batch_network(&net, 1);
+
+    int i;
+    cap_cv *cap = get_capture_video_stream("extra/vid/ILSVRC2015/Data/VID/snippets/val/ILSVRC2015_val_00007030.mp4");
+    float *feat;
+    float *next;
+    next = NULL;
+    image last = {0};
+    for(i = 0; i < 25; ++i){
+        image im = get_image_from_stream_cpp(cap);
+        image re = resize_image(im, extractor.w, extractor.h);
+        feat = network_predict(extractor, re.data);
+        if(i > 0){
+            // Compare the previous prediction with the features actually observed.
+            printf("%f %f\n", mean_array(feat, feat_size), variance_array(feat, feat_size));
+            printf("%f %f\n", mean_array(next, feat_size), variance_array(next, feat_size));
+            printf("%f\n", mse_array(feat, feat_size));
+            axpy_cpu(feat_size, -1, feat, 1, next, 1);
+            printf("%f\n", mse_array(next, feat_size));
+        }
+        next = network_predict(net, feat);
+
+        free_image(im);
+
+        free_image(save_reconstruction(extractor, 0, feat, "feat", i));
+        free_image(save_reconstruction(extractor, 0, next, "next", i));
+        // Keep the last resized frame as the starting image of the free-running phase.
+        if (i==24) last = copy_image(re);
+        free_image(re);
+    }
+    for(i = 0; i < 30; ++i){
+        next = network_predict(net, next);
+        image newimage = save_reconstruction(extractor, &last, next, "newimage", i);
+        free_image(last);
+        last = newimage;
+    }
+
+    // Release what the original left allocated: the final image and the capture handle.
+    free_image(last);
+    release_capture(cap);
+}
+
+// Command-line entry point for the video RNN tools.
+// Expects: <prog> <tool> <train|generate> <cfg> [weights]
+// Prints a usage line when too few arguments are given and an error for an unknown
+// sub-command (previously ignored silently).
+void run_vid_rnn(int argc, char **argv)
+{
+    if(argc < 4){
+        // The usage text now lists the sub-commands that are actually handled below.
+        fprintf(stderr, "usage: %s %s [train/generate] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    if(0==strcmp(argv[2], "train")) train_vid_rnn(cfg, weights);
+    else if(0==strcmp(argv[2], "generate")) generate_vid_rnn(cfg, weights);
+    else fprintf(stderr, "%s %s: unknown command '%s' (expected train or generate)\n", argv[0], argv[1], argv[2]);
+}
+#else
+void run_vid_rnn(int argc, char **argv){} // no-op stub used when OPENCV is not defined
+#endif
diff --git a/darknet-master/src/route_layer.c b/darknet-master/src/route_layer.c
new file mode 100644
index 0000000..23dfa04
--- /dev/null
+++ b/darknet-master/src/route_layer.c
@@ -0,0 +1,161 @@
+#include "route_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <stdio.h>
+
+// Creates a ROUTE layer that reads from the `n` layers listed in `input_layers`
+// (their per-sample sizes are in `input_sizes`). The output size is the sum of the
+// input sizes divided by `groups`; `group_id` selects which slice is used.
+// The two arrays are stored in the layer, not copied. The list of input layer
+// indices is logged to stderr.
+route_layer make_route_layer(int batch, int n, int *input_layers, int *input_sizes, int groups, int group_id)
+{
+    int k;
+    int total = 0;
+    route_layer l = { (LAYER_TYPE)0 };
+
+    fprintf(stderr,"route ");
+    for (k = 0; k < n; ++k) {
+        fprintf(stderr," %d", input_layers[k]);
+        total += input_sizes[k];
+    }
+    total = total / groups;
+
+    l.type = ROUTE;
+    l.batch = batch;
+    l.n = n;
+    l.groups = groups;
+    l.group_id = group_id;
+    l.input_layers = input_layers;
+    l.input_sizes = input_sizes;
+    l.wait_stream_id = -1;
+    l.inputs = total;
+    l.outputs = total;
+
+    l.output = (float*)xcalloc(total * batch, sizeof(float));
+    l.delta = (float*)xcalloc(total * batch, sizeof(float));
+
+    l.forward = forward_route_layer;
+    l.backward = backward_route_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_route_layer_gpu;
+    l.backward_gpu = backward_route_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, total*batch);
+    l.output_gpu = cuda_make_array(l.output, total*batch);
+    #endif
+    return l;
+}
+
+// Recomputes the route layer's output shape and buffers from the current shapes of its
+// input layers in `net`. All inputs must share the first input's out_w/out_h; otherwise
+// error() is called.
+void resize_route_layer(route_layer *l, network *net)
+{
+    int i;
+    // The first input defines the spatial size; channels and outputs are accumulated on top.
+    layer first = net->layers[l->input_layers[0]];
+    l->out_w = first.out_w;
+    l->out_h = first.out_h;
+    l->out_c = first.out_c;
+    l->outputs = first.outputs;
+    l->input_sizes[0] = first.outputs;
+    for(i = 1; i < l->n; ++i){
+        int index = l->input_layers[i];
+        layer next = net->layers[index];
+        l->outputs += next.outputs;
+        l->input_sizes[i] = next.outputs;
+        if(next.out_w == first.out_w && next.out_h == first.out_h){
+            l->out_c += next.out_c;
+        }else{
+            printf("Error: Different size of input layers: %d x %d, %d x %d\n", next.out_w, next.out_h, first.out_w, first.out_h);
+            error("Error!", DARKNET_LOC);
+        }
+    }
+    // Only one of the `groups` slices is routed, so channels and outputs shrink accordingly.
+    l->out_c = l->out_c / l->groups;
+    l->outputs = l->outputs / l->groups;
+    l->inputs = l->outputs;
+    l->delta = (float*)xrealloc(l->delta, l->outputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, l->outputs * l->batch * sizeof(float));
+
+#ifdef GPU
+    // The device buffers are recreated at the new size.
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+
+}
+
+// CPU forward pass: for every input layer, copies the slice selected by l.group_id
+// (1/l.groups of that layer's per-sample output) into l.output, placing the slices of
+// the different inputs one after another within each sample.
+void forward_route_layer(const route_layer l, network_state state)
+{
+    int src, b;
+    int dst_offset = 0;
+    for (src = 0; src < l.n; ++src) {
+        int full_size = l.input_sizes[src];
+        int slice = full_size / l.groups;
+        float *from = state.net.layers[l.input_layers[src]].output;
+        for (b = 0; b < l.batch; ++b) {
+            float *src_ptr = from + b*full_size + slice*l.group_id;
+            float *dst_ptr = l.output + dst_offset + b*l.outputs;
+            copy_cpu(slice, src_ptr, 1, dst_ptr, 1);
+        }
+        dst_offset += slice;
+    }
+}
+
+// CPU backward pass: adds each slice of l.delta back onto the matching slice
+// (selected by l.group_id) of the corresponding input layer's delta.
+void backward_route_layer(const route_layer l, network_state state)
+{
+    int src, b;
+    int grad_offset = 0;
+    for (src = 0; src < l.n; ++src) {
+        int full_size = l.input_sizes[src];
+        int slice = full_size / l.groups;
+        float *to = state.net.layers[l.input_layers[src]].delta;
+        for (b = 0; b < l.batch; ++b) {
+            float *grad = l.delta + grad_offset + b*l.outputs;
+            float *dst = to + b*full_size + slice*l.group_id;
+            axpy_cpu(slice, 1, grad, 1, dst, 1);
+        }
+        grad_offset += slice;
+    }
+}
+
+#ifdef GPU
+// GPU forward pass: same slice copy as the CPU version, on the device buffers.
+// Before copying, switches to l.stream and waits on l.wait_stream_id when those
+// ids are non-negative.
+void forward_route_layer_gpu(const route_layer l, network_state state)
+{
+    int src, b;
+    int dst_offset = 0;
+
+    if (l.stream >= 0) switch_stream(l.stream);
+    if (l.wait_stream_id >= 0) wait_stream(l.wait_stream_id);
+
+    for (src = 0; src < l.n; ++src) {
+        int full_size = l.input_sizes[src];
+        int slice = full_size / l.groups;
+        float *from = state.net.layers[l.input_layers[src]].output_gpu;
+        for (b = 0; b < l.batch; ++b) {
+            float *src_ptr = from + b*full_size + slice*l.group_id;
+            float *dst_ptr = l.output_gpu + dst_offset + b*l.outputs;
+            simple_copy_ongpu(slice, src_ptr, dst_ptr);
+        }
+        dst_offset += slice;
+    }
+}
+
+// GPU backward pass: adds each slice of l.delta_gpu onto the matching slice
+// (selected by l.group_id) of the corresponding input layer's delta_gpu.
+void backward_route_layer_gpu(const route_layer l, network_state state)
+{
+    int src, b;
+    int grad_offset = 0;
+    for (src = 0; src < l.n; ++src) {
+        int full_size = l.input_sizes[src];
+        int slice = full_size / l.groups;
+        float *to = state.net.layers[l.input_layers[src]].delta_gpu;
+        for (b = 0; b < l.batch; ++b) {
+            float *grad = l.delta_gpu + grad_offset + b*l.outputs;
+            float *dst = to + b*full_size + slice*l.group_id;
+            axpy_ongpu(slice, 1, grad, 1, dst, 1);
+        }
+        grad_offset += slice;
+    }
+}
+#endif
diff --git a/darknet-master/src/route_layer.h b/darknet-master/src/route_layer.h
new file mode 100644
index 0000000..2ebe396
--- /dev/null
+++ b/darknet-master/src/route_layer.h
@@ -0,0 +1,24 @@
+#ifndef ROUTE_LAYER_H
+#define ROUTE_LAYER_H
+#include "network.h"
+#include "layer.h"
+
+// A route layer is a plain `layer`; the alias only documents intent at call sites.
+typedef layer route_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Constructor, CPU forward/backward passes and resize hook of the route layer.
+route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size, int groups, int group_id);
+void forward_route_layer(const route_layer l, network_state state);
+void backward_route_layer(const route_layer l, network_state state);
+void resize_route_layer(route_layer *l, network *net);
+
+#ifdef GPU
+// GPU counterparts; only declared when the build defines GPU.
+void forward_route_layer_gpu(const route_layer l, network_state state);
+void backward_route_layer_gpu(const route_layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/sam_layer.c b/darknet-master/src/sam_layer.c
new file mode 100644
index 0000000..ddb7046
--- /dev/null
+++ b/darknet-master/src/sam_layer.c
@@ -0,0 +1,119 @@
+#include "sam_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <stdio.h>
+#include <assert.h>
+
+// Creates a SAM layer, which multiplies its input element-wise with the output of
+// layer `index` (see forward_sam_layer).
+// (w, h, c) and (w2, h2, c2) are the two shapes involved; they must be identical (asserted).
+layer make_sam_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2)
+{
+    // NOTE(review): the log text says "scale Layer" although this is the SAM layer - confirm intended.
+    fprintf(stderr,"scale Layer: %d\n", index);
+    layer l = { (LAYER_TYPE)0 };
+    l.type = SAM;
+    l.batch = batch;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+
+    l.out_w = w2;
+    l.out_h = h2;
+    l.out_c = c2;
+    // The element-wise product needs both operands to have the same shape.
+    assert(l.out_c == l.c);
+    assert(l.w == l.out_w && l.h == l.out_h);
+
+    l.outputs = l.out_w*l.out_h*l.out_c;
+    l.inputs = l.outputs;
+    l.index = index;
+
+    l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+
+    l.forward = forward_sam_layer;
+    l.backward = backward_sam_layer;
+#ifdef GPU
+    l.forward_gpu = forward_sam_layer_gpu;
+    l.backward_gpu = backward_sam_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+#endif
+    return l;
+}
+
+// Resizes the SAM layer's output to w x h (the channel count is kept) and reallocates
+// the output/delta buffers, on the device too when GPU is defined.
+// NOTE(review): l->w and l->h are not updated here - confirm callers do not rely on them.
+void resize_sam_layer(layer *l, int w, int h)
+{
+    l->out_w = w;
+    l->out_h = h;
+    l->outputs = l->out_w*l->out_h*l->out_c;
+    l->inputs = l->outputs;
+    l->delta = (float*)xrealloc(l->delta, l->outputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, l->outputs * l->batch * sizeof(float));
+
+#ifdef GPU
+    // The device buffers are recreated at the new size.
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
+#endif
+
+}
+
+// CPU forward pass: l.output = state.input * output of layer l.index (element-wise),
+// followed by the layer's activation.
+void forward_sam_layer(const layer l, network_state state)
+{
+    const int total = l.batch * l.out_c * l.out_w * l.out_h;
+    const float *scale_src = state.net.layers[l.index].output;
+    int k;
+
+    #pragma omp parallel for
+    for (k = 0; k < total; ++k) {
+        l.output[k] = state.input[k] * scale_src[k];
+    }
+
+    activate_array(l.output, l.outputs*l.batch, l.activation);
+}
+
+// CPU backward pass of the SAM layer (output = input * from_output, element-wise).
+// After applying the activation gradient to l.delta, the gradient is propagated to
+// both operands:
+//   state.delta += l.delta * from_output   (gradient w.r.t. the layer input)
+//   from_delta  += l.delta * state.input   (gradient w.r.t. layer l.index's output)
+void backward_sam_layer(const layer l, network_state state)
+{
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+
+    int size = l.batch * l.out_c * l.out_w * l.out_h;
+    float *from_output = state.net.layers[l.index].output;
+    float *from_delta = state.net.layers[l.index].delta;
+
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < size; ++i) {
+        state.delta[i] += l.delta[i] * from_output[i]; // l.delta * from
+
+        // Accumulate rather than overwrite: the referenced layer may already hold
+        // gradient from other consumers, which the former '=' discarded.
+        from_delta[i] += state.input[i] * l.delta[i]; // input * l.delta
+    }
+}
+
+#ifdef GPU
+// GPU forward pass: hands the referenced layer's output and this layer's input to
+// sam_gpu (channel_size fixed to 1), then applies the activation on the device.
+void forward_sam_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.batch * l.out_c * l.out_w * l.out_h;
+    const int channel_size = 1;
+    float *from_output_gpu = state.net.layers[l.index].output_gpu;
+
+    sam_gpu(from_output_gpu, total, channel_size, state.input, l.output_gpu);
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+}
+
+// GPU backward pass: applies the activation gradient to l.delta_gpu, then lets
+// backward_sam_gpu propagate it to state.delta and to layer l.index's delta_gpu.
+void backward_sam_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.batch * l.out_c * l.out_w * l.out_h;
+    const int channel_size = 1;
+    layer *from = &state.net.layers[l.index];
+
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    backward_sam_gpu(l.delta_gpu, total, channel_size, state.input, from->delta_gpu, from->output_gpu, state.delta);
+}
+#endif
diff --git a/darknet-master/src/sam_layer.h b/darknet-master/src/sam_layer.h
new file mode 100644
index 0000000..0fa66fa
--- /dev/null
+++ b/darknet-master/src/sam_layer.h
@@ -0,0 +1,23 @@
+#ifndef SAM_CHANNELS_LAYER_H
+#define SAM_CHANNELS_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Constructor, CPU forward/backward passes and resize hook of the SAM layer.
+layer make_sam_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2);
+void forward_sam_layer(const layer l, network_state state);
+void backward_sam_layer(const layer l, network_state state);
+void resize_sam_layer(layer *l, int w, int h);
+
+#ifdef GPU
+// GPU counterparts; only declared when the build defines GPU.
+void forward_sam_layer_gpu(const layer l, network_state state);
+void backward_sam_layer_gpu(const layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // SAM_CHANNELS_LAYER_H
diff --git a/darknet-master/src/scale_channels_layer.c b/darknet-master/src/scale_channels_layer.c
new file mode 100644
index 0000000..c4f6410
--- /dev/null
+++ b/darknet-master/src/scale_channels_layer.c
@@ -0,0 +1,150 @@
+#include "scale_channels_layer.h"
+#include "utils.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include <stdio.h>
+#include <assert.h>
+
+// Creates a SCALE_CHANNELS layer, which multiplies the output of layer `index`
+// (shape w2 x h2 x c2) by this layer's input (shape w x h x c).
+// With scale_wh == 0 the input must be 1 x 1 x c with c == c2;
+// with scale_wh != 0 it must be w x h x 1 with w == w2 and h == h2.
+// These constraints are enforced with assert().
+layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int scale_wh)
+{
+    fprintf(stderr,"scale Layer: %d\n", index);
+    layer l = { (LAYER_TYPE)0 };
+    l.type = SCALE_CHANNELS;
+    l.batch = batch;
+    l.scale_wh = scale_wh;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    if (!l.scale_wh) assert(w == 1 && h == 1);
+    else assert(c == 1);
+
+    l.out_w = w2;
+    l.out_h = h2;
+    l.out_c = c2;
+    if (!l.scale_wh) assert(l.out_c == l.c);
+    else assert(l.out_w == l.w && l.out_h == l.h);
+
+    // The output has the shape of the scaled (referenced) layer.
+    l.outputs = l.out_w*l.out_h*l.out_c;
+    l.inputs = l.outputs;
+    l.index = index;
+
+    l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+
+    l.forward = forward_scale_channels_layer;
+    l.backward = backward_scale_channels_layer;
+#ifdef GPU
+    l.forward_gpu = forward_scale_channels_layer_gpu;
+    l.backward_gpu = backward_scale_channels_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+#endif
+    return l;
+}
+
+// Adopts the current out_w/out_h of the referenced layer (l->index) and reallocates
+// the output/delta buffers accordingly; the channel count is kept.
+void resize_scale_channels_layer(layer *l, network *net)
+{
+    layer first = net->layers[l->index];
+    l->out_w = first.out_w;
+    l->out_h = first.out_h;
+    l->outputs = l->out_w*l->out_h*l->out_c;
+    l->inputs = l->outputs;
+    l->delta = (float*)xrealloc(l->delta, l->outputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, l->outputs * l->batch * sizeof(float));
+
+#ifdef GPU
+    // The device buffers are recreated at the new size.
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
+#endif
+
+}
+
+// CPU forward pass: multiplies every element of the referenced layer's output by a
+// scale taken from state.input, then applies the activation.
+//   scale_wh == 0: one scale per (batch, channel)  -> index i / channel_size
+//   scale_wh != 0: one scale per (batch, position) -> index i % channel_size + batch * channel_size
+void forward_scale_channels_layer(const layer l, network_state state)
+{
+    const int channel_size = l.out_w * l.out_h;
+    const int batch_size = l.out_c * channel_size;
+    const int total = l.batch * batch_size;
+    const float *src = state.net.layers[l.index].output;
+    int k;
+
+    if (!l.scale_wh) {
+        #pragma omp parallel for
+        for (k = 0; k < total; ++k) {
+            l.output[k] = state.input[k / channel_size] * src[k];
+        }
+    }
+    else {
+        #pragma omp parallel for
+        for (k = 0; k < total; ++k) {
+            const int pos = k % channel_size;
+            const int sample = k / batch_size;
+            l.output[k] = state.input[pos + sample*channel_size] * src[k];
+        }
+    }
+
+    activate_array(l.output, l.outputs*l.batch, l.activation);
+}
+
+// CPU backward pass of the scale-channels layer (output = scale * from_output).
+// After applying the activation gradient to l.delta:
+//   state.delta[scale index] += l.delta * from_output   (gradient w.r.t. the scales)
+//   from_delta[i]            += scale * l.delta          (gradient w.r.t. the scaled layer)
+//
+// Many output elements share one scale, so the loops run over the scale index and
+// reduce over its elements inside. Each state.delta entry is then written by exactly
+// one OpenMP thread; the former flat loop over all elements raced on state.delta.
+// The accumulation order per entry is unchanged.
+void backward_scale_channels_layer(const layer l, network_state state)
+{
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+
+    int channel_size = l.out_w * l.out_h;
+    int batch_size = l.out_c * l.out_w * l.out_h;
+    float *from_output = state.net.layers[l.index].output;
+    float *from_delta = state.net.layers[l.index].delta;
+
+    if (l.scale_wh) {
+        // One scale per (batch, spatial position), shared by all channels.
+        int positions = l.batch * channel_size;
+        int p;
+        #pragma omp parallel for
+        for (p = 0; p < positions; ++p) {
+            int b = p / channel_size;
+            int s = p % channel_size;
+            int c;
+            for (c = 0; c < l.out_c; ++c) {
+                int i = b*batch_size + c*channel_size + s;
+
+                state.delta[p] += l.delta[i] * from_output[i]; // l.delta * from
+
+                from_delta[i] += state.input[p] * l.delta[i]; // input * l.delta
+            }
+        }
+    }
+    else {
+        // One scale per (batch, channel), shared by all spatial positions.
+        int channels = l.batch * l.out_c;
+        int k;
+        #pragma omp parallel for
+        for (k = 0; k < channels; ++k) {
+            int j;
+            for (j = 0; j < channel_size; ++j) {
+                int i = k*channel_size + j;
+
+                state.delta[k] += l.delta[i] * from_output[i]; // l.delta * from
+
+                from_delta[i] += state.input[k] * l.delta[i]; // input * l.delta
+            }
+        }
+    }
+}
+
+#ifdef GPU
+// GPU forward pass: delegates the scaling to scale_channels_gpu and then applies
+// the activation on the device.
+void forward_scale_channels_layer_gpu(const layer l, network_state state)
+{
+    const int channel_size = l.out_w * l.out_h;
+    const int batch_size = l.out_c * l.out_w * l.out_h;
+    const int total = l.batch * l.out_c * l.out_w * l.out_h;
+    float *from_output_gpu = state.net.layers[l.index].output_gpu;
+
+    scale_channels_gpu(from_output_gpu, total, channel_size, batch_size, l.scale_wh, state.input, l.output_gpu);
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+}
+
+// GPU backward pass: applies the activation gradient to l.delta_gpu, then lets
+// backward_scale_channels_gpu propagate it to state.delta and to the referenced
+// layer's delta_gpu.
+void backward_scale_channels_layer_gpu(const layer l, network_state state)
+{
+    const int channel_size = l.out_w * l.out_h;
+    const int batch_size = l.out_c * l.out_w * l.out_h;
+    const int total = l.batch * l.out_c * l.out_w * l.out_h;
+    layer *from = &state.net.layers[l.index];
+
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    backward_scale_channels_gpu(l.delta_gpu, total, channel_size, batch_size, l.scale_wh, state.input, from->delta_gpu, from->output_gpu, state.delta);
+}
+#endif
diff --git a/darknet-master/src/scale_channels_layer.h b/darknet-master/src/scale_channels_layer.h
new file mode 100644
index 0000000..c8d51dd
--- /dev/null
+++ b/darknet-master/src/scale_channels_layer.h
@@ -0,0 +1,23 @@
+#ifndef SCALE_CHANNELS_LAYER_H
+#define SCALE_CHANNELS_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Constructor, CPU forward/backward passes and resize hook of the scale-channels layer.
+layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int scale_wh);
+void forward_scale_channels_layer(const layer l, network_state state);
+void backward_scale_channels_layer(const layer l, network_state state);
+void resize_scale_channels_layer(layer *l, network *net);
+
+#ifdef GPU
+// GPU counterparts; only declared when the build defines GPU.
+void forward_scale_channels_layer_gpu(const layer l, network_state state);
+void backward_scale_channels_layer_gpu(const layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // SCALE_CHANNELS_LAYER_H
diff --git a/darknet-master/src/shortcut_layer.c b/darknet-master/src/shortcut_layer.c
new file mode 100644
index 0000000..87f0d7e
--- /dev/null
+++ b/darknet-master/src/shortcut_layer.c
@@ -0,0 +1,293 @@
+#include "shortcut_layer.h"
+#include "convolutional_layer.h"
+#include "dark_cuda.h"
+#include "blas.h"
+#include "utils.h"
+#include "gemm.h"
+#include <stdio.h>
+#include <assert.h>
+
+// Creates a SHORTCUT layer that combines its own input with the outputs of the `n`
+// layers listed in `input_layers` (per-sample sizes in `input_sizes`).
+//
+// layers_output / layers_delta (and their *_gpu variants) are arrays of n pointers to
+// the referenced layers' buffers; they are stored, not copied. The output shape is
+// w x h x c. Depending on weights_type the layer has no weights, n+1 weights
+// (PER_FEATURE) or (n+1)*c weights (PER_CHANNEL), all initialised to 1.
+// Delta and weight-update buffers are only allocated when `train` is non-zero.
+// Host allocations go through xcalloc, so an allocation failure is reported instead
+// of leaving a NULL buffer behind.
+layer make_shortcut_layer(int batch, int n, int *input_layers, int* input_sizes, int w, int h, int c,
+    float **layers_output, float **layers_delta, float **layers_output_gpu, float **layers_delta_gpu, WEIGHTS_TYPE_T weights_type, WEIGHTS_NORMALIZATION_T weights_normalization,
+    ACTIVATION activation, int train)
+{
+    fprintf(stderr, "Shortcut Layer: ");
+    int i;
+    for(i = 0; i < n; ++i) fprintf(stderr, "%d, ", input_layers[i]);
+
+    layer l = { (LAYER_TYPE)0 };
+    l.train = train;
+    l.type = SHORTCUT;
+    l.batch = batch;
+    l.activation = activation;
+    l.n = n;
+    l.input_layers = input_layers;
+    l.input_sizes = input_sizes;
+    l.layers_output = layers_output;
+    l.layers_delta = layers_delta;
+    l.weights_type = weights_type;
+    l.weights_normalization = weights_normalization;
+    l.learning_rate_scale = 1;  // not necessary
+
+    l.w = l.out_w = w;
+    l.h = l.out_h = h;
+    l.c = l.out_c = c;
+    l.outputs = w*h*c;
+    l.inputs = l.outputs;
+
+    // The first referenced layer is the one used by the single-input fast path.
+    l.index = l.input_layers[0];
+
+
+    if (train) l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+
+    l.nweights = 0;
+    if (l.weights_type == PER_FEATURE) l.nweights = (l.n + 1);
+    else if (l.weights_type == PER_CHANNEL) l.nweights = (l.n + 1) * l.c;
+
+    if (l.nweights > 0) {
+        l.weights = (float*)xcalloc(l.nweights, sizeof(float));
+        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1;
+
+        if (train) l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));
+        l.update = update_shortcut_layer;
+    }
+
+    l.forward = forward_shortcut_layer;
+    l.backward = backward_shortcut_layer;
+#ifndef GPU
+    // SWISH/MISH need the pre-activation values for their gradient.
+    if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)xcalloc(l.batch*l.outputs, sizeof(float));
+#endif // GPU
+
+#ifdef GPU
+    if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, l.batch*l.outputs);
+
+    l.forward_gpu = forward_shortcut_layer_gpu;
+    l.backward_gpu = backward_shortcut_layer_gpu;
+
+    if (l.nweights > 0) {
+        l.update_gpu = update_shortcut_layer_gpu;
+        l.weights_gpu = cuda_make_array(l.weights, l.nweights);
+        if (train) l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
+    }
+
+    if (train) l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+
+    l.input_sizes_gpu = cuda_make_int_array_new_api(input_sizes, l.n);
+    l.layers_output_gpu = (float**)cuda_make_array_pointers((void**)layers_output_gpu, l.n);
+    l.layers_delta_gpu = (float**)cuda_make_array_pointers((void**)layers_delta_gpu, l.n);
+#endif  // GPU
+
+    l.bflops = l.out_w * l.out_h * l.out_c * l.n / 1000000000.;
+    if (l.weights_type) l.bflops *= 2;
+    fprintf(stderr, " wt = %d, wn = %d, outputs:%4d x%4d x%4d %5.3f BF\n", l.weights_type, l.weights_normalization, l.out_w, l.out_h, l.out_c, l.bflops);
+    return l;
+}
+
+// Resizes the shortcut layer to w x h (the channel count is kept), reallocates its
+// buffers and refreshes the cached sizes and buffer pointers of the referenced layers
+// from `net`. Every referenced layer must have out_w == w and out_h == h (asserted).
+// Host (re)allocations go through xrealloc/xcalloc so failures are reported instead
+// of silently leaving a NULL pointer.
+void resize_shortcut_layer(layer *l, int w, int h, network *net)
+{
+    l->w = l->out_w = w;
+    l->h = l->out_h = h;
+    l->outputs = w*h*l->out_c;
+    l->inputs = l->outputs;
+    if (l->train) l->delta = (float*)xrealloc(l->delta, l->outputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, l->outputs * l->batch * sizeof(float));
+
+    int i;
+    for (i = 0; i < l->n; ++i) {
+        int index = l->input_layers[i];
+        l->input_sizes[i] = net->layers[index].outputs;
+        l->layers_output[i] = net->layers[index].output;
+        l->layers_delta[i] = net->layers[index].delta;
+
+        assert(l->w == net->layers[index].out_w && l->h == net->layers[index].out_h);
+    }
+
+    if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)xrealloc(l->activation_input, l->batch*l->outputs * sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
+
+    if (l->train) {
+        cuda_free(l->delta_gpu);
+        l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
+    }
+
+    // Stage the referenced layers' device pointers on the host, then upload the tables.
+    float **layers_output_gpu = (float **)xcalloc(l->n, sizeof(float *));
+    float **layers_delta_gpu = (float **)xcalloc(l->n, sizeof(float *));
+
+    for (i = 0; i < l->n; ++i) {
+        const int index = l->input_layers[i];
+        layers_output_gpu[i] = net->layers[index].output_gpu;
+        layers_delta_gpu[i] = net->layers[index].delta_gpu;
+    }
+
+    memcpy_ongpu(l->input_sizes_gpu, l->input_sizes, l->n * sizeof(int));
+    memcpy_ongpu(l->layers_output_gpu, layers_output_gpu, l->n * sizeof(float*));
+    memcpy_ongpu(l->layers_delta_gpu, layers_delta_gpu, l->n * sizeof(float*));
+
+    free(layers_output_gpu);
+    free(layers_delta_gpu);
+
+    if (l->activation == SWISH || l->activation == MISH) {
+        cuda_free(l->activation_input_gpu);
+        l->activation_input_gpu = cuda_make_array(l->activation_input, l->batch*l->outputs);
+    }
+#endif
+
+}
+
+// CPU forward pass of the shortcut layer.
+// Fast path (no weights, a single referenced layer whose w/h/c match this layer):
+// output = input + referenced output. Otherwise the work is delegated to
+// shortcut_multilayer_cpu. The layer's activation is applied afterwards.
+void forward_shortcut_layer(const layer l, network_state state)
+{
+    const layer *from = &state.net.layers[l.index];
+    const int total = l.outputs * l.batch;
+    const int same_shape = (from->w == l.w && from->h == l.h && from->c == l.c);
+
+    if (l.nweights == 0 && l.n == 1 && same_shape) {
+        const int size = l.batch * l.w * l.h * l.c;
+        int k;
+        #pragma omp parallel for
+        for (k = 0; k < size; ++k) {
+            l.output[k] = state.input[k] + from->output[k];
+        }
+    }
+    else {
+        shortcut_multilayer_cpu(total, l.outputs, l.batch, l.n, l.input_sizes, l.layers_output, l.output, state.input, l.weights, l.nweights, l.weights_normalization);
+    }
+
+    // SWISH and MISH also record their pre-activation input for the backward pass.
+    switch (l.activation) {
+    case SWISH:
+        activate_array_swish(l.output, total, l.activation_input, l.output);
+        break;
+    case MISH:
+        activate_array_mish(l.output, total, l.activation_input, l.output);
+        break;
+    default:
+        activate_array_cpu_custom(l.output, total, l.activation);
+        break;
+    }
+}
+
+// CPU backward pass of the shortcut layer.
+//
+// First runs the gradient routine matching l.activation over l.delta
+// (SWISH and MISH use l.activation_input), then calls
+// backward_shortcut_multilayer_cpu() with state.delta, the per-source delta
+// buffers in l.layers_delta and, when the layer has weights, l.weight_updates.
+void backward_shortcut_layer(const layer l, network_state state)
+{
+    const int total = l.outputs * l.batch;
+
+    switch (l.activation) {
+    case SWISH:
+        gradient_array_swish(l.output, total, l.activation_input, l.delta);
+        break;
+    case MISH:
+        gradient_array_mish(total, l.activation_input, l.delta);
+        break;
+    default:
+        gradient_array(l.output, total, l.activation, l.delta);
+        break;
+    }
+
+    backward_shortcut_multilayer_cpu(total, l.outputs, l.batch, l.n, l.input_sizes,
+        l.layers_delta, state.delta, l.delta, l.weights, l.weight_updates, l.nweights, state.input, l.layers_output, l.weights_normalization);
+}
+
+// CPU weight update for a shortcut layer; does nothing when the layer has
+// no weights (l.nweights <= 0).
+//
+// batch              - divisor applied to the learning rate, multiplier for decay.
+// learning_rate_init - base learning rate, scaled by l.learning_rate_scale.
+// momentum           - factor applied to l.weight_updates after the step.
+// decay              - weight-decay coefficient.
+//
+// Issues three BLAS-style calls in order: the decay term from l.weights into
+// l.weight_updates, the step from l.weight_updates into l.weights, and the
+// momentum scaling of l.weight_updates.
+void update_shortcut_layer(layer l, int batch, float learning_rate_init, float momentum, float decay)
+{
+    if (l.nweights <= 0) return;
+
+    const float rate = learning_rate_init*l.learning_rate_scale;
+
+    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(l.nweights, rate / batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
+}
+
+#ifdef GPU
+// GPU forward pass of the shortcut layer.
+//
+// Always goes through shortcut_multilayer_gpu() (there is no dedicated
+// single-source path here), then applies the layer activation in place on
+// l.output_gpu; SWISH and MISH additionally receive l.activation_input_gpu.
+void forward_shortcut_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.outputs * l.batch;
+
+    shortcut_multilayer_gpu(l.outputs, l.batch, l.n, l.input_sizes_gpu, l.layers_output_gpu, l.output_gpu, state.input, l.weights_gpu, l.nweights, l.weights_normalization);
+
+    switch (l.activation) {
+    case SWISH:
+        activate_array_swish_ongpu(l.output_gpu, total, l.activation_input_gpu, l.output_gpu);
+        break;
+    case MISH:
+        activate_array_mish_ongpu(l.output_gpu, total, l.activation_input_gpu, l.output_gpu);
+        break;
+    default:
+        activate_array_ongpu(l.output_gpu, total, l.activation);
+        break;
+    }
+}
+
+// GPU backward pass of the shortcut layer.
+//
+// Runs the gradient routine matching l.activation over l.delta_gpu, then
+// calls backward_shortcut_multilayer_gpu() with state.delta, the per-source
+// device delta pointers and the weight/weight-update device buffers.
+void backward_shortcut_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.outputs * l.batch;
+
+    switch (l.activation) {
+    case SWISH:
+        gradient_array_swish_ongpu(l.output_gpu, total, l.activation_input_gpu, l.delta_gpu);
+        break;
+    case MISH:
+        gradient_array_mish_ongpu(total, l.activation_input_gpu, l.delta_gpu);
+        break;
+    default:
+        gradient_array_ongpu(l.output_gpu, total, l.activation, l.delta_gpu);
+        break;
+    }
+
+    backward_shortcut_multilayer_gpu(l.outputs, l.batch, l.n, l.input_sizes_gpu, l.layers_delta_gpu, state.delta, l.delta_gpu,
+        l.weights_gpu, l.weight_updates_gpu, l.nweights, state.input, l.layers_output_gpu, l.weights_normalization);
+}
+
+// GPU weight update for a shortcut layer; does nothing when the layer has
+// no weights (l.nweights <= 0).
+//
+// batch              - divisor applied to the learning rate.
+// learning_rate_init - base learning rate, scaled by l.learning_rate_scale
+//                      and divided by loss_scale.
+// momentum           - factor applied to l.weight_updates_gpu after the step.
+// decay              - accepted for interface parity with the CPU version but
+//                      not used: no weight-decay step is issued here.
+// loss_scale         - divisor applied to the learning rate.
+//
+// Before stepping, the weight updates and the weights are passed through
+// reset_nan_and_inf() / fix_nan_and_inf(), and the updates through
+// constrain_ongpu() with a bound of 1.
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
+{
+    if (l.nweights <= 0) return;
+
+    const float rate = learning_rate_init*l.learning_rate_scale / loss_scale;
+
+    // sanitize the buffers before they are combined
+    reset_nan_and_inf(l.weight_updates_gpu, l.nweights);
+    fix_nan_and_inf(l.weights_gpu, l.nweights);
+    constrain_ongpu(l.nweights, 1, l.weight_updates_gpu, 1);
+
+    // step, then momentum scaling of the accumulated updates
+    axpy_ongpu(l.nweights, rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+    scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
+}
+
+// Copies the shortcut layer's weight updates and weights from the device
+// buffers into the host buffers.
+//
+// Note that the device-side weight updates are first passed through
+// constrain_ongpu() with a bound of 1, so this call modifies GPU state too.
+// Both copies are queued asynchronously; the function returns only after the
+// CUDA stream from get_cuda_stream() has been synchronized.
+void pull_shortcut_layer(layer l)
+{
+    const int count = l.nweights;
+
+    constrain_ongpu(count, 1, l.weight_updates_gpu, 1);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, count);
+    cuda_pull_array_async(l.weights_gpu, l.weights, count);
+    CHECK_CUDA(cudaPeekAtLastError());
+    CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+}
+
+// Uploads the shortcut layer's host weights (l.weights, l.nweights values)
+// into l.weights_gpu and checks for a pending CUDA error afterwards.
+// Weight updates are not pushed.
+void push_shortcut_layer(layer l)
+{
+    const int count = l.nweights;
+
+    cuda_push_array(l.weights_gpu, l.weights, count);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+#endif
diff --git a/darknet-master/src/shortcut_layer.h b/darknet-master/src/shortcut_layer.h
new file mode 100644
index 0000000..8932137
--- /dev/null
+++ b/darknet-master/src/shortcut_layer.h
@@ -0,0 +1,29 @@
+// Public interface of the shortcut layer (implemented in shortcut_layer.c).
+#ifndef SHORTCUT_LAYER_H
+#define SHORTCUT_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Constructs a shortcut layer.
+// NOTE(review): the definition is not part of this header; judging by the
+// parameter names, n is the number of source layers and the array arguments
+// hold one entry per source layer - confirm against shortcut_layer.c.
+layer make_shortcut_layer(int batch, int n, int *input_layers, int* input_sizes, int w, int h, int c,
+    float **layers_output, float **layers_delta, float **layers_output_gpu, float **layers_delta_gpu, WEIGHTS_TYPE_T weights_type, WEIGHTS_NORMALIZATION_T weights_normalization,
+    ACTIVATION activation, int train);
+// CPU forward pass: combines state.input with the source layer output(s) and
+// applies the layer activation.
+void forward_shortcut_layer(const layer l, network_state state);
+// CPU backward pass: applies the activation gradient to l.delta and then
+// delegates to backward_shortcut_multilayer_cpu().
+void backward_shortcut_layer(const layer l, network_state state);
+// CPU weight update; a no-op for layers without weights.
+void update_shortcut_layer(layer l, int batch, float learning_rate_init, float momentum, float decay);
+// Re-sizes the layer's buffers for a new w x h and refreshes the per-source
+// pointers from net.
+void resize_shortcut_layer(layer *l, int w, int h, network *net);
+
+#ifdef GPU
+// GPU counterparts of the forward / backward / update entry points above.
+void forward_shortcut_layer_gpu(const layer l, network_state state);
+void backward_shortcut_layer_gpu(const layer l, network_state state);
+// decay is part of the signature but is not used by the current implementation.
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale);
+// Device -> host copy of weights and weight updates.
+void pull_shortcut_layer(layer l);
+// Host -> device copy of weights.
+void push_shortcut_layer(layer l);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/softmax_layer.c b/darknet-master/src/softmax_layer.c
new file mode 100644
index 0000000..6535c57
--- /dev/null
+++ b/darknet-master/src/softmax_layer.c
@@ -0,0 +1,618 @@
+#include "softmax_layer.h"
+#include "blas.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define SECRET_NUM -1234
+
+// Applies softmax() separately to every group of a class hierarchy, for each
+// item of the batch.
+//
+// input     - batch * inputs values; item b starts at input + b*inputs.
+// temp      - temperature forwarded to softmax().
+// hierarchy - supplies the number of groups and each group's size; groups
+//             are laid out back to back inside one item.
+// output    - receives the results at the same offsets as input.
+void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
+{
+    int item;
+    for (item = 0; item < batch; ++item) {
+        float *src = input + item*inputs;
+        float *dst = output + item*inputs;
+        int g;
+        for (g = 0; g < hierarchy->groups; ++g) {
+            const int len = hierarchy->group_size[g];
+            softmax(src, len, temp, dst, 1);
+            // advance both cursors to the start of the next group
+            src += len;
+            dst += len;
+        }
+    }
+}
+
+// Creates a SOFTMAX layer with `inputs` values per batch item, split into
+// `groups` equally sized groups (inputs must be divisible by groups; this is
+// only enforced through assert()).
+//
+// Allocates zero-filled host buffers for loss, output and delta
+// (inputs * batch floats each) plus a single-float cost, wires up the
+// forward/backward callbacks and, in GPU builds, creates the matching
+// device arrays. Returns the layer by value.
+softmax_layer make_softmax_layer(int batch, int inputs, int groups)
+{
+    assert(inputs%groups == 0);
+    fprintf(stderr, "softmax                                        %4d\n",  inputs);
+
+    const int total = inputs * batch;
+
+    softmax_layer l = { (LAYER_TYPE)0 };
+    l.type = SOFTMAX;
+    l.batch = batch;
+    l.groups = groups;
+    l.inputs = inputs;
+    l.outputs = inputs;
+
+    l.loss = (float*)xcalloc(total, sizeof(float));
+    l.output = (float*)xcalloc(total, sizeof(float));
+    l.delta = (float*)xcalloc(total, sizeof(float));
+    l.cost = (float*)xcalloc(1, sizeof(float));
+
+    l.forward = forward_softmax_layer;
+    l.backward = backward_softmax_layer;
+#ifdef GPU
+    l.forward_gpu = forward_softmax_layer_gpu;
+    l.backward_gpu = backward_softmax_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, total);
+    l.loss_gpu = cuda_make_array(l.loss, total);
+    l.delta_gpu = cuda_make_array(l.delta, total);
+#endif
+    return l;
+}
+
+// CPU forward pass of the softmax layer.
+//
+// Without a softmax tree, one softmax_cpu() call covers all l.groups groups
+// of size l.inputs/l.groups. With a tree, softmax_cpu() is called once per
+// hierarchy group, walking through the input with a running offset.
+//
+// If ground truth is available (net.truth) and the layer is not marked
+// noloss, softmax_x_ent_cpu() is called to fill l.delta and l.loss, and the
+// sum of l.loss is stored in l.cost[0].
+void forward_softmax_layer(const softmax_layer l, network_state net)
+{
+    if (!l.softmax_tree) {
+        const int group_len = l.inputs/l.groups;
+        softmax_cpu(net.input, group_len, l.batch, l.inputs, l.groups, group_len, 1, l.temperature, l.output);
+    }
+    else {
+        int offset = 0;
+        int g;
+        for (g = 0; g < l.softmax_tree->groups; ++g) {
+            const int len = l.softmax_tree->group_size[g];
+            softmax_cpu(net.input + offset, len, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + offset);
+            offset += len;
+        }
+    }
+
+    // no loss to compute without labels or when loss is disabled
+    if (!net.truth || l.noloss) return;
+
+    const int total = l.batch*l.inputs;
+    softmax_x_ent_cpu(total, l.output, net.truth, l.delta, l.loss);
+    l.cost[0] = sum_array(l.loss, total);
+}
+
+// CPU backward pass of the softmax layer: a single axpy_cpu() call with a
+// factor of 1 from l.delta into net.delta over inputs * batch elements
+// (presumably accumulating this layer's delta into the upstream one).
+void backward_softmax_layer(const softmax_layer l, network_state net)
+{
+    const int total = l.inputs*l.batch;
+    axpy_cpu(total, 1, l.delta, 1, net.delta, 1);
+}
+
+#ifdef GPU
+
+// Copies the softmax output (inputs * batch floats) from layer.output_gpu
+// into the host buffer layer.output.
+void pull_softmax_layer_output(const softmax_layer layer)
+{
+    const int total = layer.inputs*layer.batch;
+    cuda_pull_array(layer.output_gpu, layer.output, total);
+}
+
+// GPU forward pass of the softmax layer.
+//
+// Three mutually exclusive paths produce l.output_gpu:
+//   - softmax tree:  softmax_tree_gpu() over the whole hierarchy;
+//   - spatial:       softmax_gpu_new_api() across the l.c channels of every
+//                    spatial position (called with a literal temperature of 1,
+//                    not l.temperature);
+//   - default:       softmax_gpu_new_api() over l.groups groups.
+//
+// If ground truth is available (net.truth) and the layer is not marked
+// noloss, softmax_x_ent_gpu() fills l.delta_gpu and l.loss_gpu; with a
+// softmax tree both are additionally passed through mask_gpu_new_api() with
+// SECRET_NUM and the truth buffer. The loss is then pulled to the host and
+// its sum stored in l.cost[0].
+void forward_softmax_layer_gpu(const softmax_layer l, network_state net)
+{
+    const int total = l.batch*l.inputs;
+
+    if (l.softmax_tree) {
+        softmax_tree_gpu(net.input, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
+    }
+    else if (l.spatial) {
+        softmax_gpu_new_api(net.input, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
+    }
+    else {
+        softmax_gpu_new_api(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
+    }
+
+    // no loss to compute without labels or when loss is disabled
+    if (!net.truth || l.noloss) return;
+
+    softmax_x_ent_gpu(total, l.output_gpu, net.truth, l.delta_gpu, l.loss_gpu);
+    if (l.softmax_tree) {
+        mask_gpu_new_api(total, l.delta_gpu, SECRET_NUM, net.truth, 0);
+        mask_gpu_new_api(total, l.loss_gpu, SECRET_NUM, net.truth, 0);
+    }
+    cuda_pull_array(l.loss_gpu, l.loss, total);
+    l.cost[0] = sum_array(l.loss, total);
+}
+
+// GPU backward pass of the softmax layer: a single axpy_ongpu() call from
+// layer.delta_gpu into state.delta over batch * inputs elements, using
+// state.net.loss_scale as the factor.
+void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+{
+    const int total = layer.batch*layer.inputs;
+    axpy_ongpu(total, state.net.loss_scale, layer.delta_gpu, 1, state.delta, 1);
+}
+
+#endif
+
+// -------------------------------------
+
+// Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf
+//
+// Creates a CONTRASTIVE layer and returns it by value.
+//
+// batch, w, h, c - batch size and input dimensions stored on the layer.
+// classes        - number of classes; only used in classifier mode.
+// inputs         - values per batch item; also used as l.outputs.
+// yolo_layer     - NULL selects classifier mode. Otherwise detection mode:
+//                  labels, class_ids, n, classes, truths and max_boxes are
+//                  taken from (and the label arrays shared with) that layer,
+//                  and the embedding size is inputs / (n*h*w), which must
+//                  equal the yolo layer's embedding_size.
+//
+// In classifier mode the layer owns a label array with one entry per batch
+// item and three step x step similarity matrices, where
+// step = batch*n*h*w. In detection mode those matrices stay NULL.
+contrastive_layer make_contrastive_layer(int batch, int w, int h, int c, int classes, int inputs, layer *yolo_layer)
+{
+    contrastive_layer l = { (LAYER_TYPE)0 };
+    l.type = CONTRASTIVE;
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.temperature = 1;
+    l.max_boxes = 0;
+
+    if (!yolo_layer) {
+        // classifier mode
+        l.detection = 0;
+        l.labels = (int*)xcalloc(l.batch, sizeof(int)); // labels
+        l.n = 1;                                        // num of embeddings per cell
+        l.classes = classes;                            // num of classes
+        l.embedding_size = l.c;
+    }
+    else {
+        // detection mode: borrow the bookkeeping of the paired [yolo] layer
+        l.detection = 1;
+        l.max_boxes = yolo_layer->max_boxes;
+        l.labels = yolo_layer->labels;        // track id
+        l.class_ids = yolo_layer->class_ids;  // class_ids
+        l.n = yolo_layer->n;                  // num of embeddings per cell = num of anchors
+        l.classes = yolo_layer->classes;      // num of classes
+        l.truths = yolo_layer->truths;
+        classes = l.classes;
+
+        const int cells = l.n*l.h*l.w;
+        l.embedding_size = l.inputs / cells;
+        if (l.embedding_size != yolo_layer->embedding_size) {
+            printf(" Error: [contrastive] embedding_size=%d isn't equal to [yolo] embedding_size=%d. They should use the same [convolutional] layer \n", l.embedding_size, yolo_layer->embedding_size);
+            error("Error!", DARKNET_LOC);
+        }
+        if (l.inputs % cells != 0) {
+            printf(" Warning: filters= number in the previous (embedding) layer isn't divisable by number of anchors %d \n", l.n);
+        }
+    }
+
+    const int total = inputs * batch;
+    l.loss = (float*)xcalloc(1, sizeof(float));
+    l.output = (float*)xcalloc(total, sizeof(float));
+    l.delta = (float*)xcalloc(total, sizeof(float));
+    l.cost = (float*)xcalloc(1, sizeof(float));
+
+    // number of embedding positions across the whole batch
+    const size_t step = l.batch*l.n*l.h*l.w;
+    if (l.detection) {
+        l.cos_sim = NULL;
+        l.exp_cos_sim = NULL;
+        l.p_constrastive = NULL;
+    }
+    else {
+        l.cos_sim = (float*)xcalloc(step*step, sizeof(float));
+        l.exp_cos_sim = (float*)xcalloc(step*step, sizeof(float));
+        l.p_constrastive = (float*)xcalloc(step*step, sizeof(float));
+    }
+
+    l.forward = forward_contrastive_layer;
+    l.backward = backward_contrastive_layer;
+#ifdef GPU
+    l.forward_gpu = forward_contrastive_layer_gpu;
+    l.backward_gpu = backward_contrastive_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, total);
+    l.delta_gpu = cuda_make_array(l.delta, total);
+
+    // device scratch for the pairwise parameters, expressed in 4-byte units
+    const int pairs_side = l.max_boxes*l.batch;
+    const int max_contr_size = pairs_side*pairs_side * sizeof(contrastive_params)/4;
+    printf(" max_contr_size = %d MB \n", max_contr_size / (1024*1024));
+    l.contrast_p_gpu = (contrastive_params *)cuda_make_array(NULL, max_contr_size);
+#endif
+    fprintf(stderr, "contrastive %4d x%4d x%4d x emb_size %4d x batch: %4d  classes = %4d, step = %4zu \n", w, h, l.n, l.embedding_size, batch, l.classes, step);
+    if(l.detection) fprintf(stderr, "detection \n");
+    return l;
+}
+
+// Clamps val to the symmetric interval [-max_val, max_val] and returns the
+// result; values already inside the interval are returned unchanged.
+static inline float clip_value(float val, const float max_val)
+{
+    const float min_val = -max_val;
+
+    if (val > max_val) return max_val;
+    if (val < min_val) return min_val;
+    return val;
+}
+
+// CPU forward pass of the contrastive layer (training only).
+//
+// l     - contrastive layer; l.detection selects detector vs. classifier mode.
+// state - network state; state.input holds the embeddings and, in classifier
+//         mode, state.truth holds l.classes values per batch item.
+//
+// Outline:
+//   1. classifier mode only: derive one label per batch item;
+//   2. extract one embedding vector per (batch, n, h, w) position with a
+//      non-negative label;
+//   3. compute cosine similarities for all admissible pairs, recording them
+//      in contrast_p (and in l.cos_sim / l.exp_cos_sim in classifier mode);
+//   4. report a "contrast accuracy" figure, stored in *l.loss;
+//   5. compute the pairwise P values;
+//   6. accumulate gradients into l.delta, scale and clip them, and store the
+//      squared magnitude of l.delta in *l.cost.
+//
+// Nothing is done (not even clearing l.delta) when state.train is false.
+void forward_contrastive_layer(contrastive_layer l, network_state state)
+{
+    if (!state.train) return;
+    const float truth_thresh = state.net.label_smooth_eps;
+
+    // samples per time step; pairs are only formed inside one time step
+    const int mini_batch = l.batch / l.steps;
+
+    int b, n, w, h;
+    fill_cpu(l.batch*l.inputs, 0, l.delta, 1);
+
+    if (!l.detection) {
+
+        // default labels: alternating 0/1 when adversarial, otherwise
+        // consecutive items (2k, 2k+1) share label k
+        for (b = 0; b < l.batch; ++b) {
+            if (state.net.adversarial) l.labels[b] = b % 2;
+            else l.labels[b] = b / 2;
+        }
+
+        // set labels
+        // NOTE(review): the body below does not depend on h or w, so the two
+        // inner loops repeat identical work l.h*l.w times.
+        for (b = 0; b < l.batch; ++b) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    // find truth with max prob (only 1 label even if mosaic is used)
+                    // NOTE(review): as written, the label becomes the LAST class
+                    // whose truth exceeds truth_thresh; max_truth is assigned
+                    // but never compared against.
+                    float max_truth = 0;
+                    int n;
+                    for (n = 0; n < l.classes; ++n) {
+                        const float truth_prob = state.truth[b*l.classes + n];
+                        //printf(" truth_prob = %f, ", truth_prob);
+                        //if (truth_prob > max_truth)
+                        if (truth_prob > truth_thresh)
+                        {
+                            //printf(" truth_prob = %f, max_truth = %f, n = %d; ", truth_prob, max_truth, n);
+                            max_truth = truth_prob;
+                            l.labels[b] = n;
+                        }
+                    }
+                    //printf(", l.labels[b] = %d ", l.labels[b]);
+                }
+            }
+        }
+
+    }
+    //printf("\n\n");
+
+    // set pointers to features
+    // One slot per (b, n, h, w) position; slots for positions with a negative
+    // label are never assigned (the cleanup at the end skips null slots).
+    float **z = (float**)xcalloc(l.batch*l.n*l.h*l.w, sizeof(float*));
+
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    // NOTE(review): l.labels is indexed per position here, but in
+                    // classifier mode make_contrastive_layer() allocates only
+                    // l.batch entries - assumes l.h == l.w == 1 there; TODO confirm.
+                    if (l.labels[z_index] < 0) continue;
+
+                    //const int input_index = b*l.inputs + n*l.embedding_size*l.h*l.w + h*l.w + w;
+                    //float *ptr = state.input + input_index;
+                    //z[z_index] = ptr;
+
+                    z[z_index] = (float*)xcalloc(l.embedding_size, sizeof(float));
+                    get_embedding(state.input, l.w, l.h, l.c, l.embedding_size, w, h, n, b, z[z_index]);
+                }
+            }
+        }
+    }
+
+    int b2, n2, h2, w2;
+    int contrast_p_index = 0;
+
+    // contrast_p collects one record per admissible ordered pair (i, j); the
+    // initial capacity is a guess and the buffer is grown on demand below
+    const size_t step = l.batch*l.n*l.h*l.w;
+    size_t contrast_p_size = step;
+    if (!l.detection) contrast_p_size = l.batch*l.batch;
+    contrastive_params *contrast_p = (contrastive_params*)xcalloc(contrast_p_size, sizeof(contrastive_params));
+
+    // best similarity seen per position towards a same-label / different-label
+    // partner; -10 marks "no partner seen" (below the valid cosine range)
+    float *max_sim_same = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    float *max_sim_diff = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    fill_cpu(l.batch*l.inputs, -10, max_sim_same, 1);
+    fill_cpu(l.batch*l.inputs, -10, max_sim_diff, 1);
+
+    // precalculate cosine similiraty
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    if (l.labels[z_index] < 0) continue;
+
+                    for (b2 = 0; b2 < l.batch; ++b2) {
+                        for (n2 = 0; n2 < l.n; ++n2) {
+                            for (h2 = 0; h2 < l.h; ++h2) {
+                                for (w2 = 0; w2 < l.w; ++w2)
+                                {
+                                    // pair filter: both labelled, not the same
+                                    // position, same class id in detection mode,
+                                    // and both from the same time step
+                                    const int z_index2 = b2*l.n*l.h*l.w + n2*l.h*l.w + h2*l.w + w2;
+                                    if (l.labels[z_index2] < 0) continue;
+                                    if (z_index == z_index2) continue;
+                                    if (l.detection)
+                                        if (l.class_ids[z_index] != l.class_ids[z_index2]) continue;
+
+                                    const int time_step_i = b / mini_batch;
+                                    const int time_step_j = b2 / mini_batch;
+                                    if (time_step_i != time_step_j) continue;
+
+                                    // shadows the outer `step`; same value
+                                    const size_t step = l.batch*l.n*l.h*l.w;
+
+                                    const float sim = cosine_similarity(z[z_index], z[z_index2], l.embedding_size);
+                                    const float exp_sim = expf(sim / l.temperature);
+                                    if (!l.detection) {
+                                        l.cos_sim[z_index*step + z_index2] = sim;
+                                        l.exp_cos_sim[z_index*step + z_index2] = exp_sim;
+                                    }
+
+                                    // calc good sim
+                                    if (l.labels[z_index] == l.labels[z_index2] && max_sim_same[z_index] < sim) max_sim_same[z_index] = sim;
+                                    if (l.labels[z_index] != l.labels[z_index2] && max_sim_diff[z_index] < sim) max_sim_diff[z_index] = sim;
+                                    //printf(" z_i = %d, z_i2 = %d, l = %d, l2 = %d, sim = %f \n", z_index, z_index2, l.labels[z_index], l.labels[z_index2], sim);
+
+                                    contrast_p[contrast_p_index].sim = sim;
+                                    contrast_p[contrast_p_index].exp_sim = exp_sim;
+                                    contrast_p[contrast_p_index].i = z_index;
+                                    contrast_p[contrast_p_index].j = z_index2;
+                                    contrast_p[contrast_p_index].time_step_i = time_step_i;
+                                    contrast_p[contrast_p_index].time_step_j = time_step_j;
+                                    contrast_p_index++;
+                                    //printf(" contrast_p_index = %d, contrast_p_size = %d \n", contrast_p_index, contrast_p_size);
+                                    // keep room for the next record; once the initial
+                                    // capacity is used up this grows by one entry per pair
+                                    if ((contrast_p_index+1) >= contrast_p_size) {
+                                        contrast_p_size = contrast_p_index + 1;
+                                        //printf(" contrast_p_size = %d, z_index = %d, z_index2 = %d \n", contrast_p_size, z_index, z_index2);
+                                        contrast_p = (contrastive_params*)xrealloc(contrast_p, contrast_p_size * sizeof(contrastive_params));
+                                    }
+
+                                    // diagnostic only: report similarities outside [-1, 1]
+                                    if (sim > 1.001 || sim < -1.001) {
+                                        printf(" sim = %f, ", sim);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // calc contrastive accuracy
+    // Only positions that saw both a same-label and a different-label partner
+    // are counted; a position is "good" when its best same-label similarity
+    // exceeds its best different-label similarity.
+    int i;
+    int good_sims = 0, all_sims = 0, same_sim = 0, diff_sim = 0;
+    for (i = 0; i < l.batch*l.inputs; ++i) {
+        if (max_sim_same[i] >= -1 && max_sim_diff[i] >= -1) {
+            // both inner tests repeat the outer condition, so same_sim and
+            // diff_sim always end up equal to all_sims
+            if (max_sim_same[i] >= -1) same_sim++;
+            if (max_sim_diff[i] >= -1) diff_sim++;
+            ++all_sims;
+            //printf(" max_sim_diff[i] = %f, max_sim_same[i] = %f \n", max_sim_diff[i], max_sim_same[i]);
+            if (max_sim_diff[i] < max_sim_same[i]) good_sims++;
+        }
+    }
+    // *l.loss carries the accuracy percentage (not a loss); -1 means "no data".
+    // NOTE(review): both operands are int, so the percentage is truncated to a
+    // whole number before it is stored.
+    if (all_sims > 0) {
+        *l.loss = 100 * good_sims / all_sims;
+    }
+    else *l.loss = -1;
+    printf(" Contrast accuracy = %f %%, all = %d, good = %d, same = %d, diff = %d \n", *l.loss, all_sims, good_sims, same_sim, diff_sim);
+    free(max_sim_same);
+    free(max_sim_diff);
+
+
+    /*
+    // show near sim
+    float good_contrast = 0;
+    for (b = 0; b < l.batch; b += 2) {
+        float same = l.cos_sim[b*l.batch + b];
+        float aug = l.cos_sim[b*l.batch + b + 1];
+        float diff = l.cos_sim[b*l.batch + b + 2];
+        good_contrast += (aug > diff);
+        //printf(" l.labels[b] = %d, l.labels[b+1] = %d, l.labels[b+2] = %d, b = %d \n", l.labels[b], l.labels[b + 1], l.labels[b + 2], b);
+        //printf(" same = %f, aug = %f, diff = %f, (aug > diff) = %d \n", same, aug, diff, (aug > diff));
+    }
+    *l.loss = 100 * good_contrast / (l.batch / 2);
+    printf(" Contrast accuracy = %f %% \n", *l.loss);
+    */
+
+    /*
+    // precalculate P_contrastive
+    for (b = 0; b < l.batch; ++b) {
+        int b2;
+        for (b2 = 0; b2 < l.batch; ++b2) {
+            if (b != b2) {
+                const float P = P_constrastive(b, b2, l.labels, l.batch, z, l.embedding_size, l.temperature, l.cos_sim);
+                l.p_constrastive[b*l.batch + b2] = P;
+                if (P > 1 || P < -1) {
+                    printf(" p = %f, ", P);
+                }
+            }
+        }
+    }
+    */
+
+
+    // number of pair records actually written
+    const size_t contr_size = contrast_p_index;
+
+    if (l.detection) {
+#ifdef GPU
+        // the device scratch buffer was sized for (max_boxes*batch)^2 records
+        const int max_contr_size = (l.max_boxes*l.batch)*(l.max_boxes*l.batch);
+        if (max_contr_size < contr_size) {
+            // NOTE(review): contr_size is a size_t but is printed with %d.
+            printf(" Error: too large number of bboxes: contr_size = %d > max_contr_size  = %d \n", contr_size, max_contr_size);
+            error("Error!", DARKNET_LOC);
+        }
+        int *labels = NULL;
+        // P is computed on the device: upload the records, run the kernel
+        // wrapper, download the records again; skipped for 2 or fewer records
+        if (contr_size > 2) {
+            cuda_push_array((float *)l.contrast_p_gpu, (float *)contrast_p, contr_size * sizeof(contrastive_params) / 4);
+            P_constrastive_f_det_gpu(labels, l.embedding_size, l.temperature, l.contrast_p_gpu, contr_size);
+            cuda_pull_array((float *)l.contrast_p_gpu, (float *)contrast_p, contr_size * sizeof(contrastive_params) / 4);
+        }
+#else   // GPU
+        int k;
+        //#pragma omp parallel for
+        for (k = 0; k < contr_size; ++k) {
+            contrast_p[k].P = P_constrastive_f_det(k, l.labels, z, l.embedding_size, l.temperature, contrast_p, contr_size);
+        }
+#endif  // GPU
+    }
+    else {
+        // precalculate P-contrastive
+        // Classifier mode: same pair enumeration as above. Every `l.detection`
+        // test inside this branch is false, so only the P_constrastive() path
+        // is taken.
+        for (b = 0; b < l.batch; ++b) {
+            for (n = 0; n < l.n; ++n) {
+                for (h = 0; h < l.h; ++h) {
+                    for (w = 0; w < l.w; ++w)
+                    {
+                        const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                        if (l.labels[z_index] < 0) continue;
+
+                        for (b2 = 0; b2 < l.batch; ++b2) {
+                            for (n2 = 0; n2 < l.n; ++n2) {
+                                for (h2 = 0; h2 < l.h; ++h2) {
+                                    for (w2 = 0; w2 < l.w; ++w2)
+                                    {
+                                        const int z_index2 = b2*l.n*l.h*l.w + n2*l.h*l.w + h2*l.w + w2;
+                                        if (l.labels[z_index2] < 0) continue;
+                                        if (z_index == z_index2) continue;
+                                        if (l.detection)
+                                            if (l.class_ids[z_index] != l.class_ids[z_index2]) continue;
+
+                                        const int time_step_i = b / mini_batch;
+                                        const int time_step_j = b2 / mini_batch;
+                                        if (time_step_i != time_step_j) continue;
+
+                                        const size_t step = l.batch*l.n*l.h*l.w;
+
+                                        float P = -10;
+                                        if (l.detection) {
+                                            P = P_constrastive_f(z_index, z_index2, l.labels, z, l.embedding_size, l.temperature, contrast_p, contr_size);
+                                        }
+                                        else {
+                                            P = P_constrastive(z_index, z_index2, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.exp_cos_sim);
+                                            l.p_constrastive[z_index*step + z_index2] = P;
+                                        }
+
+                                        // mirror P into the matching pair record
+                                        // (linear search over all records)
+                                        int q;
+                                        for (q = 0; q < contr_size; ++q)
+                                            if (contrast_p[q].i == z_index && contrast_p[q].j == z_index2) {
+                                                contrast_p[q].P = P;
+                                                break;
+                                            }
+
+                                        //if (P > 1 || P < -1) {
+                                        //    printf(" p = %f, z_index = %d, z_index2 = %d ", P, z_index, z_index2);
+                                        //}
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+
+    // calc deltas
+    // The batch loop is parallelised with OpenMP; each position passes its own
+    // start offset (delta_index) into l.delta to the gradient helpers.
+    int bd = 0;
+    #pragma omp parallel for
+    for (bd = 0; bd < l.batch; ++bd) {
+        for (int nd = 0; nd < l.n; ++nd) {
+            for (int hd = 0; hd < l.h; ++hd) {
+                for (int wd = 0; wd < l.w; ++wd)
+                {
+                    const int z_index = bd*l.n*l.h*l.w + nd*l.h*l.w + hd*l.w + wd;
+                    const size_t step = l.batch*l.n*l.h*l.w;
+                    if (l.labels[z_index] < 0) continue;
+
+                    // offset of this position's first embedding component inside
+                    // l.delta; wh is passed on to the gradient helpers
+                    // (presumably as the stride between components - confirm)
+                    const int delta_index = bd*l.embedding_size*l.n*l.h*l.w + nd*l.embedding_size*l.h*l.w + hd*l.w + wd;
+                    const int wh = l.w*l.h;
+
+                    if (l.detection) {
+                        // detector
+
+                        // positive
+                        grad_contrastive_loss_positive_f(z_index, l.class_ids, l.labels, step, z, l.embedding_size, l.temperature, l.delta + delta_index, wh, contrast_p, contr_size);
+
+                        // negative
+                        grad_contrastive_loss_negative_f(z_index, l.class_ids, l.labels, step, z, l.embedding_size, l.temperature, l.delta + delta_index, wh, contrast_p, contr_size, l.contrastive_neg_max);
+                    }
+                    else {
+                        // classifier
+
+                        // positive
+                        grad_contrastive_loss_positive(z_index, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.p_constrastive, l.delta + delta_index, wh);
+
+                        // negative
+                        grad_contrastive_loss_negative(z_index, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.p_constrastive, l.delta + delta_index, wh);
+                    }
+
+                }
+            }
+        }
+    }
+
+    // scale all gradients by the layer normalizer, then clamp each one to
+    // [-l.max_delta, l.max_delta]
+    scal_cpu(l.inputs * l.batch, l.cls_normalizer, l.delta, 1);
+
+    for (i = 0; i < l.inputs * l.batch; ++i) {
+        l.delta[i] = clip_value(l.delta[i], l.max_delta);
+    }
+
+    // reported cost: square of mag_array() over the final delta buffer
+    *(l.cost) = pow(mag_array(l.delta, l.inputs * l.batch), 2);
+    if (state.net.adversarial) {
+        printf(" adversarial contrastive loss = %f \n\n", *(l.cost));
+    }
+    else {
+        printf(" contrastive loss = %f \n\n", *(l.cost));
+    }
+
+    // release the per-position embedding copies (unassigned slots are skipped)
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    //if (l.labels[z_index] < 0) continue;
+                    if (z[z_index]) free(z[z_index]);
+                }
+            }
+        }
+    }
+
+    free(contrast_p);
+    free(z);
+}
+
+// CPU backward pass of the contrastive layer: a single axpy_cpu() call with
+// a factor of 1 from l.delta into state.delta over inputs * batch elements
+// (presumably accumulating this layer's delta into the upstream one).
+void backward_contrastive_layer(contrastive_layer l, network_state state)
+{
+    const int total = l.inputs*l.batch;
+    axpy_cpu(total, 1, l.delta, 1, state.delta, 1);
+}
+
+
+#ifdef GPU
+
+// Copies the contrastive layer output (inputs * batch floats) from
+// l.output_gpu into the host buffer l.output.
+void pull_contrastive_layer_output(const contrastive_layer l)
+{
+    const int total = l.inputs*l.batch;
+    cuda_pull_array(l.output_gpu, l.output, total);
+}
+
+// Uploads the host delta buffer l.delta (inputs * batch floats) into
+// l.delta_gpu. Despite the function name, it is the delta that is pushed,
+// not the output.
+void push_contrastive_layer_output(const contrastive_layer l)
+{
+    const int total = l.inputs*l.batch;
+    cuda_push_array(l.delta_gpu, l.delta, total);
+}
+
+
+// GPU entry point for the contrastive forward pass.
+//
+// The device input is copied into l.output_gpu unconditionally. When
+// training, the actual computation is delegated to the CPU implementation:
+// the input (and the truth buffer, if present) are pulled into temporary
+// host buffers, forward_contrastive_layer() is run on a host-side copy of
+// the network state, and the resulting l.delta is pushed to l.delta_gpu.
+//
+// The truth buffer holds batch * l.truths values in detection mode and
+// batch * l.classes values otherwise.
+void forward_contrastive_layer_gpu(contrastive_layer l, network_state state)
+{
+    simple_copy_ongpu(l.batch*l.inputs, state.input, l.output_gpu);
+    if (!state.train) return;
+
+    // host copy of the layer input, routed through l.output
+    float *host_input = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+    memcpy(host_input, l.output, l.batch*l.outputs * sizeof(float));
+
+    // host copy of the ground truth, when there is any
+    float *host_truth = 0;
+    if (state.truth) {
+        const int num_truth = l.detection ? l.batch*l.truths : l.batch*l.classes;
+        host_truth = (float *)xcalloc(num_truth, sizeof(float));
+        cuda_pull_array(state.truth, host_truth, num_truth);
+    }
+
+    // same state as the caller's, except that input/truth point to host memory
+    network_state host_state = state;
+    host_state.truth = host_truth;
+    host_state.input = host_input;
+
+    forward_contrastive_layer(l, host_state);
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+
+    free(host_input);
+    if (host_truth) free(host_truth);
+}
+
+// GPU backward pass of the contrastive layer: a single axpy_ongpu() call
+// from layer.delta_gpu into state.delta over batch * inputs elements, using
+// state.net.loss_scale as the factor.
+void backward_contrastive_layer_gpu(contrastive_layer layer, network_state state)
+{
+    const int total = layer.batch*layer.inputs;
+    axpy_ongpu(total, state.net.loss_scale, layer.delta_gpu, 1, state.delta, 1);
+}
+
+#endif
diff --git a/darknet-master/src/softmax_layer.h b/darknet-master/src/softmax_layer.h
new file mode 100644
index 0000000..c86997b
--- /dev/null
+++ b/darknet-master/src/softmax_layer.h
@@ -0,0 +1,39 @@
+#ifndef SOFTMAX_LAYER_H
+#define SOFTMAX_LAYER_H
+#include "layer.h"
+#include "network.h"
+
+// Both layer kinds are plain aliases of the generic `layer` struct.
+typedef layer softmax_layer;
+typedef layer contrastive_layer;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Softmax layer: construction and CPU forward/backward passes.
+void softmax_array(float *input, int n, float temp, float *output);
+softmax_layer make_softmax_layer(int batch, int inputs, int groups);
+void forward_softmax_layer(const softmax_layer l, network_state state);
+void backward_softmax_layer(const softmax_layer l, network_state state);
+
+#ifdef GPU
+// GPU counterparts, only available in GPU builds.
+void pull_softmax_layer_output(const softmax_layer l);
+void forward_softmax_layer_gpu(const softmax_layer l, network_state state);
+void backward_softmax_layer_gpu(const softmax_layer l, network_state state);
+#endif
+
+//-----------------------
+
+// Contrastive layer: construction and CPU forward/backward passes.
+contrastive_layer make_contrastive_layer(int batch, int w, int h, int n, int classes, int inputs, layer *yolo_layer);
+void forward_contrastive_layer(contrastive_layer l, network_state state);
+void backward_contrastive_layer(contrastive_layer l, network_state net);
+
+#ifdef GPU
+// GPU counterparts and host<->device transfer helpers, only available in GPU builds.
+void pull_contrastive_layer_output(const contrastive_layer l);
+void push_contrastive_layer_output(const contrastive_layer l);
+void forward_contrastive_layer_gpu(contrastive_layer l, network_state state);
+void backward_contrastive_layer_gpu(contrastive_layer layer, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/super.c b/darknet-master/src/super.c
new file mode 100644
index 0000000..35e7f6c
--- /dev/null
+++ b/darknet-master/src/super.c
@@ -0,0 +1,128 @@
+#include "network.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+
+
+// Trains the network described by cfgfile on SUPER_DATA batches loaded from a fixed
+// image list, optionally starting from weightfile.
+//   cfgfile    - network configuration; its base name is used for the saved weight files
+//   weightfile - initial weights, or NULL to start from the parsed network as-is
+// Weights are written to backup/ every 1000 iterations, a rolling ".backup" every 100
+// iterations, and "_final.weights" once net.max_batches is reached.
+void train_super(char *cfgfile, char *weightfile)
+{
+    char* train_images = "data/imagenet/imagenet1k.train.list";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.scale = 4;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+    args.type = SUPER_DATA;
+
+    // Double buffering: one batch is always being loaded in the background while the
+    // previous one is used for training.
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    char buff[256];
+    //while(i*imgs < N*120){
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        if (avg_loss < 0) avg_loss = loss;   // first iteration seeds the running average
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            // snprintf: the cfg base name is caller-controlled and must not overflow buff
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%100==0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // A loader thread is always outstanding at this point: wait for it before releasing
+    // the buffer it writes to, then free everything this function allocated
+    // (same teardown sequence as train_tag).
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+// Runs the network on one image (filename) or, when filename is NULL, on image paths read
+// interactively from stdin until end of input. Each result image is saved under the name "out".
+//   cfgfile    - network configuration
+//   weightfile - weights to load, or NULL
+//   filename   - image to process, or NULL for interactive mode
+void test_super(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            // strncpy does not terminate the destination when the source fills it,
+            // so copy at most size-1 characters and terminate explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) break;       // end of input: leave the loop so the network is released
+            strtok(input, "\n");    // strip the trailing newline
+        }
+        image im = load_image_color(input, 0, 0);
+        resize_network(&net, im.w, im.h);
+        printf("%d %d\n", im.w, im.h);
+
+        float *X = im.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image(net);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        save_image(out, "out");
+
+        free_image(im);
+        if (filename) break;
+    }
+    free_network(net);
+}
+
+
+// Command-line entry point of the "super" sub-command.
+// argv[2] selects the mode, argv[3] is the cfg file, argv[4] (optional) the weights and
+// argv[5] (optional) an image file name for "test".
+void run_super(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    char *filename = 0;
+    if(argc > 4) weights = argv[4];
+    if(argc > 5) filename = argv[5];
+
+    if(strcmp(mode, "train") == 0){
+        train_super(cfg, weights);
+    }else if(strcmp(mode, "test") == 0){
+        test_super(cfg, weights, filename);
+    }
+    // "valid" appears in the usage text but has no handler; unknown modes are ignored.
+}
diff --git a/darknet-master/src/swag.c b/darknet-master/src/swag.c
new file mode 100644
index 0000000..210f03f
--- /dev/null
+++ b/darknet-master/src/swag.c
@@ -0,0 +1,87 @@
+#include "network.h"
+#include "detection_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+
+// Trains the network described by cfgfile on REGION_DATA batches loaded from a fixed
+// image list, optionally starting from weightfile.
+//   cfgfile    - network configuration; its base name is used for the saved weight files
+//   weightfile - initial weights, or NULL
+// side, classes and jitter for the data loader are taken from the last layer of the network.
+// Weights are saved every 1000 iterations, at iteration 600, and as "_final.weights" at the end.
+void train_swag(char *cfgfile, char *weightfile)
+{
+    char *train_images = "data/voc.0712.trainval";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+    layer l = net.layers[net.n - 1];
+
+    int side = l.side;
+    int classes = l.classes;
+    float jitter = l.jitter;
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.classes = classes;
+    args.jitter = jitter;
+    args.num_boxes = side;
+    args.d = &buffer;
+    args.type = REGION_DATA;
+
+    // Double buffering: the next batch is loaded in the background while training runs.
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    char buff[256];
+    //while(i*imgs < N*120){
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        if (avg_loss < 0) avg_loss = loss;   // first iteration seeds the running average
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0 || i == 600){
+            // snprintf: the cfg base name is caller-controlled and must not overflow buff
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // A loader thread is always outstanding here: join it before freeing the buffer it
+    // fills, then release everything allocated above (same teardown as train_tag).
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+// Command-line entry point of the "swag" sub-command.
+// argv[2] selects the mode (only "train" is handled), argv[3] is the cfg file and
+// argv[4] (optional) the weights file.
+void run_swag(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    if(argc > 4) weights = argv[4];
+
+    if(strcmp(mode, "train") == 0){
+        train_swag(cfg, weights);
+    }
+}
diff --git a/darknet-master/src/tag.c b/darknet-master/src/tag.c
new file mode 100644
index 0000000..c1e031b
--- /dev/null
+++ b/darknet-master/src/tag.c
@@ -0,0 +1,151 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+
+// Trains the network described by cfgfile on TAG_DATA batches listed in "tag/train.list".
+//   cfgfile    - network configuration; its base name is used for the saved weight files
+//   weightfile - initial weights, or NULL
+//   clear      - when non-zero, resets the seen-images and iteration counters after loading
+// Weights are saved after each completed pass over the list, a rolling ".backup" is written
+// whenever the batch counter is a multiple of 100, and the final weights are written at the end.
+// Training runs until net.max_batches is reached, or indefinitely when max_batches is 0.
+void train_tag(char *cfgfile, char *weightfile, int clear)
+{
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    char* backup_directory = "backup/";
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1024;
+    list* plist = get_paths("tag/train.list");
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int N = plist->size;
+    clock_t time;
+    pthread_t load_thread;
+    data train;
+    data buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+
+    args.min = net.w;
+    args.max = net.max_crop;
+    args.size = net.w;
+
+    args.paths = paths;
+    args.classes = net.outputs;
+    args.n = imgs;
+    args.m = N;
+    args.d = &buffer;
+    args.type = TAG_DATA;
+
+    args.angle = net.angle;
+    args.exposure = net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+
+    fprintf(stderr, "%d classes\n", net.outputs);
+
+    // Double buffering: the next batch is loaded in the background while training runs.
+    load_thread = load_data_in_thread(args);
+    int epoch = (*net.seen)/N;
+    char buff[256];
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+
+        load_thread = load_data_in_thread(args);
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        float loss = train_network(net, train);
+        if(avg_loss == -1) avg_loss = loss;   // first iteration seeds the running average
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        free_data(train);
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            // snprintf: the cfg base name is caller-controlled and must not overflow buff
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+    }
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // Wait for the outstanding loader before freeing the buffer it fills.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+// Runs the tagging network on one image (filename) or, when filename is NULL, on image
+// paths read interactively from stdin, printing up to 10 top predictions per image with
+// the labels from "data/tags.txt".
+//   cfgfile    - network configuration
+//   weightfile - weights to load, or NULL
+//   filename   - image to process, or NULL for interactive mode
+void test_tag(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+    int i = 0;
+    char **names = get_labels("data/tags.txt");
+    clock_t time;
+    int indexes[10];
+    char buff[256];
+    char *input = buff;
+    int size = net.w;
+    while(1){
+        if(filename){
+            // strncpy does not terminate the destination when the source fills it,
+            // so copy at most size-1 characters and terminate explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) break;       // end of input: leave the loop so the network is released
+            strtok(input, "\n");    // strip the trailing newline
+        }
+        image im = load_image_color(input, 0, 0);
+        image r = resize_min(im, size);
+        resize_network(&net, r.w, r.h);
+        printf("%d %d\n", r.w, r.h);
+
+        float *X = r.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        top_predictions(net, 10, indexes);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        for(i = 0; i < 10; ++i){
+            int index = indexes[i];
+            // Skip unfilled slots: with fewer than 10 outputs the tail of indexes is
+            // presumably left at -1 (as top_k does), which must not be used as an index.
+            if(index < 0) continue;
+            printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
+        }
+        // resize_min may hand back the original image; only free r when it is a separate copy.
+        if(r.data != im.data) free_image(r);
+        free_image(im);
+        if (filename) break;
+    }
+    free_network(net);
+}
+
+
+// Command-line entry point of the "tag" sub-command.
+// argv[2] selects the mode, argv[3] is the cfg file, argv[4] (optional) the weights and
+// argv[5] (optional) an image file name for "test". A "-clear" flag anywhere on the
+// command line is consumed and forwarded to training.
+void run_tag(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    // find_arg removes the flag from argv, so it has to run before the positional reads.
+    int clear = find_arg(argc, argv, "-clear");
+    char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    char *filename = 0;
+    if(argc > 4) weights = argv[4];
+    if(argc > 5) filename = argv[5];
+
+    if(strcmp(mode, "train") == 0){
+        train_tag(cfg, weights, clear);
+    }else if(strcmp(mode, "test") == 0){
+        test_tag(cfg, weights, filename);
+    }
+}
diff --git a/darknet-master/src/tree.c b/darknet-master/src/tree.c
new file mode 100644
index 0000000..8a2c231
--- /dev/null
+++ b/darknet-master/src/tree.c
@@ -0,0 +1,135 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree.h"
+#include "utils.h"
+#include "data.h"
+
+// Recomputes t->leaf from a file of names: node i becomes a leaf (1) when t->name[i]
+// matches one of the entries read from leaf_list, otherwise 0.
+//   t         - tree whose leaf flags are rewritten in place
+//   leaf_list - path of the file listing the leaf names
+// Prints the number of matched nodes to stderr.
+void change_leaves(tree *t, char *leaf_list)
+{
+    list *llist = get_paths(leaf_list);
+    char **leaves = (char **)list_to_array(llist);
+    int n = llist->size;
+    int i,j;
+    int found = 0;
+    for(i = 0; i < t->n; ++i){
+        t->leaf[i] = 0;
+        for(j = 0; j < n; ++j){
+            if (0==strcmp(t->name[i], leaves[j])){
+                t->leaf[i] = 1;
+                ++found;
+                break;
+            }
+        }
+    }
+    fprintf(stderr, "Found %d leaves.\n", found);
+
+    // The names were only needed for the comparison above; release the strings, the
+    // array and the list (same release sequence train_tag uses for its path list).
+    free_ptrs((void**)leaves, n);
+    free_list(llist);
+}
+
+// Returns the product of x[] over node c and all of its ancestors, following
+// hier->parent until a negative parent index is reached.
+float get_hierarchy_probability(float *x, tree *hier, int c)
+{
+    float prob = 1;
+    int node;
+    for(node = c; node >= 0; node = hier->parent[node]){
+        prob = prob * x[node];
+    }
+    return prob;
+}
+
+// Multiplies every prediction by the (already updated) prediction of its parent, in
+// index order, and optionally zeroes all entries that are not flagged as leaves.
+//   predictions - n values, modified in place
+//   only_leaves - when non-zero, non-leaf entries are set to 0 afterwards
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves)
+{
+    int j;
+    for(j = 0; j < n; ++j){
+        const int up = hier->parent[j];
+        if(up < 0) continue;            // root-level node: nothing to multiply in
+        predictions[j] *= predictions[up];
+    }
+    if(!only_leaves) return;
+    for(j = 0; j < n; ++j){
+        if(hier->leaf[j] == 0) predictions[j] = 0;
+    }
+}
+
+// Walks the hierarchy from group 0 downwards, at each step picking the best-scoring
+// node of the current group, for as long as the accumulated probability stays above thresh.
+//   predictions - scores, read at index node*stride
+//   thresh      - minimum accumulated probability required to descend
+// Returns the node reached: a node without children, the best node of group 0 when even
+// that fails the threshold, or otherwise the parent of the group that failed it.
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
+{
+    float prob = 1;
+    int group = 0;
+    for(;;){
+        const int offset = hier->group_offset[group];
+        const int count = hier->group_size[group];
+        float best = 0;
+        int best_i = 0;
+        int k;
+
+        // Best-scoring member of the current group (stays 0/0 if nothing is positive).
+        for(k = 0; k < count; ++k){
+            const int node = offset + k;
+            const float val = predictions[node*stride];
+            if(val > best){
+                best = val;
+                best_i = node;
+            }
+        }
+
+        if(prob*best > thresh){
+            prob = prob*best;
+            group = hier->child[best_i];
+            if(group < 0) return best_i;    // no children: final answer
+            continue;
+        }
+        if(group == 0) return best_i;
+        return hier->parent[offset];
+    }
+}
+
+// Reads a hierarchy description from filename and returns it as a heap-allocated tree.
+// Each line holds a node name followed by the index of its parent (-1 or missing for a
+// root-level node). Consecutive lines with the same parent form one group.
+// Fills parent, child, name, group, group_offset, group_size, leaf, n and groups.
+// Terminates through file_error() if the file cannot be opened.
+tree *read_tree(char *filename)
+{
+    tree t = {0};
+    FILE *fp = fopen(filename, "r");
+    if(!fp) file_error(filename);
+
+    char *line;
+    int last_parent = -1;
+    int group_size = 0;
+    int groups = 0;
+    int n = 0;
+    while((line=fgetl(fp)) != 0){
+        char* id = (char*)xcalloc(256, sizeof(char));
+        int parent = -1;
+        // Width-limited so an over-long name cannot overflow the 256-byte id buffer.
+        sscanf(line, "%255s %d", id, &parent);
+        free(line);     // fgetl hands over ownership of the line buffer
+        t.parent = (int*)xrealloc(t.parent, (n + 1) * sizeof(int));
+        t.parent[n] = parent;
+
+        // child[i] is the group holding i's children, or -1 while none has been seen;
+        // hierarchy_top_prediction relies on this array being present.
+        t.child = (int*)xrealloc(t.child, (n + 1) * sizeof(int));
+        t.child[n] = -1;
+
+        t.name = (char**)xrealloc(t.name, (n + 1) * sizeof(char*));
+        t.name[n] = id;
+        if(parent != last_parent){
+            // Parent changed: close the group that was being accumulated.
+            ++groups;
+            t.group_offset = (int*)xrealloc(t.group_offset, groups * sizeof(int));
+            t.group_offset[groups - 1] = n - group_size;
+            t.group_size = (int*)xrealloc(t.group_size, groups * sizeof(int));
+            t.group_size[groups - 1] = group_size;
+            group_size = 0;
+            last_parent = parent;
+        }
+        t.group = (int*)xrealloc(t.group, (n + 1) * sizeof(int));
+        t.group[n] = groups;
+        // Only parents that have already been read can be linked to their child group.
+        if(parent >= 0 && parent < n) t.child[parent] = groups;
+        ++n;
+        ++group_size;
+    }
+    // Close the last open group.
+    ++groups;
+    t.group_offset = (int*)xrealloc(t.group_offset, groups * sizeof(int));
+    t.group_offset[groups - 1] = n - group_size;
+    t.group_size = (int*)xrealloc(t.group_size, groups * sizeof(int));
+    t.group_size[groups - 1] = group_size;
+    t.n = n;
+    t.groups = groups;
+    // Every node starts as a leaf; any node that is somebody's parent is then cleared.
+    t.leaf = (int*)xcalloc(n, sizeof(int));
+    int i;
+    for(i = 0; i < n; ++i) t.leaf[i] = 1;
+    for(i = 0; i < n; ++i) if(t.parent[i] >= 0) t.leaf[t.parent[i]] = 0;
+
+    fclose(fp);
+    tree* tree_ptr = (tree*)xcalloc(1, sizeof(tree));
+    *tree_ptr = t;
+    //error(0);
+    return tree_ptr;
+}
diff --git a/darknet-master/src/tree.h b/darknet-master/src/tree.h
new file mode 100644
index 0000000..c5a09aa
--- /dev/null
+++ b/darknet-master/src/tree.h
@@ -0,0 +1,30 @@
+#ifndef TREE_H
+#define TREE_H
+#include "darknet.h"
+
+// NOTE(review): the tree struct and read_tree() are not declared here any more; they are
+// presumably provided by darknet.h. The commented-out copy below is kept for reference.
+//typedef struct{
+//    int *leaf;
+//    int n;
+//    int *parent;
+//    int *child;
+//    int *group;
+//    char **name;
+//
+//    int groups;
+//    int *group_size;
+//    int *group_offset;
+//} tree;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+//tree *read_tree(char *filename);
+// Descends the hierarchy while the accumulated probability exceeds thresh; returns a node index.
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride);
+// Multiplies each prediction by its parent's prediction; optionally zeroes non-leaf entries.
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves);
+// Rewrites the leaf flags of t from the names listed in the file leaf_list.
+void change_leaves(tree *t, char *leaf_list);
+// Product of x[] over node c and all of its ancestors.
+float get_hierarchy_probability(float *x, tree *hier, int c);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/upsample_layer.c b/darknet-master/src/upsample_layer.c
new file mode 100644
index 0000000..778f5b4
--- /dev/null
+++ b/darknet-master/src/upsample_layer.c
@@ -0,0 +1,107 @@
+#include "upsample_layer.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+
+#include <stdio.h>
+
+// Builds an UPSAMPLE layer for a w x h x c input.
+//   stride > 0 : output is (w*stride) x (h*stride) x c
+//   stride < 0 : the layer is marked as reversed and the output is (w/|stride|) x (h/|stride|) x c
+// Allocates zeroed delta/output buffers (plus GPU copies in GPU builds) and prints a
+// one-line summary of the layer to stderr.
+layer make_upsample_layer(int batch, int w, int h, int c, int stride)
+{
+    layer l = { (LAYER_TYPE)0 };
+    const int downsample = (stride < 0);
+    if(downsample) stride = -stride;    // the magnitude is what gets stored and printed
+
+    l.type = UPSAMPLE;
+    l.batch = batch;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.stride = stride;
+    if(downsample){
+        l.reverse = 1;
+        l.out_w = w/stride;
+        l.out_h = h/stride;
+    }else{
+        l.out_w = w*stride;
+        l.out_h = h*stride;
+    }
+    l.out_c = c;
+    l.outputs = l.out_w*l.out_h*l.out_c;
+    l.inputs = l.w*l.h*l.c;
+
+    l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(l.outputs * batch, sizeof(float));
+
+    l.forward = forward_upsample_layer;
+    l.backward = backward_upsample_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_upsample_layer_gpu;
+    l.backward_gpu = backward_upsample_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+    #endif
+
+    if(l.reverse){
+        fprintf(stderr, "downsample              %2dx  %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    }else{
+        fprintf(stderr, "upsample                %2dx  %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    }
+    return l;
+}
+
+// Adapts an existing upsample layer to a new input size w x h: recomputes the output
+// dimensions from the stored stride/reverse flag and reallocates the delta and output
+// buffers (GPU buffers are freed and recreated in GPU builds).
+void resize_upsample_layer(layer *l, int w, int h)
+{
+    const int s = l->stride;
+
+    l->w = w;
+    l->h = h;
+    if(l->reverse){
+        l->out_w = w/s;
+        l->out_h = h/s;
+    }else{
+        l->out_w = w*s;
+        l->out_h = h*s;
+    }
+    l->outputs = l->out_w*l->out_h*l->out_c;
+    l->inputs = l->h*l->w*l->c;
+
+    l->delta = (float*)xrealloc(l->delta, l->outputs * l->batch * sizeof(float));
+    l->output = (float*)xrealloc(l->output, l->outputs * l->batch * sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+
+}
+
+// CPU forward pass: clears l.output, then runs upsample_cpu between net.input and l.output.
+// For a normal layer the input is the small image and the forward flag is 1; for a reversed
+// layer the roles of the two buffers are swapped and the forward flag is 0.
+void forward_upsample_layer(const layer l, network_state net)
+{
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+    if(!l.reverse){
+        upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output);
+        return;
+    }
+    upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input);
+}
+
+// CPU backward pass: runs upsample_cpu between l.delta and state.delta with the buffer
+// roles and the forward flag inverted relative to forward_upsample_layer.
+void backward_upsample_layer(const layer l, network_state state)
+{
+    if(!l.reverse){
+        upsample_cpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta);
+        return;
+    }
+    upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta);
+}
+
+#ifdef GPU
+// GPU forward pass: clears l.output_gpu, then runs upsample_gpu between state.input and
+// l.output_gpu. For a reversed layer the buffer roles are swapped and the forward flag is 0.
+void forward_upsample_layer_gpu(const layer l, network_state state)
+{
+    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    if(!l.reverse){
+        upsample_gpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu);
+        return;
+    }
+    upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, state.input);
+}
+
+// GPU backward pass: runs upsample_gpu between l.delta_gpu and state.delta with the buffer
+// roles and the forward flag inverted relative to forward_upsample_layer_gpu.
+void backward_upsample_layer_gpu(const layer l, network_state state)
+{
+    if(!l.reverse){
+        upsample_gpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu);
+        return;
+    }
+    upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta);
+}
+#endif
diff --git a/darknet-master/src/upsample_layer.h b/darknet-master/src/upsample_layer.h
new file mode 100644
index 0000000..4461cb1
--- /dev/null
+++ b/darknet-master/src/upsample_layer.h
@@ -0,0 +1,23 @@
+#ifndef UPSAMPLE_LAYER_H
+#define UPSAMPLE_LAYER_H
+#include "dark_cuda.h"
+#include "layer.h"
+#include "network.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Creates an upsample layer; a negative stride creates the reversed (downsampling) variant.
+layer make_upsample_layer(int batch, int w, int h, int c, int stride);
+// CPU forward and backward passes.
+void forward_upsample_layer(const layer l, network_state state);
+void backward_upsample_layer(const layer l, network_state state);
+// Recomputes output dimensions and reallocates buffers for a new input size.
+void resize_upsample_layer(layer *l, int w, int h);
+
+#ifdef GPU
+// GPU forward and backward passes, only available in GPU builds.
+void forward_upsample_layer_gpu(const layer l, network_state state);
+void backward_upsample_layer_gpu(const layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/utils.c b/darknet-master/src/utils.c
new file mode 100644
index 0000000..2ad33d5
--- /dev/null
+++ b/darknet-master/src/utils.c
@@ -0,0 +1,1102 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "utils.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include "darkunistd.h"
+#ifdef WIN32
+#include "gettimeofday.h"
+#else
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <execinfo.h>
+#endif
+
+
+#ifndef USE_CMAKE_LIBS
+#pragma warning(disable: 4996)
+#endif
+
+// malloc wrapper that reports a failed allocation through malloc_error(), passing on
+// the call site (filename, funcname, line) it was given.
+void *xmalloc_location(const size_t size, const char * const filename, const char * const funcname, const int line) {
+    void *mem = malloc(size);
+    if(mem) return mem;
+    malloc_error(size, filename, funcname, line);
+    return mem;
+}
+
+// calloc wrapper that reports a failed allocation through calloc_error(), passing on the
+// total byte count and the call site (filename, funcname, line) it was given.
+void *xcalloc_location(const size_t nmemb, const size_t size, const char * const filename, const char * const funcname, const int line) {
+    void *mem = calloc(nmemb, size);
+    if(mem) return mem;
+    calloc_error(nmemb * size, filename, funcname, line);
+    return mem;
+}
+
+// realloc wrapper that reports a NULL result through realloc_error(), passing on the
+// call site (filename, funcname, line) it was given.
+void *xrealloc_location(void *ptr, const size_t size, const char * const filename, const char * const funcname, const int line) {
+    void *resized = realloc(ptr, size);
+    if(resized) return resized;
+    realloc_error(size, filename, funcname, line);
+    return resized;
+}
+
+// Returns the current gettimeofday() time in seconds (with microsecond fraction),
+// or 0 if gettimeofday() reports an error.
+double what_time_is_it_now()
+{
+    struct timeval now;
+    const int failed = gettimeofday(&now, NULL);
+    if (failed) return 0;
+    const double micros = (double)now.tv_usec * .000001;
+    return (double)now.tv_sec + micros;
+}
+
+// Reads one integer per line from filename and returns them as a heap-allocated array
+// (NULL when the file has no lines). The number of entries is not reported to the caller.
+// An unopenable file is reported through file_error().
+int *read_map(char *filename)
+{
+    FILE *file = fopen(filename, "r");
+    if(!file) file_error(filename);
+
+    int *map = 0;
+    int count = 0;
+    char *str;
+    for(str = fgetl(file); str; str = fgetl(file)){
+        // Grow by one slot per line and store the parsed value in the new slot.
+        map = (int*)xrealloc(map, (count + 1) * sizeof(int));
+        map[count] = atoi(str);
+        ++count;
+        free(str);
+    }
+    if (file) fclose(file);
+    return map;
+}
+
+// Shuffles an array of n elements (each `size` bytes) section by section: the array is
+// split into `sections` contiguous ranges and each range is shuffled on its own, so
+// elements never leave their section.
+void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections)
+{
+    size_t s;
+    for(s = 0; s < sections; ++s){
+        const size_t first = n*s/sections;
+        const size_t last = n*(s+1)/sections;
+        char *chunk = (char*)arr + (first*size);
+        shuffle(chunk, last - first, size);
+    }
+}
+
+// Shuffles, in place, an array of n elements of `size` bytes each, swapping every
+// position i with a randomly chosen position j >= i (random_gen() is the random source).
+//   arr  - start of the array
+//   n    - number of elements; 0 and 1 are accepted and leave the array untouched
+//   size - size of one element in bytes
+void shuffle(void *arr, size_t n, size_t size)
+{
+    // n is unsigned: without this guard n == 0 would turn the loop bound n-1 into
+    // SIZE_MAX and run far past the array (sorta_shuffle can pass empty sections).
+    if(n < 2) return;
+
+    size_t i;
+    void* swp = (void*)xcalloc(1, size);    // scratch space for one element
+    for(i = 0; i < n-1; ++i){
+        size_t j = i + random_gen()/(RAND_MAX / (n-i)+1);
+        memcpy(swp,            (char*)arr+(j*size), size);
+        memcpy((char*)arr+(j*size), (char*)arr+(i*size), size);
+        memcpy((char*)arr+(i*size), swp,          size);
+    }
+    free(swp);
+}
+
+// Removes argv[index] by shifting all following entries one slot down; the slot freed
+// at the end is set to NULL. argc itself is not changed.
+void del_arg(int argc, char **argv, int index)
+{
+    int i = index;
+    while(i < argc-1){
+        argv[i] = argv[i+1];
+        ++i;
+    }
+    argv[i] = 0;
+}
+
+// Looks for the flag `arg` in argv. When found, the first occurrence is removed from
+// argv (see del_arg) and 1 is returned; otherwise 0. NULL slots are skipped.
+int find_arg(int argc, char* argv[], char *arg)
+{
+    int i;
+    for(i = 0; i < argc; ++i) {
+        if(argv[i] && strcmp(argv[i], arg) == 0) {
+            del_arg(argc, argv, i);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Looks for "<arg> <value>" in argv and returns the value parsed with atoi().
+// The flag and its value are removed from argv. Returns `def` when the flag is absent
+// or when its value slot is empty. A flag in the last argv slot is never matched.
+int find_int_arg(int argc, char **argv, char *arg, int def)
+{
+    int i;
+    for(i = 0; i < argc-1; ++i){
+        if(!argv[i]) continue;
+        if(0==strcmp(argv[i], arg)){
+            // The value slot can be NULL after earlier del_arg() calls shifted argv;
+            // atoi(NULL) would crash, so keep the default in that case.
+            if(argv[i+1]) def = atoi(argv[i+1]);
+            del_arg(argc, argv, i);     // removes the flag...
+            del_arg(argc, argv, i);     // ...and the value that moved into its slot
+            break;
+        }
+    }
+    return def;
+}
+
+// Looks for "<arg> <value>" in argv and returns the value parsed with atof().
+// The flag and its value are removed from argv. Returns `def` when the flag is absent
+// or when its value slot is empty. A flag in the last argv slot is never matched.
+float find_float_arg(int argc, char **argv, char *arg, float def)
+{
+    int i;
+    for(i = 0; i < argc-1; ++i){
+        if(!argv[i]) continue;
+        if(0==strcmp(argv[i], arg)){
+            // The value slot can be NULL after earlier del_arg() calls shifted argv;
+            // atof(NULL) would crash, so keep the default in that case.
+            if(argv[i+1]) def = atof(argv[i+1]);
+            del_arg(argc, argv, i);     // removes the flag...
+            del_arg(argc, argv, i);     // ...and the value that moved into its slot
+            break;
+        }
+    }
+    return def;
+}
+
+// Looks for "<arg> <value>" in argv and returns a pointer to the value string (not a copy).
+// The flag and its value are removed from argv. Returns `def` when the flag is absent
+// or when its value slot is empty. A flag in the last argv slot is never matched.
+char *find_char_arg(int argc, char **argv, char *arg, char *def)
+{
+    int i;
+    for(i = 0; i < argc-1; ++i){
+        if(!argv[i]) continue;
+        if(0==strcmp(argv[i], arg)){
+            // The value slot can be NULL after earlier del_arg() calls shifted argv;
+            // keep the caller's default rather than handing back NULL.
+            if(argv[i+1]) def = argv[i+1];
+            del_arg(argc, argv, i);     // removes the flag...
+            del_arg(argc, argv, i);     // ...and the value that moved into its slot
+            break;
+        }
+    }
+    return def;
+}
+
+
+// Returns a newly allocated copy of the base name of cfgfile: everything up to the last
+// '/' is dropped, then everything up to the last '\\' of what remains, and the result is
+// cut at its first '.'. The caller owns the returned string.
+char *basecfg(char *cfgfile)
+{
+    char *name = cfgfile;
+    char *sep;
+    while((sep = strchr(name, '/')) != 0) name = sep + 1;
+    while((sep = strchr(name, '\\')) != 0) name = sep + 1;
+
+    name = copy_string(name);
+    char *dot = strchr(name, '.');
+    if(dot) *dot = 0;
+    return name;
+}
+
+// Maps an ASCII character to a number: characters below ':' (code 58) are offset from
+// '0', all others from 'a' - 10, so '0'..'9' give 0..9 and 'a'..'z' give 10..35.
+int alphanum_to_int(char c)
+{
+    if (c < 58) return c - 48;
+    return c - 87;
+}
+// Inverse of alphanum_to_int with one extra symbol: 0..9 map to '0'..'9', 10..35 to
+// 'a'..'z', and 36 maps to '.'.
+char int_to_alphanum(int i)
+{
+    if (i == 36) return '.';
+    if (i < 10) return i + 48;
+    return i + 87;
+}
+
+// Prints the M x N row-major matrix A to stdout, one row per line prefixed with its
+// 1-based row number, followed by an empty line.
+void pm(int M, int N, float *A)
+{
+    int row;
+    for(row = 0; row < M; ++row){
+        const float *values = A + row*N;
+        int col;
+        printf("%d ", row+1);
+        for(col = 0; col < N; ++col){
+            printf("%2.4f, ", values[col]);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+// Writes to `output` a copy of `str` in which the first occurrence of `orig` is replaced
+// by `rep`; when `orig` does not occur, `str` is copied unchanged.
+//   str    - input string
+//   orig   - substring to look for
+//   rep    - replacement text
+//   output - destination; may be the same buffer as str. The caller must make sure it is
+//            large enough for the result (its size is not known here).
+void find_replace(const char* str, char* orig, char* rep, char* output)
+{
+    // Work on a private copy sized to the input (no fixed 8192-byte limit), because
+    // output is allowed to alias str.
+    const size_t len = strlen(str);
+    char* buffer = (char*)xcalloc(len + 1, sizeof(char));
+    memcpy(buffer, str, len);
+
+    char *p = strstr(buffer, orig);
+    if (!p) {  // 'orig' is not in 'str': plain copy
+        sprintf(output, "%s", buffer);
+        free(buffer);
+        return;
+    }
+
+    *p = '\0';  // buffer now ends right before the match
+
+    sprintf(output, "%s%s%s", buffer, rep, p + strlen(orig));
+    free(buffer);
+}
+
+// Removes leading and trailing spaces and tabs from str, in place.
+// Empty and all-whitespace strings are handled and become the empty string; there is
+// no limit on the string length.
+void trim(char *str)
+{
+    // Skip leading blanks.
+    char *start = str;
+    while (*start == ' ' || *start == '\t') ++start;
+
+    // Shrink the length while the last kept character is a blank. Counting a length
+    // (instead of moving a pointer) cannot step in front of the string.
+    size_t len = strlen(start);
+    while (len > 0 && (start[len - 1] == ' ' || start[len - 1] == '\t')) --len;
+
+    // Source and destination overlap, so memmove is required.
+    if (start != str) memmove(str, start, len);
+    str[len] = '\0';
+}
+
+// Returns a pointer to the last occurrence of needle in haystack, or NULL when needle
+// does not occur. For an empty needle the result points at the terminating '\0'.
+char *strlaststr(char *haystack, char *needle)
+{
+    char *last = NULL;
+    char *p = strstr(haystack, needle);
+    while (p != NULL)
+    {
+        last = p;
+        // A match at the terminator (only possible with an empty needle) is the end of
+        // the string: searching from p + 1 would read past it.
+        if (*p == '\0') break;
+        p = strstr(p + 1, needle);
+    }
+    return last;
+}
+
+// Writes to `output` a copy of `str` in which a trailing `orig` is replaced by `rep`.
+// The replacement only happens when the last occurrence of `orig` sits exactly at the
+// end of `str`; otherwise `str` is copied unchanged.
+//   output - destination; may be the same buffer as str. The caller must make sure it is
+//            large enough for the result (its size is not known here).
+void find_replace_extension(char *str, char *orig, char *rep, char *output)
+{
+    // Private copy sized to the input (no fixed 8192-byte limit), because output is
+    // allowed to alias str.
+    const size_t len = strlen(str);
+    char* buffer = (char*)xcalloc(len + 1, sizeof(char));
+    memcpy(buffer, str, len);
+
+    char *p = strlaststr(buffer, orig);
+    // p is checked before it is used: no pointer arithmetic on a NULL result.
+    // strlen(p) is the number of characters from the match to the end of the string.
+    if (!p || strlen(p) != strlen(orig)) {  // 'orig' missing, or not at the very end
+        sprintf(output, "%s", buffer);
+        free(buffer);
+        return;
+    }
+
+    *p = '\0';  // cut the old extension off
+    sprintf(output, "%s%s%s", buffer, rep, p + strlen(orig));
+    free(buffer);
+}
+
+// Derives the label-file path that belongs to an image path.
+//   input_path  - path of the image file
+//   output_path - receives the label path; it is rewritten in place by every step, so it
+//                 must be large enough for the longest intermediate result
+// Steps: (1) known image directory components are replaced by their label counterparts
+// (each pattern at most once, first occurrence), (2) surrounding blanks are trimmed,
+// (3) a known image extension at the very end is replaced by ".txt".
+// A warning is printed to stderr when the result does not end in ".txt" or is too short.
+void replace_image_to_label(const char* input_path, char* output_path)
+{
+    // Directory substitutions, forward-slash and backslash variants.
+    find_replace(input_path, "/images/train2017/", "/labels/train2017/", output_path);    // COCO
+    find_replace(output_path, "/images/val2017/", "/labels/val2017/", output_path);        // COCO
+    find_replace(output_path, "/JPEGImages/", "/labels/", output_path);    // PascalVOC
+    find_replace(output_path, "\\images\\train2017\\", "\\labels\\train2017\\", output_path);    // COCO
+    find_replace(output_path, "\\images\\val2017\\", "\\labels\\val2017\\", output_path);        // COCO
+
+    find_replace(output_path, "\\images\\train2014\\", "\\labels\\train2014\\", output_path);    // COCO
+    find_replace(output_path, "\\images\\val2014\\", "\\labels\\val2014\\", output_path);        // COCO
+    find_replace(output_path, "/images/train2014/", "/labels/train2014/", output_path);    // COCO
+    find_replace(output_path, "/images/val2014/", "/labels/val2014/", output_path);        // COCO
+
+    find_replace(output_path, "\\JPEGImages\\", "\\labels\\", output_path);    // PascalVOC
+    //find_replace(output_path, "/images/", "/labels/", output_path);    // COCO
+    //find_replace(output_path, "/VOC2007/JPEGImages/", "/VOC2007/labels/", output_path);        // PascalVOC
+    //find_replace(output_path, "/VOC2012/JPEGImages/", "/VOC2012/labels/", output_path);        // PascalVOC
+
+    //find_replace(output_path, "/raw/", "/labels/", output_path);
+    // Trim before the extension check so trailing blanks do not hide the extension.
+    trim(output_path);
+
+    // replace only ext of files
+    find_replace_extension(output_path, ".jpg", ".txt", output_path);
+    find_replace_extension(output_path, ".JPG", ".txt", output_path); // error
+    find_replace_extension(output_path, ".jpeg", ".txt", output_path);
+    find_replace_extension(output_path, ".JPEG", ".txt", output_path);
+    find_replace_extension(output_path, ".png", ".txt", output_path);
+    find_replace_extension(output_path, ".PNG", ".txt", output_path);
+    find_replace_extension(output_path, ".bmp", ".txt", output_path);
+    find_replace_extension(output_path, ".BMP", ".txt", output_path);
+    find_replace_extension(output_path, ".ppm", ".txt", output_path);
+    find_replace_extension(output_path, ".PPM", ".txt", output_path);
+    find_replace_extension(output_path, ".tiff", ".txt", output_path);
+    find_replace_extension(output_path, ".TIFF", ".txt", output_path);
+
+    // Check file ends with txt:
+    if(strlen(output_path) > 4) {
+        char *output_path_ext = output_path + strlen(output_path) - 4;   // last four characters
+        if( strcmp(".txt", output_path_ext) != 0){
+            fprintf(stderr, "Failed to infer label file name (check image extension is supported): %s \n", output_path);
+        }
+    }else{
+        fprintf(stderr, "Label file name is too short: %s \n", output_path);
+    }
+}
+
+// Converts a clock() tick count to seconds (single precision).
+float sec(clock_t clocks)
+{
+    const float ticks = (float)clocks;
+    return ticks/CLOCKS_PER_SEC;
+}
+
+// Finds the k largest values of a[0..n-1] and stores their indices in index[0..k-1],
+// largest first. Slots that cannot be filled (n < k) are left at -1.
+void top_k(float *a, int n, int k, int *index)
+{
+    int slot, item;
+    for(slot = 0; slot < k; ++slot) index[slot] = -1;
+
+    for(item = 0; item < n; ++item){
+        // Insertion step: the candidate takes every slot it beats (or that is empty)
+        // and the displaced entry becomes the candidate for the slots behind it.
+        int cand = item;
+        for(slot = 0; slot < k; ++slot){
+            const int held = index[slot];
+            if(held >= 0 && !(a[cand] > a[held])) continue;
+            index[slot] = cand;
+            cand = held;
+        }
+    }
+}
+
+
+/*
+ * Print the current call stack to stderr.
+ * Compiled to an empty function when WIN32 is defined.
+ */
+void log_backtrace()
+{
+#ifndef WIN32
+    void * buffer[50];
+    /* backtrace() expects the capacity in pointers, not in bytes */
+    const int capacity = (int)(sizeof(buffer) / sizeof(buffer[0]));
+    int count = backtrace(buffer, capacity);
+    char **symbols = backtrace_symbols(buffer, count);
+
+    fprintf(stderr, "backtrace (%d entries)\n", count);
+
+    /* backtrace_symbols() returns NULL when it cannot allocate the table */
+    if (symbols == NULL) return;
+
+    for (int idx = 0; idx < count; idx ++)
+    {
+        fprintf(stderr, "%d/%d: %s\n", idx + 1, count, symbols[idx]);
+    }
+
+    /* only the table itself is malloc'ed; the strings live inside it */
+    free(symbols);
+#endif
+}
+
+/*
+ * Report a fatal error and terminate the process.
+ *
+ * Prints the source location (filename, funcname, line) to stderr, then
+ * perror(msg) (which appends the text for the current errno), then a
+ * backtrace, and finally exits with EXIT_FAILURE.  Does not return.
+ */
+void error(const char * const msg, const char * const filename, const char * const funcname, const int line)
+{
+    fprintf(stderr, "Darknet error location: %s, %s(), line #%d\n", filename, funcname, line);
+    perror(msg);
+    log_backtrace();
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Format a byte count as a short human-readable string using IEC units
+ * ("bytes", "KiB", "MiB", "GiB").
+ *
+ * The result points to a static buffer that is overwritten by the next
+ * call, so it must be consumed (or copied) before calling again.
+ */
+const char * size_to_IEC_string(const size_t size)
+{
+    /* double keeps byte counts exact far beyond what float can represent */
+    const double bytes = (double)size;
+    const double KiB = 1024.0;
+    const double MiB = 1024.0 * KiB;
+    const double GiB = 1024.0 * MiB;
+
+    static char buffer[25];
+    /* %zu is the matching conversion for size_t */
+    if (bytes < KiB)         snprintf(buffer, sizeof(buffer), "%zu bytes", size);
+    else if (bytes < MiB)    snprintf(buffer, sizeof(buffer), "%1.1f KiB", bytes / KiB);
+    else if (bytes < GiB)    snprintf(buffer, sizeof(buffer), "%1.1f MiB", bytes / MiB);
+    else                     snprintf(buffer, sizeof(buffer), "%1.1f GiB", bytes / GiB);
+
+    return buffer;
+}
+
+void malloc_error(const size_t size, const char * const filename, const char * const funcname, const int line)
+{
+    fprintf(stderr, "Failed to malloc %s\n", size_to_IEC_string(size));
+    error("Malloc error - possibly out of CPU RAM", filename, funcname, line);
+}
+
+void calloc_error(const size_t size, const char * const filename, const char * const funcname, const int line)
+{
+    fprintf(stderr, "Failed to calloc %s\n", size_to_IEC_string(size));
+    error("Calloc error - possibly out of CPU RAM", filename, funcname, line);
+}
+
+void realloc_error(const size_t size, const char * const filename, const char * const funcname, const int line)
+{
+    fprintf(stderr, "Failed to realloc %s\n", size_to_IEC_string(size));
+    error("Realloc error - possibly out of CPU RAM", filename, funcname, line);
+}
+
+/*
+ * Report that file `s` could not be opened and terminate the process
+ * with EXIT_FAILURE.  Does not return.
+ */
+void file_error(const char * const s)
+{
+    fprintf(stderr, "Couldn't open file: %s\n", s);
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Split `s` in place on every occurrence of `delim`.
+ *
+ * Each delimiter is overwritten with '\0' and a pointer to the start of
+ * every piece is inserted into a new list.  The pieces point into `s`;
+ * nothing is copied.
+ */
+list *split_str(char *s, char delim)
+{
+    list *pieces = make_list();
+    char *cursor = s;
+    /* fix the end before any delimiter is overwritten */
+    const char * const end = s + strlen(s);
+
+    list_insert(pieces, s);
+    for(; cursor < end; ++cursor){
+        if(*cursor != delim) continue;
+        *cursor = '\0';
+        list_insert(pieces, cursor + 1);
+    }
+    return pieces;
+}
+
+/* Remove every space, tab, carriage return and newline from `s`, in place. */
+void strip(char *s)
+{
+    const char *src = s;
+    char *dst = s;
+    for(; *src != '\0'; ++src){
+        const char c = *src;
+        const int is_blank = (c == ' ' || c == '\t' || c == '\n' || c == '\r');
+        /* compact kept characters toward the front */
+        if(!is_blank) *dst++ = c;
+    }
+    *dst = '\0';
+}
+
+
+/* Remove every tab, carriage return and newline from `s`, in place (spaces are kept). */
+void strip_args(char *s)
+{
+    const char *src = s;
+    char *dst = s;
+    for (; *src != '\0'; ++src) {
+        const char c = *src;
+        const int is_ctrl = (c == '\t' || c == '\n' || c == '\r');
+        /* compact kept characters toward the front */
+        if (!is_ctrl) *dst++ = c;
+    }
+    *dst = '\0';
+}
+
+/* Remove every occurrence of the character `bad` from `s`, in place. */
+void strip_char(char *s, char bad)
+{
+    const char *src = s;
+    char *dst = s;
+    for(; *src != '\0'; ++src){
+        if(*src == bad) continue;
+        *dst++ = *src;
+    }
+    *dst = '\0';
+}
+
+/* Free each of the first n pointers stored in `ptrs`, then free the array itself. */
+void free_ptrs(void **ptrs, int n)
+{
+    int i = 0;
+    while(i < n){
+        free(ptrs[i]);
+        ++i;
+    }
+    free(ptrs);
+}
+
+/*
+ * Read one line of arbitrary length from `fp`.
+ *
+ * Returns a heap-allocated string without its trailing "\n" or "\r\n",
+ * or 0 at end of file / when nothing could be read.  The caller owns
+ * the returned buffer and must free() it.
+ */
+char *fgetl(FILE *fp)
+{
+    if(feof(fp)) return 0;
+    size_t size = 512;
+    char* line = (char*)xmalloc(size * sizeof(char));
+    if(!fgets(line, (int)size, fp)){
+        free(line);
+        return 0;
+    }
+
+    size_t curr = strlen(line);
+
+    /* curr > 0 guards line[curr-1]: a line starting with a NUL byte has length 0 */
+    while(curr > 0 && (line[curr-1] != '\n') && !feof(fp)){
+        if(curr == size-1){
+            size *= 2;
+            line = (char*)xrealloc(line, size * sizeof(char));
+        }
+        size_t readsize = size-curr;
+        /* fgets() takes an int length */
+        if(readsize > INT_MAX) readsize = INT_MAX-1;
+        /* stop on a read error instead of looping forever on stale data */
+        if(!fgets(&line[curr], (int)readsize, fp)) break;
+        curr = strlen(line);
+    }
+
+    /* drop the line terminator: "\n", and the "\r" of a DOS "\r\n" pair */
+    if(curr >= 1 && line[curr-1] == 0x0a){
+        line[--curr] = 0x00;
+        if(curr >= 1 && line[curr-1] == 0x0d) line[--curr] = 0x00;
+    }
+
+    return line;
+}
+
+/*
+ * Read one int from descriptor `fd` with a single read() call.
+ * Returns the value read, or -1 when read() reports end of file or an error.
+ */
+int read_int(int fd)
+{
+    int value = 0;
+    const int got = read(fd, &value, sizeof(value));
+    return (got > 0) ? value : -1;
+}
+
+/*
+ * Write the int `n` to descriptor `fd` with a single write() call.
+ * Calls error() if write() reports failure or writes nothing.
+ */
+void write_int(int fd, int n)
+{
+    int next = write(fd, &n, sizeof(int));
+    /* message names the operation that actually failed */
+    if(next <= 0) error("write failed", DARKNET_LOC);
+}
+
+/*
+ * Read exactly `bytes` bytes from `fd` into `buffer`, retrying on short reads.
+ * Returns 0 on success, 1 if read() hits end of file or an error first.
+ */
+int read_all_fail(int fd, char *buffer, size_t bytes)
+{
+    size_t done;
+    for(done = 0; done < bytes; ){
+        const int got = read(fd, buffer + done, bytes - done);
+        if(got <= 0) return 1;
+        done += got;
+    }
+    return 0;
+}
+
+/*
+ * Write exactly `bytes` bytes from `buffer` to `fd`, retrying on short writes.
+ * Returns 0 on success, 1 if write() fails or makes no progress.
+ */
+int write_all_fail(int fd, char *buffer, size_t bytes)
+{
+    size_t n = 0;
+    while(n < bytes){
+        /* signed, so the -1 error return is caught by the <= 0 test */
+        int next = write(fd, buffer + n, bytes-n);
+        if(next <= 0) return 1;
+        n += next;
+    }
+    return 0;
+}
+
+/*
+ * Read exactly `bytes` bytes from `fd` into `buffer`, retrying on short reads.
+ * Calls error() if read() hits end of file or an error first.
+ */
+void read_all(int fd, char *buffer, size_t bytes)
+{
+    size_t done;
+    for(done = 0; done < bytes; ){
+        const int got = read(fd, buffer + done, bytes - done);
+        if(got <= 0) error("read failed", DARKNET_LOC);
+        done += got;
+    }
+}
+
+/*
+ * Write exactly `bytes` bytes from `buffer` to `fd`, retrying on short writes.
+ * Calls error() if write() fails or makes no progress.
+ */
+void write_all(int fd, char *buffer, size_t bytes)
+{
+    size_t n = 0;
+    while(n < bytes){
+        /* signed, so the -1 error return is caught by the <= 0 test */
+        int next = write(fd, buffer + n, bytes-n);
+        if(next <= 0) error("write failed", DARKNET_LOC);
+        n += next;
+    }
+}
+
+
+/*
+ * Return a newly allocated copy of `s`, or NULL when `s` is NULL.
+ * The caller owns the returned buffer.
+ */
+char *copy_string(char *s)
+{
+    if(s == NULL) return NULL;
+
+    /* +1 so the terminating '\0' is allocated and copied too */
+    const size_t bytes = strlen(s) + 1;
+    char *dup = (char*)xmalloc(bytes);
+    strncpy(dup, s, bytes);
+    return dup;
+}
+
+/*
+ * Split a CSV line into a list of newly allocated field strings.
+ *
+ * Commas inside double quotes do not separate fields; the quote
+ * characters themselves are kept in the field text.  `line` is modified
+ * in place (separating commas are overwritten with '\0').
+ */
+list *parse_csv_line(char *line)
+{
+    list *fields = make_list();
+    char *start = line;   /* beginning of the field being scanned */
+    char *cur = line;
+    int quoted = 0;       /* non-zero while inside a "..." section */
+
+    while(*cur != '\0'){
+        if(*cur == '"'){
+            quoted = !quoted;
+        }else if(*cur == ',' && !quoted){
+            *cur = '\0';
+            list_insert(fields, copy_string(start));
+            start = cur + 1;
+        }
+        ++cur;
+    }
+    /* the last field has no trailing comma */
+    list_insert(fields, copy_string(start));
+    return fields;
+}
+
+/*
+ * Count the comma-separated fields in `line`: one more than the number
+ * of commas, so an empty string counts as one field.
+ */
+int count_fields(char *line)
+{
+    int fields = 1;
+    const char *p;
+    for(p = line; *p != '\0'; ++p){
+        if(*p == ',') ++fields;
+    }
+    return fields;
+}
+
+/*
+ * Parse up to `n` comma-separated numeric fields from `line`.
+ *
+ * Returns a newly allocated array of `n` floats.  Empty or malformed
+ * fields are stored as NaN; fields that were not present stay 0.
+ * `line` is modified in place (separators are overwritten with '\0').
+ * Fields beyond the first `n` are ignored.
+ */
+float *parse_fields(char *line, int n)
+{
+    float* field = (float*)xcalloc(n, sizeof(float));
+    char *c, *p, *end;
+    int count = 0;
+    int done = 0;
+    /* count < n keeps writes inside the n-element array */
+    for(c = line, p = line; !done && count < n; ++c){
+        done = (*c == '\0');
+        if(*c == ',' || done){
+            *c = '\0';
+            field[count] = strtod(p, &end);
+            /* empty field */
+            if(p == c) field[count] = nan("");
+            /* trailing junk, except a lone '\r' left by DOS line endings */
+            if(end != c && (end != c-1 || *end != '\r')) field[count] = nan(""); //DOS file formats!
+            p = c+1;
+            ++count;
+        }
+    }
+    return field;
+}
+
+/* Return the sum of a[0..n-1], accumulated in single precision. */
+float sum_array(float *a, int n)
+{
+    float total = 0;
+    int i = 0;
+    while(i < n){
+        total += a[i];
+        ++i;
+    }
+    return total;
+}
+
+/* Return the arithmetic mean of a[0..n-1] (no guard against n == 0). */
+float mean_array(float *a, int n)
+{
+    const float total = sum_array(a, n);
+    return total / n;
+}
+
+/*
+ * Element-wise mean of n arrays of `els` floats each:
+ * avg[i] = (a[0][i] + ... + a[n-1][i]) / n.
+ */
+void mean_arrays(float **a, int n, int els, float *avg)
+{
+    int col, row;
+    memset(avg, 0, els*sizeof(float));
+    for(col = 0; col < els; ++col){
+        /* accumulate one output element across all input arrays */
+        for(row = 0; row < n; ++row){
+            avg[col] += a[row][col];
+        }
+    }
+    for(col = 0; col < els; ++col){
+        avg[col] /= n;
+    }
+}
+
+/* Print mse_array(), mean_array() and variance_array() of a[0..n-1] on one line. */
+void print_statistics(float *a, int n)
+{
+    const float mean = mean_array(a, n);
+    const float variance = variance_array(a, n);
+    const float mse = mse_array(a, n);
+    printf("MSE: %.6f, Mean: %.6f, Variance: %.6f\n", mse, mean, variance);
+}
+
+/* Return the population variance of a[0..n-1] (sum of squared deviations divided by n). */
+float variance_array(float *a, int n)
+{
+    const float mean = mean_array(a, n);
+    float acc = 0;
+    int i;
+    for(i = 0; i < n; ++i){
+        const float dev = a[i] - mean;
+        acc += dev*dev;
+    }
+    return acc/n;
+}
+
+/* Clamp `a` to the range [min, max]; the lower bound is tested first. */
+int constrain_int(int a, int min, int max)
+{
+    return (a < min) ? min
+         : (a > max) ? max
+         : a;
+}
+
+/*
+ * Clamp `a` to the range [min, max]; the lower bound is tested first.
+ * Note the argument order: bounds come before the value.
+ */
+float constrain(float min, float max, float a)
+{
+    return (a < min) ? min
+         : (a > max) ? max
+         : a;
+}
+
+/*
+ * Euclidean distance between a and b, sampling every `sub`-th element
+ * of the first n (indices 0, sub, 2*sub, ...).
+ */
+float dist_array(float *a, float *b, int n, int sub)
+{
+    float acc = 0;
+    int i = 0;
+    while(i < n){
+        acc += pow(a[i]-b[i], 2);
+        i += sub;
+    }
+    return sqrt(acc);
+}
+
+/*
+ * Return sqrt(mean(a[i]^2)) over a[0..n-1], i.e. the root of the mean
+ * of squares (despite the "mse" name, the square root is applied).
+ */
+float mse_array(float *a, int n)
+{
+    float squares = 0;
+    int i = 0;
+    while(i < n){
+        squares += a[i]*a[i];
+        ++i;
+    }
+    return sqrt(squares/n);
+}
+
+/*
+ * Standardise a[0..n-1] in place: subtract the mean and divide by the
+ * standard deviation (no guard against a zero deviation).
+ */
+void normalize_array(float *a, int n)
+{
+    const float mu = mean_array(a,n);
+    const float sigma = sqrt(variance_array(a,n));
+    int i = 0;
+    while(i < n){
+        a[i] = (a[i] - mu)/sigma;
+        ++i;
+    }
+}
+
+/* Add the constant `s` to every element of a[0..n-1], in place. */
+void translate_array(float *a, int n, float s)
+{
+    int i = 0;
+    while(i < n){
+        a[i] += s;
+        ++i;
+    }
+}
+
+/* Return the Euclidean (L2) norm of a[0..n-1]. */
+float mag_array(float *a, int n)
+{
+    float squares = 0;
+    int i = 0;
+    while(i < n){
+        squares += a[i]*a[i];
+        ++i;
+    }
+    return sqrt(squares);
+}
+
+// Euclidean norm of a[0..n-1] that ignores every element i for which
+// indices_to_skip[i] == 1 (one int flag per element).
+float mag_array_skip(float *a, int n, int * indices_to_skip)
+{
+    float squares = 0;
+    int i;
+    for (i = 0; i < n; ++i) {
+        if (indices_to_skip[i] == 1) continue;
+        squares += a[i] * a[i];
+    }
+    return sqrt(squares);
+}
+
+/* Multiply every element of a[0..n-1] by `s`, in place. */
+void scale_array(float *a, int n, float s)
+{
+    int i = 0;
+    while(i < n){
+        a[i] *= s;
+        ++i;
+    }
+}
+
+/*
+ * Draw an index at random with probability proportional to a[i].
+ *
+ * Side effect: `a` is rescaled in place so that it sums to 1.
+ * Returns n - 1 if the running total never reaches the random draw.
+ */
+int sample_array(float *a, int n)
+{
+    const float total = sum_array(a, n);
+    scale_array(a, n, 1. / total);
+
+    /* walk the cumulative distribution until the draw is used up */
+    float remaining = rand_uniform(0, 1);
+    int i;
+    for (i = 0; i < n; ++i) {
+        remaining -= a[i];
+        if (remaining <= 0) return i;
+    }
+    return n - 1;
+}
+
+/*
+ * Variant of sample_array() that walks the array from a start offset.
+ *
+ * Side effect: `a` is rescaled in place so that it sums to 1.
+ * The offset comes from rand_int(0, 0), so it is always 0 here.
+ * NOTE(review): the loop counter i is returned, not the offset index
+ * (i + start_index) % n -- identical while the offset is 0; confirm
+ * before widening the offset range.
+ */
+int sample_array_custom(float *a, int n)
+{
+    const float total = sum_array(a, n);
+    scale_array(a, n, 1./total);
+
+    float remaining = rand_uniform(0, 1);
+    const int start_index = rand_int(0, 0);
+    int i;
+    for(i = 0; i < n; ++i){
+        const int pos = (i + start_index) % n;
+        remaining -= a[pos];
+        if (remaining <= 0) return i;
+    }
+    return n-1;
+}
+
+/*
+ * Return the index of the largest element of a[0..n-1] (the first one
+ * on ties), or -1 when n <= 0.
+ */
+int max_index(float *a, int n)
+{
+    if(n <= 0) return -1;
+
+    int best = 0;
+    int i;
+    for(i = 1; i < n; ++i){
+        if(a[i] > a[best]) best = i;
+    }
+    return best;
+}
+
+/*
+ * Pick, at random, the index of one of (up to) k large positive values
+ * of a[0..n-1].
+ *
+ * Each a[i] replaces the first of the k slots holding a smaller value
+ * (slots start at 0, so only values > 0 are ever recorded).  One of the
+ * filled slots is then chosen with rand_int().
+ *
+ * Returns -1 when n <= 0 or k <= 0.  When no element is > 0, no slot is
+ * filled and the index of the largest element is returned instead.
+ */
+int top_max_index(float *a, int n, int k)
+{
+    if (n <= 0 || k <= 0) return -1;
+    float *values = (float*)xcalloc(k, sizeof(float));
+    int *indexes = (int*)xcalloc(k, sizeof(int));
+    int i, j;
+    for (i = 0; i < n; ++i) {
+        for (j = 0; j < k; ++j) {
+            if (a[i] > values[j]) {
+                values[j] = a[i];
+                indexes[j] = i;
+                break;
+            }
+        }
+    }
+    /* filled slots always form a prefix, so `count` bounds the valid ones */
+    int count = 0;
+    for (j = 0; j < k; ++j) if (values[j] > 0) count++;
+
+    int val;
+    if (count > 0) {
+        val = indexes[rand_int(0, count - 1)];
+    } else {
+        /* nothing recorded: rand_int(0, -1) could yield -1 and index
+         * out of bounds, so fall back to a plain arg-max */
+        val = 0;
+        for (i = 1; i < n; ++i) if (a[i] > a[val]) val = i;
+    }
+    free(indexes);
+    free(values);
+    return val;
+}
+
+
+/* Return the first index i < n with a[i] == val, or -1 if there is none. */
+int int_index(int *a, int val, int n)
+{
+    int i = 0;
+    while (i < n && a[i] != val) ++i;
+    return (i < n) ? i : -1;
+}
+
+/*
+ * Return a random integer in the inclusive range between `min` and
+ * `max` (the bounds are swapped if given in reverse order), drawn from
+ * random_gen().
+ */
+int rand_int(int min, int max)
+{
+    const int lo = (max < min) ? max : min;
+    const int hi = (max < min) ? min : max;
+    return (random_gen()%(hi - lo + 1)) + lo;
+}
+
+// Box-Muller transform, see http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+// Every second call reuses the pair of values drawn by the previous call
+// (kept in function-local statics) instead of calling random_gen() again.
+float rand_normal()
+{
+    static int spare_ready = 0;
+    static double radius_sq, angle;
+
+    if(!spare_ready)
+    {
+        spare_ready = 1;
+
+        radius_sq = random_gen() / ((double) RAND_MAX);
+        /* keep log() away from zero */
+        if(radius_sq < 1e-100) radius_sq = 1e-100;
+        radius_sq = -2 * log(radius_sq);
+        angle = (random_gen() / ((double)RAND_MAX)) * 2.0 * M_PI;
+
+        return sqrt(radius_sq) * cos(angle);
+    }
+
+    spare_ready = 0;
+    return sqrt(radius_sq) * sin(angle);
+}
+
+/*
+   float rand_normal()
+   {
+   int n = 12;
+   int i;
+   float sum= 0;
+   for(i = 0; i < n; ++i) sum += (float)random_gen()/RAND_MAX;
+   return sum-n/2.;
+   }
+ */
+
+/*
+ * Build a random size_t from eight random_gen() calls, each supplying
+ * the low 8 bits for one byte position (bit offsets 56 down to 0).
+ */
+size_t rand_size_t()
+{
+    size_t value = 0;
+    int shift;
+    for(shift = 56; shift >= 0; shift -= 8){
+        value |= (size_t)(random_gen()&0xff) << shift;
+    }
+    return value;
+}
+
+/*
+ * Return a random float between `min` and `max` (swapped if given in
+ * reverse order), based on rand().  When RAND_MAX is below 65536 two
+ * rand() results are combined for the draw.
+ */
+float rand_uniform(float min, float max)
+{
+    const float lo = (max < min) ? max : min;
+    const float hi = (max < min) ? min : max;
+    const float span = hi - lo;
+
+#if (RAND_MAX < 65536)
+        int rnd = rand()*(RAND_MAX + 1) + rand();
+        return ((float)rnd / (RAND_MAX*RAND_MAX) * span) + lo;
+#else
+        return ((float)rand() / RAND_MAX * span) + lo;
+#endif
+}
+
+/*
+ * Return a random scale factor: a value drawn from
+ * rand_uniform_strong(1, s), or its reciprocal, chosen by the parity of
+ * a random_gen() draw.
+ */
+float rand_scale(float s)
+{
+    const float scale = rand_uniform_strong(1, s);
+    const int keep = random_gen()%2;
+    if(!keep) return 1./scale;
+    return scale;
+}
+
+/*
+ * Build an n x k one-hot matrix: row i is all zeros except for a 1 at
+ * column (int)a[i].  Rows and the row table are allocated with xcalloc.
+ * The column index is not range-checked against k.
+ */
+float **one_hot_encode(float *a, int n, int k)
+{
+    float **rows = (float**)xcalloc(n, sizeof(float*));
+    int r;
+    for(r = 0; r < n; ++r){
+        float *row = (float*)xcalloc(k, sizeof(float));
+        const int hot = (int)a[r];
+        rows[r] = row;
+        row[hot] = 1;
+    }
+    return rows;
+}
+
+/* generator state, advanced by every random_gen_fast() call */
+static unsigned int x = 123456789, y = 362436069, z = 521288629;
+
+// Marsaglia's xorshf96 generator: period 2^96-1
+unsigned int random_gen_fast(void)
+{
+    /* scramble the oldest state word */
+    unsigned int mixed = x;
+    mixed ^= mixed << 16;
+    mixed ^= mixed >> 5;
+    mixed ^= mixed << 1;
+
+    /* rotate the state: (x, y, z) <- (y, z, mixed ^ y ^ z) */
+    x = y;
+    y = z;
+    z = mixed ^ x ^ y;
+
+    return z;
+}
+
+/* Return random_gen_fast() divided by UINT_MAX, as a float. */
+float random_float_fast()
+{
+    const unsigned int raw = random_gen_fast();
+    return ((float)raw / (float)UINT_MAX);
+}
+
+/*
+ * Return a random integer in the inclusive range between `min` and
+ * `max` (the bounds are swapped if given in reverse order), drawn from
+ * random_gen_fast().
+ */
+int rand_int_fast(int min, int max)
+{
+    const int lo = (max < min) ? max : min;
+    const int hi = (max < min) ? min : max;
+    return (random_gen_fast() % (hi - lo + 1)) + lo;
+}
+
+/*
+ * Return a random unsigned int.
+ *
+ * WIN32 builds fill the value with rand_s().  Other builds use rand();
+ * when RAND_MAX is below 65536 a second rand() call supplies the upper
+ * part so the result spans a wider range.
+ */
+unsigned int random_gen()
+{
+    unsigned int rnd = 0;
+#ifdef WIN32
+    rand_s(&rnd);
+#else   // WIN32
+    rnd = rand();
+#if (RAND_MAX < 65536)
+        // widen the range: second draw becomes the high part
+        rnd = rand()*(RAND_MAX + 1) + rnd;
+#endif  //(RAND_MAX < 65536)
+#endif  // WIN32
+    return rnd;
+}
+
+/*
+ * Return a random float scaled from a random integer draw.
+ *
+ * WIN32 builds divide a rand_s() value by UINT_MAX.  Other builds divide
+ * rand() by RAND_MAX, or, when RAND_MAX is below 65536, combine two
+ * rand() calls and divide by RAND_MAX*RAND_MAX.
+ * NOTE(review): in that last branch the combined value can be as large
+ * as RAND_MAX*(RAND_MAX+1)+RAND_MAX, so the result may slightly exceed
+ * 1 -- confirm whether callers rely on an upper bound of 1.
+ */
+float random_float()
+{
+    unsigned int rnd = 0;
+#ifdef WIN32
+    rand_s(&rnd);
+    return ((float)rnd / (float)UINT_MAX);
+#else   // WIN32
+
+    rnd = rand();
+#if (RAND_MAX < 65536)
+    rnd = rand()*(RAND_MAX + 1) + rnd;
+    return((float)rnd / (float)(RAND_MAX*RAND_MAX));
+#endif  //(RAND_MAX < 65536)
+    return ((float)rnd / (float)RAND_MAX);
+
+#endif  // WIN32
+}
+
+/*
+ * Return a random float between `min` and `max` (swapped if given in
+ * reverse order), using random_float() as the source.
+ */
+float rand_uniform_strong(float min, float max)
+{
+    const float lo = (max < min) ? max : min;
+    const float hi = (max < min) ? min : max;
+    return (random_float() * (hi - lo)) + lo;
+}
+
+/*
+ * Map an already drawn `random_part` onto the range between `min` and
+ * `max` (swapped if given in reverse order):
+ * low + random_part * (high - low).
+ */
+float rand_precalc_random(float min, float max, float random_part)
+{
+    const float lo = (max < min) ? max : min;
+    const float hi = (max < min) ? min : max;
+    return (random_part * (hi - lo)) + lo;
+}
+
+// Scale factor that maps one rand() result into [0, 1).
+#define RS_SCALE (1.0 / (1.0 + RAND_MAX))
+
+/*
+ * Return a random double in [0, 1), built from three rand() calls that
+ * are nested at successively finer scales.
+ */
+double double_rand(void)
+{
+    double d;
+    do {
+        d = (((rand() * RS_SCALE) + rand()) * RS_SCALE + rand()) * RS_SCALE;
+    } while (d >= 1); // Round off: redraw if rounding pushed the value up to 1
+    return d;
+}
+
+/* Return double_rand() scaled by `less_than` and truncated to an unsigned int. */
+unsigned int uint_rand(unsigned int less_than)
+{
+    const double unit = double_rand();
+    return (unsigned int)((less_than)* unit);
+}
+
+/* Return 1 if any of arr[0..size-1] is NaN, otherwise 0. */
+int check_array_is_nan(float *arr, int size)
+{
+    int i = 0;
+    while (i < size) {
+        if (isnan(arr[i])) return 1;
+        ++i;
+    }
+    return 0;
+}
+
+/* Return 1 if any of arr[0..size-1] is infinite, otherwise 0. */
+int check_array_is_inf(float *arr, int size)
+{
+    int i = 0;
+    while (i < size) {
+        if (isinf(arr[i])) return 1;
+        ++i;
+    }
+    return 0;
+}
+
+/*
+ * Return a newly allocated array holding the integers min .. max-1 in
+ * random order (shuffled by swapping each position with a randomly
+ * chosen position at or after it, using rand()).
+ * The caller owns the returned array.
+ */
+int *random_index_order(int min, int max)
+{
+    const int span = max - min;
+    int *order = (int *)xcalloc(span, sizeof(int));
+    int k;
+
+    for (k = 0; k < span; ++k) {
+        order[k] = min + k;
+    }
+    for (k = 0; k < span - 1; ++k) {
+        const int tmp = order[k];
+        /* partner is picked from positions k .. span-1 */
+        const int pick = k + rand() % (span - k);
+        order[k] = order[pick];
+        order[pick] = tmp;
+    }
+    return order;
+}
+
+/*
+ * Return the index of the largest element of a[0..n-1] (the first one
+ * on ties), or -1 when n <= 0.
+ */
+int max_int_index(int *a, int n)
+{
+    if (n <= 0) return -1;
+
+    int best = 0;
+    int i;
+    for (i = 1; i < n; ++i) {
+        if (a[i] > a[best]) best = i;
+    }
+    return best;
+}
+
+
+// Convert a box given by centre (x, y) and size (w, h) in image-relative
+// coordinates into absolute left/right/top/bot edges for an image of
+// img_w x img_h.  With bounds_check set, the edges are clamped to
+// [0, img_w - 1] and [0, img_h - 1].
+boxabs box_to_boxabs(const box* b, const int img_w, const int img_h, const int bounds_check)
+{
+    boxabs ba;
+    ba.left = (b->x - b->w / 2.)*img_w;
+    ba.right = (b->x + b->w / 2.)*img_w;
+    ba.top = (b->y - b->h / 2.)*img_h;
+    ba.bot = (b->y + b->h / 2.)*img_h;
+
+    if (!bounds_check) return ba;
+
+    if (ba.left < 0) ba.left = 0;
+    if (ba.right > img_w - 1) ba.right = img_w - 1;
+    if (ba.top < 0) ba.top = 0;
+    if (ba.bot > img_h - 1) ba.bot = img_h - 1;
+
+    return ba;
+}
+
+/*
+ * Create directory `path` and return the result of the underlying call.
+ * WIN32 builds use _mkdir(), which takes no mode, so `mode` is ignored
+ * there; other builds pass it to mkdir().
+ */
+int make_directory(char *path, int mode)
+{
+    int status;
+#ifdef WIN32
+    status = _mkdir(path);
+#else
+    status = mkdir(path, mode);
+#endif
+    return status;
+}
+
+/*
+ * Hash a NUL-terminated string: start at 5381 and, for every character,
+ * compute hash = hash * 33 + c.
+ */
+unsigned long custom_hash(char *str)
+{
+    unsigned long hash = 5381;
+    const char *p;
+
+    for (p = str; *p != '\0'; ++p) {
+        const int c = *p;
+        hash = hash * 33 + c;
+    }
+
+    return hash;
+}
+
+/* Return true when `path` contains "://", i.e. looks like a URL rather than a file name. */
+bool is_live_stream(const char * path){
+    const char *hit = strstr(path, "://");
+    return hit != NULL;
+}
diff --git a/darknet-master/src/utils.h b/darknet-master/src/utils.h
new file mode 100644
index 0000000..6a4ea8a
--- /dev/null
+++ b/darknet-master/src/utils.h
@@ -0,0 +1,119 @@
+#ifndef UTILS_H
+#define UTILS_H
+#include "darknet.h"
+#include "list.h"
+
+#include <stdio.h>
+#include <time.h>
+#include <stdbool.h>
+
+#ifndef M_PI
+#define M_PI       3.14159265358979323846
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Expands to three arguments: current source file, function name and line number. */
+#define DARKNET_LOC __FILE__, __func__, __LINE__
+
+LIB_API void free_ptrs(void **ptrs, int n);
+LIB_API void top_k(float *a, int n, int k, int *index);
+
+/* The "location" is the file, function, and line as defined by the DARKNET_LOC macro.
+ * This is then printed when error() is called to terminate the instance of darknet.
+ */
+void *xmalloc_location(const size_t size, const char * const filename, const char * const funcname, const int line);
+void *xcalloc_location(const size_t nmemb, const size_t size, const char * const filename, const char * const funcname, const int line);
+void *xrealloc_location(void *ptr, const size_t size, const char * const filename, const char * const funcname, const int line);
+
+#define xmalloc(s)      xmalloc_location(s, DARKNET_LOC)
+#define xcalloc(m, s)   xcalloc_location(m, s, DARKNET_LOC)
+#define xrealloc(p, s)  xrealloc_location(p, s, DARKNET_LOC)
+
+void error(const char * const msg, const char * const filename, const char * const funcname, const int line);
+
+double what_time_is_it_now();
+int *read_map(char *filename);
+void shuffle(void *arr, size_t n, size_t size);
+void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections);
+char *basecfg(char *cfgfile);
+int alphanum_to_int(char c);
+char int_to_alphanum(int i);
+int read_int(int fd);
+void write_int(int fd, int n);
+void read_all(int fd, char *buffer, size_t bytes);
+void write_all(int fd, char *buffer, size_t bytes);
+int read_all_fail(int fd, char *buffer, size_t bytes);
+int write_all_fail(int fd, char *buffer, size_t bytes);
+LIB_API void find_replace(const char* str, char* orig, char* rep, char* output);
+void replace_image_to_label(const char* input_path, char* output_path);
+void malloc_error(const size_t size, const char * const filename, const char * const funcname, const int line);
+void calloc_error(const size_t size, const char * const filename, const char * const funcname, const int line);
+void realloc_error(const size_t size, const char * const filename, const char * const funcname, const int line);
+void file_error(const char * const s);
+void strip(char *s);
+void strip_args(char *s);
+void strip_char(char *s, char bad);
+list *split_str(char *s, char delim);
+char *fgetl(FILE *fp);
+list *parse_csv_line(char *line);
+char *copy_string(char *s);
+int count_fields(char *line);
+float *parse_fields(char *line, int n);
+void normalize_array(float *a, int n);
+void scale_array(float *a, int n, float s);
+void translate_array(float *a, int n, float s);
+int max_index(float *a, int n);
+int top_max_index(float *a, int n, int k);
+float constrain(float min, float max, float a);
+int constrain_int(int a, int min, int max);
+float mse_array(float *a, int n);
+float rand_normal();
+size_t rand_size_t();
+float rand_uniform(float min, float max);
+float rand_scale(float s);
+int rand_int(int min, int max);
+float sum_array(float *a, int n);
+float mean_array(float *a, int n);
+void mean_arrays(float **a, int n, int els, float *avg);
+float variance_array(float *a, int n);
+float mag_array(float *a, int n);
+float mag_array_skip(float *a, int n, int * indices_to_skip);
+float dist_array(float *a, float *b, int n, int sub);
+float **one_hot_encode(float *a, int n, int k);
+float sec(clock_t clocks);
+int find_int_arg(int argc, char **argv, char *arg, int def);
+float find_float_arg(int argc, char **argv, char *arg, float def);
+int find_arg(int argc, char* argv[], char *arg);
+char *find_char_arg(int argc, char **argv, char *arg, char *def);
+int sample_array(float *a, int n);
+int sample_array_custom(float *a, int n);
+void print_statistics(float *a, int n);
+unsigned int random_gen_fast(void);
+float random_float_fast();
+int rand_int_fast(int min, int max);
+unsigned int random_gen();
+float random_float();
+float rand_uniform_strong(float min, float max);
+float rand_precalc_random(float min, float max, float random_part);
+double double_rand(void);
+unsigned int uint_rand(unsigned int less_than);
+int check_array_is_nan(float *arr, int size);
+int check_array_is_inf(float *arr, int size);
+int int_index(int *a, int val, int n);
+int *random_index_order(int min, int max);
+int max_int_index(int *a, int n);
+boxabs box_to_boxabs(const box* b, const int img_w, const int img_h, const int bounds_check);
+int make_directory(char *path, int mode);
+unsigned long custom_hash(char *str);
+bool is_live_stream(const char * path);
+
+/* Larger / smaller of two values.  Each argument appears twice in the
+ * expansion, so do not pass expressions with side effects. */
+#define max_val_cmp(a,b) (((a) > (b)) ? (a) : (b))
+#define min_val_cmp(a,b) (((a) < (b)) ? (a) : (b))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/darknet-master/src/version.h.in b/darknet-master/src/version.h.in
new file mode 100644
index 0000000..e908119
--- /dev/null
+++ b/darknet-master/src/version.h.in
@@ -0,0 +1,3 @@
+/* Version components.  The @...@ placeholders are presumably replaced by
+ * the build system when this template is turned into version.h -- confirm. */
+#define MAJOR_VERSION @Darknet_MAJOR_VERSION@
+#define MINOR_VERSION @Darknet_MINOR_VERSION@
+#define PATCH_VERSION @Darknet_PATCH_VERSION@
diff --git a/darknet-master/src/voxel.c b/darknet-master/src/voxel.c
new file mode 100644
index 0000000..9f50112
--- /dev/null
+++ b/darknet-master/src/voxel.c
@@ -0,0 +1,164 @@
+#include "network.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+
+/*
+ * Read frames from two video streams in lock-step and save a cropped
+ * 1920x1080 left/right image pair per frame as "<prefix>_<count>_l" and
+ * "<prefix>_<count>_r".
+ *
+ * Every 100 frames the vertical offset between the two views is
+ * re-estimated with best_3d_shift_r() and printed.  Stops when either
+ * stream yields an empty image.  Without OPENCV only a notice is printed.
+ */
+void extract_voxel(char *lfile, char *rfile, char *prefix)
+{
+#ifdef OPENCV
+    int w = 1920;
+    int h = 1080;
+    int shift = 0;
+    int count = 0;
+    cap_cv *lcap = get_capture_video_stream(lfile);
+    cap_cv *rcap = get_capture_video_stream(rfile);
+    while(1){
+        image l = get_image_from_stream_cpp(lcap);
+        image r = get_image_from_stream_cpp(rcap);
+        if(!l.w || !r.w) break;
+        if(count%100 == 0) {
+            shift = best_3d_shift_r(l, r, -l.h/100, l.h/100);
+            printf("%d\n", shift);
+        }
+        /* centre crop; the right view gets a fixed 105 px horizontal offset */
+        image ls = crop_image(l, (l.w - w)/2, (l.h - h)/2, w, h);
+        image rs = crop_image(r, 105 + (r.w - w)/2, (r.h - h)/2 + shift, w, h);
+        char buff[256];
+        /* bounded: prefix comes from the command line and may be long */
+        snprintf(buff, sizeof(buff), "%s_%05d_l", prefix, count);
+        save_image(ls, buff);
+        snprintf(buff, sizeof(buff), "%s_%05d_r", prefix, count);
+        save_image(rs, buff);
+        free_image(l);
+        free_image(r);
+        free_image(ls);
+        free_image(rs);
+        ++count;
+    }
+
+#else
+    printf("need OpenCV for extraction\n");
+#endif
+}
+
+/*
+ * Train the network described by `cfgfile` (optionally starting from
+ * `weightfile`) on the image list data/imagenet/imagenet1k.train.list,
+ * loading SUPER_DATA batches on a background thread.
+ *
+ * Writes "<base>.backup" every 100 iterations, "<base>_<i>.weights"
+ * every 1000 iterations and "<base>_final.weights" at the end, all under
+ * the backup directory.
+ */
+void train_voxel(char *cfgfile, char *weightfile)
+{
+    char* train_images = "data/imagenet/imagenet1k.train.list";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.scale = 4;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+    args.type = SUPER_DATA;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    //while(i*imgs < N*120){
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        /* take the batch loaded in the background, then start the next load */
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        /* exponential moving average of the loss, seeded by the first value */
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            /* bounded: base is derived from the cfg path and may be long */
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+}
+
+/*
+ * Run the network on one image and save the network output image as "out".
+ *
+ * With `filename` set, that image is processed once.  Otherwise image
+ * paths are read from stdin in a loop until input ends.
+ */
+void test_voxel(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            /* strncpy() does not terminate when the source fills the buffer */
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;
+            /* cut the trailing newline left by fgets() */
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, 0, 0);
+        resize_network(&net, im.w, im.h);
+        printf("%d %d\n", im.w, im.h);
+
+        float *X = im.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image(net);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        save_image(out, "out");
+
+        free_image(im);
+        if (filename) break;
+    }
+}
+
+
+/*
+ * Command-line dispatcher for the voxel sub-command.
+ *
+ *   argv[2] == "train"   : train_voxel(cfg, weights)
+ *   argv[2] == "test"    : test_voxel(cfg, weights, filename)
+ *   argv[2] == "extract" : extract_voxel(left video, right video, prefix)
+ *
+ * Prints a usage line and returns when too few arguments are given.
+ */
+void run_voxel(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *filename = (argc > 5) ? argv[5] : 0;
+    if(0==strcmp(argv[2], "train")) train_voxel(cfg, weights);
+    else if(0==strcmp(argv[2], "test")) test_voxel(cfg, weights, filename);
+    else if(0==strcmp(argv[2], "extract")){
+        /* extract needs all three of argv[3..5]; do not read past argc */
+        if(argc < 6){
+            fprintf(stderr, "usage: %s %s extract [left video] [right video] [prefix]\n", argv[0], argv[1]);
+            return;
+        }
+        extract_voxel(argv[3], argv[4], argv[5]);
+    }
+    /*
+       else if(0==strcmp(argv[2], "valid")) validate_voxel(cfg, weights);
+     */
+}
diff --git a/darknet-master/src/writing.c b/darknet-master/src/writing.c
new file mode 100644
index 0000000..1fed538
--- /dev/null
+++ b/darknet-master/src/writing.c
@@ -0,0 +1,144 @@
+#include "network.h"
+#include "utils.h"
+#include "parser.h"
+
+/*
+ * Train the network described by `cfgfile` (optionally starting from
+ * `weightfile`) on the images listed in "figures.list", loading
+ * WRITING_DATA batches on a background thread.
+ *
+ * Runs until max_batches is reached (forever when max_batches is 0).
+ * Saves "<base>_batch_<n>.weights" every 100 batches and
+ * "<base>_<epoch>.weights" whenever a new epoch starts, under the
+ * backup directory.
+ */
+void train_writing(char *cfgfile, char *weightfile)
+{
+    char* backup_directory = "backup/";
+    srand(time(0));
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    list *plist = get_paths("figures.list");
+    char **paths = (char **)list_to_array(plist);
+    clock_t time;
+    int N = plist->size;
+    printf("N: %d\n", N);
+    image out = get_network_image(net);
+
+    data train, buffer;
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.out_w = out.w;
+    args.out_h = out.h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = N;
+    args.d = &buffer;
+    args.type = WRITING_DATA;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    int epoch = (*net.seen)/N;
+    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+        time=clock();
+        /* take the batch loaded in the background, then start the next load */
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+        printf("Loaded %lf seconds\n",sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+
+        /*
+           image pred = float_to_image(64, 64, 1, out);
+           print_image(pred);
+         */
+
+        /*
+           image im = float_to_image(256, 256, 3, train.X.vals[0]);
+           image lab = float_to_image(64, 64, 1, train.y.vals[0]);
+           image pred = float_to_image(64, 64, 1, out);
+           show_image(im, "image");
+           show_image(lab, "label");
+           print_image(lab);
+           show_image(pred, "pred");
+           cvWaitKey(0);
+         */
+
+        /* exponential moving average of the loss, seeded by the first value */
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %" PRIu64 " images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        free_data(train);
+        if(get_current_batch(net)%100 == 0){
+            char buff[256];
+            /* bounded: base is derived from the cfg path and may be long */
+            snprintf(buff, sizeof(buff), "%s/%s_batch_%d.weights", backup_directory, base, get_current_batch(net));
+            save_weights(net, buff);
+        }
+        if(*net.seen/N > epoch){
+            epoch = *net.seen/N;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+    }
+}
+
+/*
+ * Run the network on one image and display the thresholded prediction
+ * next to the original.
+ *
+ * The network output is resized to the input size and thresholded at
+ * 0.5 before being shown.  With `filename` set, that image is processed
+ * once; otherwise image paths are read from stdin until input ends.
+ */
+void test_writing(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            /* strncpy() does not terminate when the source fills the buffer */
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;
+            /* cut the trailing newline left by fgets() */
+            strtok(input, "\n");
+        }
+
+        image im = load_image_color(input, 0, 0);
+        resize_network(&net, im.w, im.h);
+        printf("%d %d %d\n", im.h, im.w, im.c);
+        float *X = im.data;
+        time=clock();
+        network_predict(net, X);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        image pred = get_network_image(net);
+
+        image upsampled = resize_image(pred, im.w, im.h);
+        image thresh = threshold_image(upsampled, .5);
+        pred = thresh;
+
+        show_image(pred, "prediction");
+        show_image(im, "orig");
+
+        wait_until_press_key_cv();
+        destroy_all_windows_cv();
+
+        free_image(upsampled);
+        free_image(thresh);
+        free_image(im);
+        if (filename) break;
+    }
+}
+
+/*
+ * Command-line dispatcher for the writing sub-command:
+ * argv[2] selects "train" or "test", argv[3] is the cfg file, argv[4]
+ * the optional weights file and argv[5] the optional image for "test".
+ * Prints a usage line and returns when fewer than 4 arguments are given.
+ */
+void run_writing(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    const char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    char *filename = 0;
+    if(argc > 4) weights = argv[4];
+    if(argc > 5) filename = argv[5];
+
+    if(strcmp(mode, "train") == 0){
+        train_writing(cfg, weights);
+    }else if(strcmp(mode, "test") == 0){
+        test_writing(cfg, weights, filename);
+    }
+}
diff --git a/darknet-master/src/yolo.c b/darknet-master/src/yolo.c
new file mode 100644
index 0000000..ef68aca
--- /dev/null
+++ b/darknet-master/src/yolo.c
@@ -0,0 +1,371 @@
+#include "network.h"
+#include "detection_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+#include "demo.h"
+
+/*
+ * Names of the 20 object classes, indexed by class id. Used to build the
+ * per-class result file names and as labels for drawn detections.
+ */
+char *voc_names[] = {
+    "aeroplane", "bicycle", "bird", "boat", "bottle",
+    "bus", "car", "cat", "chair", "cow",
+    "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
+};
+
+/*
+ * Train a network on the images listed in data/voc/train.txt.
+ *
+ * cfgfile    - network configuration, handed to parse_network_cfg().
+ * weightfile - optional weights to start from; skipped when NULL.
+ *
+ * Batches are loaded by a background thread while the previous batch is
+ * being trained on. Weight snapshots are written under backup/ every 100
+ * iterations during the first 1000 iterations and every 1000 iterations
+ * afterwards, plus a "<base>_final.weights" file at the end.
+ */
+void train_yolo(char *cfgfile, char *weightfile)
+{
+    char* train_images = "data/voc/train.txt";
+    char* backup_directory = "backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+
+    /* The data-augmentation parameters are taken from the last layer. */
+    layer l = net.layers[net.n - 1];
+
+    int side = l.side;
+    int classes = l.classes;
+    float jitter = l.jitter;
+
+    list *plist = get_paths(train_images);
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.classes = classes;
+    args.jitter = jitter;
+    args.num_boxes = side;
+    args.d = &buffer;
+    args.type = REGION_DATA;
+
+    args.angle = net.angle;
+    args.exposure = net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+
+    /* Start loading the first batch; the loader fills `buffer`. */
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        /* Take the loaded batch and immediately start loading the next. */
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0 || (i < 1000 && i%100 == 0)){
+            char buff[256];
+            /* snprintf: a long cfg base name must not overflow buff. */
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    /* Exactly one prefetch is still outstanding here. It writes into the
+     * stack variable `buffer`, so it must finish before this function
+     * returns; the batch it loaded is then released. */
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free(base);
+}
+
+/*
+ * Write detections to the per-class result files.
+ *
+ * fps     - one open FILE* per class.
+ * id      - image identifier printed at the start of every line.
+ * boxes   - `total` boxes given as centre (x, y) plus width/height.
+ * probs   - probs[i][j] is the score of box i for class j; entries equal
+ *           to zero are not written.
+ * w, h    - image size; box corners are clamped to [0, w] x [0, h].
+ *
+ * Each line has the form "<id> <prob> <xmin> <ymin> <xmax> <ymax>".
+ */
+void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h)
+{
+    int b, c;
+    for(b = 0; b < total; ++b){
+        /* Convert centre/size to corner coordinates. */
+        float left   = boxes[b].x - boxes[b].w/2.;
+        float right  = boxes[b].x + boxes[b].w/2.;
+        float top    = boxes[b].y - boxes[b].h/2.;
+        float bottom = boxes[b].y + boxes[b].h/2.;
+
+        /* Clamp to the image. */
+        left   = (left < 0)   ? 0 : left;
+        top    = (top < 0)    ? 0 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+
+        for(c = 0; c < classes; ++c){
+            if (!probs[b][c]) continue;
+            fprintf(fps[c], "%s %f %f %f %f %f\n", id, probs[b][c],
+                    left, top, right, bottom);
+        }
+    }
+}
+
+/*
+ * Run the network over every image listed in data/voc/2007_test.txt and
+ * write the detections to one file per class,
+ * results/comp4_det_test_<class name>.txt.
+ *
+ * cfgfile    - network configuration, handed to parse_network_cfg().
+ * weightfile - optional weights file; skipped when NULL.
+ *
+ * Images are loaded by up to `nthreads` background threads, one batch
+ * ahead of the batch currently being processed. If a result file cannot
+ * be opened, an error is printed and the function returns without
+ * running detection.
+ */
+void validate_yolo(char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    srand(time(0));
+
+    char *base = "results/comp4_det_test_";
+    list* plist = get_paths("data/voc/2007_test.txt");
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net.layers[net.n-1];
+    int classes = l.classes;
+    int total = l.side * l.side * l.n;   /* number of candidate boxes */
+
+    int j;
+    FILE** fps = (FILE**)xcalloc(classes, sizeof(FILE*));
+    for(j = 0; j < classes; ++j){
+        char buff[1024];
+        snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
+        fps[j] = fopen(buff, "w");
+        if(!fps[j]){
+            /* Writing to or closing a NULL stream would crash later on. */
+            fprintf(stderr, "Couldn't open file: %s\n", buff);
+            while(j-- > 0) fclose(fps[j]);
+            free(fps);
+            return;
+        }
+    }
+    box* boxes = (box*)xcalloc(total, sizeof(box));
+    float** probs = (float**)xcalloc(total, sizeof(float*));
+    for(j = 0; j < total; ++j) probs[j] = (float*)xcalloc(classes, sizeof(float));
+
+    int m = plist->size;
+    int i=0;
+    int t;
+
+    float thresh = .001;
+    int nms = 1;
+    float iou_thresh = .5;
+
+    int nthreads = 8;
+    image* val = (image*)xcalloc(nthreads, sizeof(image));
+    image* val_resized = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf = (image*)xcalloc(nthreads, sizeof(image));
+    image* buf_resized = (image*)xcalloc(nthreads, sizeof(image));
+    pthread_t* thr = (pthread_t*)xcalloc(nthreads, sizeof(pthread_t));
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.type = IMAGE_DATA;
+
+    /* Prefetch the first batch. Bounded by m so that a list with fewer
+     * than nthreads images does not read past the end of paths[]. */
+    for(t = 0; t < nthreads && t < m; ++t){
+        args.path = paths[i+t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    time_t start = time(0);
+    for(i = nthreads; i < m+nthreads; i += nthreads){
+        fprintf(stderr, "%d\n", i);
+        /* Collect the batch that was prefetched in the previous round. */
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        /* Start loading the next batch while this one is processed. */
+        for(t = 0; t < nthreads && i+t < m; ++t){
+            args.path = paths[i+t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            char *path = paths[i+t-nthreads];
+            char *id = basecfg(path);
+            float *X = val_resized[t].data;
+            network_predict(net, X);
+            int w = val[t].w;
+            int h = val[t].h;
+            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
+            if (nms) do_nms_sort_v2(boxes, probs, total, classes, iou_thresh);
+            print_yolo_detections(fps, id, boxes, probs, total, classes, w, h);
+            free(id);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+
+    if (val) free(val);
+    if (val_resized) free(val_resized);
+    if (buf) free(buf);
+    if (buf_resized) free(buf_resized);
+    if (thr) free(thr);
+
+    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
+    if (fps) {
+        for(j = 0; j < classes; ++j){
+            fclose(fps[j]);
+        }
+        free(fps);
+    }
+
+    /* Release the per-box score rows and the box array. */
+    for(j = 0; j < total; ++j) free(probs[j]);
+    free(probs);
+    free(boxes);
+}
+
+/*
+ * Measure recall over the images listed in data/voc.2007.test.
+ *
+ * cfgfile    - network configuration, handed to parse_network_cfg().
+ * weightfile - optional weights file; skipped when NULL.
+ *
+ * For every ground-truth box the best IOU among the proposals scoring
+ * above `thresh` is taken; a truth box counts as recalled when that IOU
+ * exceeds `iou_thresh`. After each image a running summary (image index,
+ * recalled boxes, total boxes, proposals per image, mean best IOU and
+ * recall) is printed to stderr.
+ */
+void validate_yolo_recall(char *cfgfile, char *weightfile)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    srand(time(0));
+
+    list *plist = get_paths("data/voc.2007.test");
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net.layers[net.n-1];
+    int classes = l.classes;
+    int side = l.side;
+    int num_boxes = side*side*l.n;
+
+    int j, k;
+    box* boxes = (box*)xcalloc(num_boxes, sizeof(box));
+    float** probs = (float**)xcalloc(num_boxes, sizeof(float*));
+    for(j = 0; j < num_boxes; ++j) {
+        probs[j] = (float*)xcalloc(classes, sizeof(float));
+    }
+
+    int m = plist->size;
+    int i=0;
+
+    float thresh = .001;
+    float iou_thresh = .5;
+    float nms = 0;
+
+    int total = 0;
+    int correct = 0;
+    int proposals = 0;
+    float avg_iou = 0;
+
+    for(i = 0; i < m; ++i){
+        char *path = paths[i];
+        image orig = load_image_color(path, 0, 0);
+        image sized = resize_image(orig, net.w, net.h);
+        network_predict(net, sized.data);
+        get_detection_boxes(l, orig.w, orig.h, thresh, probs, boxes, 1);
+        if (nms) do_nms(boxes, probs, num_boxes, 1, nms);
+
+        char labelpath[4096];
+        replace_image_to_label(path, labelpath);
+
+        int num_labels = 0;
+        box_label *truth = read_boxes(labelpath, &num_labels);
+        for(k = 0; k < num_boxes; ++k){
+            if(probs[k][0] > thresh){
+                ++proposals;
+            }
+        }
+        for (j = 0; j < num_labels; ++j) {
+            ++total;
+            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
+            float best_iou = 0;
+            for(k = 0; k < num_boxes; ++k){
+                float iou = box_iou(boxes[k], t);
+                if(probs[k][0] > thresh && iou > best_iou){
+                    best_iou = iou;
+                }
+            }
+            avg_iou += best_iou;
+            if(best_iou > iou_thresh){
+                ++correct;
+            }
+        }
+
+        /* Until the first label has been seen total is 0; report 0
+         * instead of dividing by zero. */
+        float mean_iou = total ? avg_iou*100/total : 0;
+        double recall = total ? 100.*correct/total : 0;
+        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), mean_iou, recall);
+        free(truth);
+        free_image(orig);
+        free_image(sized);
+    }
+
+    /* Release the per-box score rows and the box array. */
+    for(j = 0; j < num_boxes; ++j) {
+        free(probs[j]);
+    }
+    free(probs);
+    free(boxes);
+}
+
+/*
+ * Run detection on one or more images, draw the detections and save/show
+ * the result as "predictions".
+ *
+ * cfgfile    - network configuration, handed to parse_network_cfg().
+ * weightfile - optional weights file; skipped when NULL.
+ * filename   - image to process. When NULL, image paths are read from
+ *              stdin in a loop until fgets() reports end of input.
+ * thresh     - detection threshold passed to get_detection_boxes() and
+ *              draw_detections().
+ */
+void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
+{
+    image **alphabet = load_alphabet();
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    detection_layer l = net.layers[net.n-1];
+    set_batch_network(&net, 1);
+    srand(2222222);
+    char buff[256];
+    char *input = buff;
+    int j;
+    float nms=.4;
+    int num_boxes = l.side * l.side * l.n;
+    box* boxes = (box*)xcalloc(num_boxes, sizeof(box));
+    float** probs = (float**)xcalloc(num_boxes, sizeof(float*));
+    for(j = 0; j < num_boxes; ++j) {
+        probs[j] = (float*)xcalloc(l.classes, sizeof(float));
+    }
+    while(1){
+        if(filename){
+            /* strncpy() leaves the buffer unterminated when the source is
+             * as long as the buffer, so terminate it explicitly. */
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        } else {
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            /* Leave through the common cleanup below instead of returning. */
+            if(!input) break;
+            /* Cut at the newline. Unlike strtok(), this also handles a
+             * line that consists of "\n" only. */
+            input[strcspn(input, "\n")] = '\0';
+        }
+        image im = load_image_color(input,0,0);
+        image sized = resize_image(im, net.w, net.h);
+        float *X = sized.data;
+        clock_t time=clock();
+        network_predict(net, X);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
+        if (nms) do_nms_sort_v2(boxes, probs, num_boxes, l.classes, nms);
+        draw_detections(im, num_boxes, thresh, boxes, probs, voc_names, alphabet, 20);
+        save_image(im, "predictions");
+        show_image(im, "predictions");
+
+        free_image(im);
+        free_image(sized);
+        wait_until_press_key_cv();
+        destroy_all_windows_cv();
+
+        if (filename) break;
+    }
+    /* The alphabet is needed by every iteration, so it is released only
+     * once the loop is done. */
+    free_alphabet(alphabet);
+    free(boxes);
+    for(j = 0; j < num_boxes; ++j) {
+        free(probs[j]);
+    }
+    free(probs);
+}
+
+/*
+ * Command-line dispatcher for the "yolo" sub-command.
+ *
+ * Optional flags are looked up first; then argv[2] selects the mode
+ * ("test", "train", "valid", "recall" or "demo"), argv[3] is the cfg file,
+ * argv[4] (optional) the weights file and argv[5] (optional) an input file
+ * name. An unrecognised mode does nothing.
+ */
+void run_yolo(int argc, char **argv)
+{
+    int   dont_show        = find_arg(argc, argv, "-dont_show");
+    int   mjpeg_port       = find_int_arg(argc, argv, "-mjpeg_port", -1);
+    int   json_port        = find_int_arg(argc, argv, "-json_port", -1);
+    char *out_filename     = find_char_arg(argc, argv, "-out_filename", 0);
+    char *prefix           = find_char_arg(argc, argv, "-prefix", 0);
+    float thresh           = find_float_arg(argc, argv, "-thresh", .2);
+    float hier_thresh      = find_float_arg(argc, argv, "-hier", .5);
+    int   cam_index        = find_int_arg(argc, argv, "-c", 0);
+    int   frame_skip       = find_int_arg(argc, argv, "-s", 0);
+    int   ext_output       = find_arg(argc, argv, "-ext_output");
+    char *json_file_output = find_char_arg(argc, argv, "-json_file_output", 0);
+
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *mode = argv[2];
+    char *cfg = argv[3];
+    char *weights = 0;
+    char *filename = 0;
+    if(argc > 4) weights = argv[4];
+    if(argc > 5) filename = argv[5];
+
+    if(!strcmp(mode, "test")){
+        test_yolo(cfg, weights, filename, thresh);
+    }else if(!strcmp(mode, "train")){
+        train_yolo(cfg, weights);
+    }else if(!strcmp(mode, "valid")){
+        validate_yolo(cfg, weights);
+    }else if(!strcmp(mode, "recall")){
+        validate_yolo_recall(cfg, weights);
+    }else if(!strcmp(mode, "demo")){
+        demo(cfg, weights, thresh, hier_thresh, cam_index, filename, voc_names, 20, 1, frame_skip,
+            prefix, out_filename, mjpeg_port, 0, json_port, dont_show, ext_output, 0, 0, 0, 0, 0, json_file_output);
+    }
+}
diff --git a/darknet-master/src/yolo_console_dll.cpp b/darknet-master/src/yolo_console_dll.cpp
new file mode 100644
index 0000000..83df369
--- /dev/null
+++ b/darknet-master/src/yolo_console_dll.cpp
@@ -0,0 +1,702 @@
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <vector>
+#include <queue>
+#include <fstream>
+#include <thread>
+#include <future>
+#include <atomic>
+#include <mutex>         // std::mutex, std::unique_lock
+#include <cmath>
+
+
+// Optical-flow tracking only makes sense for a live video camera (not for a video file).
+// To enable it, uncomment the following line. Optical flow is supported only by OpenCV 3.x - 4.x.
+//#define TRACK_OPTFLOW
+//#define GPU
+
+// To use 3D-stereo camera ZED - uncomment the following line. ZED_SDK should be installed.
+//#define ZED_STEREO
+
+
+#include "yolo_v2_class.hpp"    // imported functions from DLL
+
+#ifdef OPENCV
+#ifdef ZED_STEREO
+#include <sl/Camera.hpp>
+#if ZED_SDK_MAJOR_VERSION == 2
+#define ZED_STEREO_2_COMPAT_MODE
+#endif
+
+#undef GPU // avoid conflict with sl::MEM::GPU
+
+#ifdef ZED_STEREO_2_COMPAT_MODE
+#pragma comment(lib, "sl_core64.lib")
+#pragma comment(lib, "sl_input64.lib")
+#endif
+#pragma comment(lib, "sl_zed64.lib")
+
+// Returns the median of v (the upper of the two middle values when the
+// size is even). The elements of v are partially reordered in place.
+// Returns NAN for an empty vector instead of indexing out of bounds.
+float getMedian(std::vector<float> &v) {
+    if (v.empty()) return NAN;
+    const size_t mid = v.size() / 2;
+    std::nth_element(v.begin(), v.begin() + mid, v.end());
+    return v[mid];
+}
+
+// Fills x_3d / y_3d / z_3d of every box with the median of the point-cloud
+// samples around the box centre and returns the updated boxes.
+//
+// For each box, square windows of radius R = 0 .. R_max-1 around the centre
+// are scanned, where R_max = min(10, min(w, h) / 2). Pixels close to the
+// centre lie in several windows and therefore contribute several samples.
+// Samples outside the cloud or with a non-finite z are ignored; when no
+// sample remains the three coordinates are set to NAN.
+std::vector<bbox_t> get_3d_coordinates(std::vector<bbox_t> bbox_vect, cv::Mat xyzrgba)
+{
+    const unsigned int R_max_global = 10;
+
+    std::vector<bbox_t> bbox3d_vect;
+
+    for (auto &cur_box : bbox_vect) {
+
+        const unsigned int obj_size = std::min(cur_box.w, cur_box.h);
+        const unsigned int R_max = std::min(R_max_global, obj_size / 2);
+        const int center_i = cur_box.x + cur_box.w * 0.5f;
+        const int center_j = cur_box.y + cur_box.h * 0.5f;
+
+        std::vector<float> xs, ys, zs;
+        for (int R = 0; R < R_max; R++) {
+            for (int dy = -R; dy <= R; dy++) {
+                const int row = center_j + dy;
+                for (int dx = -R; dx <= R; dx++) {
+                    const int col = center_i + dx;
+                    const bool inside = col >= 0 && col < xyzrgba.cols && row >= 0 && row < xyzrgba.rows;
+                    if (!inside) continue;
+
+                    const cv::Vec4f &pt = xyzrgba.at<cv::Vec4f>(row, col);  // x,y,z,w
+                    if (!std::isfinite(pt[2])) continue;
+
+                    xs.push_back(pt[0]);
+                    ys.push_back(pt[1]);
+                    zs.push_back(pt[2]);
+                }
+            }
+        }
+
+        if (!xs.empty() && !ys.empty() && !zs.empty()) {
+            cur_box.x_3d = getMedian(xs);
+            cur_box.y_3d = getMedian(ys);
+            cur_box.z_3d = getMedian(zs);
+        } else {
+            cur_box.x_3d = NAN;
+            cur_box.y_3d = NAN;
+            cur_box.z_3d = NAN;
+        }
+
+        bbox3d_vect.emplace_back(cur_box);
+    }
+
+    return bbox3d_vect;
+}
+
+// Builds a cv::Mat header over the CPU buffer of a ZED sl::Mat.
+// The element type is CV_32FC4 for 4-channel float data and CV_8UC4
+// otherwise (the sl::Mat used are either RGBA images or XYZ (4C) point
+// clouds). The cv::Mat is constructed from input's data pointer, so it
+// refers to input's buffer rather than to a copy.
+cv::Mat slMat2cvMat(sl::Mat &input) {
+    const bool is_float4 = (input.getDataType() ==
+#ifdef ZED_STEREO_2_COMPAT_MODE
+        sl::MAT_TYPE_32F_C4
+#else
+        sl::MAT_TYPE::F32_C4
+#endif
+        );
+    // Mapping between MAT_TYPE and CV_TYPE
+    const int cv_type = is_float4 ? CV_32FC4 : CV_8UC4;
+    return cv::Mat(input.getHeight(), input.getWidth(), cv_type, input.getPtr<sl::uchar1>(
+#ifdef ZED_STEREO_2_COMPAT_MODE
+        sl::MEM::MEM_CPU
+#else
+        sl::MEM::CPU
+#endif
+        ));
+}
+
+// Retrieves the current image from the ZED camera and returns it
+// converted from RGBA to RGB.
+cv::Mat zed_capture_rgb(sl::Camera &zed) {
+    sl::Mat left_rgba;
+    zed.retrieveImage(left_rgba);
+    cv::Mat rgba_view = slMat2cvMat(left_rgba);
+    cv::Mat rgb;
+    cv::cvtColor(rgba_view, rgb, CV_RGBA2RGB);
+    return rgb;
+}
+
+// Retrieves the current XYZ measure from the ZED camera and returns a
+// deep copy of it as a cv::Mat, so the result does not depend on the
+// local sl::Mat.
+cv::Mat zed_capture_3d(sl::Camera &zed) {
+    sl::Mat point_cloud;
+    zed.retrieveMeasure(point_cloud,
+#ifdef ZED_STEREO_2_COMPAT_MODE
+        sl::MEASURE_XYZ
+#else
+        sl::MEASURE::XYZ
+#endif
+        );
+    cv::Mat cloud_view = slMat2cvMat(point_cloud);
+    return cloud_view.clone();
+}
+
+static sl::Camera zed; // ZED-camera
+
+#else   // ZED_STEREO
+// Variant used when ZED_STEREO is not defined: the boxes are returned
+// unchanged and the point cloud is ignored.
+std::vector<bbox_t> get_3d_coordinates(std::vector<bbox_t> bbox_vect, cv::Mat xyzrgba)
+{
+    (void)xyzrgba;
+    return bbox_vect;
+}
+#endif  // ZED_STEREO
+
+
+#include <opencv2/opencv.hpp>            // C++
+#include <opencv2/core/version.hpp>
+#ifndef CV_VERSION_EPOCH     // OpenCV 3.x and 4.x
+#include <opencv2/videoio/videoio.hpp>
+#define OPENCV_VERSION CVAUX_STR(CV_VERSION_MAJOR)"" CVAUX_STR(CV_VERSION_MINOR)"" CVAUX_STR(CV_VERSION_REVISION)
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "opencv_world" OPENCV_VERSION ".lib")
+#ifdef TRACK_OPTFLOW
+/*
+#pragma comment(lib, "opencv_cudaoptflow" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_cudaimgproc" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
+*/
+#endif    // TRACK_OPTFLOW
+#endif    // USE_CMAKE_LIBS
+#else     // OpenCV 2.x
+#define OPENCV_VERSION CVAUX_STR(CV_VERSION_EPOCH)"" CVAUX_STR(CV_VERSION_MAJOR)"" CVAUX_STR(CV_VERSION_MINOR)
+#ifndef USE_CMAKE_LIBS
+#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
+#pragma comment(lib, "opencv_video" OPENCV_VERSION ".lib")
+#endif    // USE_CMAKE_LIBS
+#endif    // CV_VERSION_EPOCH
+
+
+// Draws the detections onto mat_img.
+//
+// Every box is outlined in the colour returned by obj_id_to_color(). If
+// obj_names has an entry for the box's obj_id, a filled label bar is drawn
+// above the box showing the name, " - <track_id>" when track_id > 0, and
+// the 3D coordinates when z_3d is not NaN. When both FPS values are
+// non-negative they are printed in the top-left corner.
+void draw_boxes(cv::Mat mat_img, std::vector<bbox_t> result_vec, std::vector<std::string> obj_names,
+    int current_det_fps = -1, int current_cap_fps = -1)
+{
+    for (auto &i : result_vec) {
+        cv::Scalar color = obj_id_to_color(i.obj_id);
+        cv::rectangle(mat_img, cv::Rect(i.x, i.y, i.w, i.h), color, 2);
+        if (obj_names.size() > i.obj_id) {
+            std::string obj_name = obj_names[i.obj_id];
+            if (i.track_id > 0) obj_name += " - " + std::to_string(i.track_id);
+            cv::Size const text_size = getTextSize(obj_name, cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, 2, 0);
+            // The label bar is at least as wide as the box (plus a 2 px margin).
+            int max_width = std::max(text_size.width, (int)i.w + 2);
+            std::string coords_3d;
+            if (!std::isnan(i.z_3d)) {
+                std::stringstream ss;
+                ss << std::fixed << std::setprecision(2) << "x:" << i.x_3d << "m y:" << i.y_3d << "m z:" << i.z_3d << "m ";
+                coords_3d = ss.str();
+                cv::Size const text_size_3d = getTextSize(coords_3d, cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, 1, 0);
+                // Widen the bar if the coordinate line is longer than the name.
+                max_width = std::max(max_width, text_size_3d.width);
+            }
+
+            // Label background: a 35 px high bar above the box, clipped to the image.
+            cv::rectangle(mat_img, cv::Point2f(std::max((int)i.x - 1, 0), std::max((int)i.y - 35, 0)),
+                cv::Point2f(std::min((int)i.x + max_width, mat_img.cols - 1), std::min((int)i.y, mat_img.rows - 1)),
+                color, CV_FILLED, 8, 0);
+            putText(mat_img, obj_name, cv::Point2f(i.x, i.y - 16), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(0, 0, 0), 2);
+            if(!coords_3d.empty()) putText(mat_img, coords_3d, cv::Point2f(i.x, i.y-1), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
+        }
+    }
+    if (current_det_fps >= 0 && current_cap_fps >= 0) {
+        std::string fps_str = "FPS detection: " + std::to_string(current_det_fps) + "   FPS capture: " + std::to_string(current_cap_fps);
+        putText(mat_img, fps_str, cv::Point2f(10, 20), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(50, 255, 0), 2);
+    }
+}
+#endif    // OPENCV
+
+
+// Prints the detections to std::cout, one line per box: the class name
+// (when obj_names has an entry for obj_id), then obj_id, x, y, w, h and
+// prob. A " Frame: <id>" line is printed first when frame_id >= 0.
+// Note: the precision of std::cout is set to 3 and is not restored.
+void show_console_result(std::vector<bbox_t> const result_vec, std::vector<std::string> const obj_names, int frame_id = -1) {
+    if (frame_id >= 0) {
+        std::cout << " Frame: " << frame_id << std::endl;
+    }
+    for (auto &det : result_vec) {
+        const bool has_name = obj_names.size() > det.obj_id;
+        if (has_name) {
+            std::cout << obj_names[det.obj_id] << " - ";
+        }
+        std::cout << "obj_id = " << det.obj_id << ",  x = " << det.x << ", y = " << det.y;
+        std::cout << ", w = " << det.w << ", h = " << det.h;
+        std::cout << std::setprecision(3) << ", prob = " << det.prob << std::endl;
+    }
+}
+
+// Reads the object names from a text file, one name per line.
+// Returns an empty vector when the file cannot be opened. A trailing '\r'
+// is removed from every line so that files with CRLF line endings give
+// the same names as files with LF endings.
+std::vector<std::string> objects_names_from_file(std::string const filename) {
+    std::ifstream file(filename);
+    std::vector<std::string> file_lines;
+    if (!file.is_open()) return file_lines;
+    for (std::string line; getline(file, line);) {
+        if (!line.empty() && line.back() == '\r') line.pop_back();
+        file_lines.push_back(line);
+    }
+    std::cout << "object names loaded \n";
+    return file_lines;
+}
+
+// Single-slot mailbox that hands copies of T from a sender to a receiver
+// through one atomic pointer.
+//
+// send()    stores a heap copy of the object. With sync == true it first
+//           waits (polling every 3 ms) until the slot is empty; with
+//           sync == false an object still waiting in the slot is replaced
+//           and destroyed.
+// receive() waits (polling every 3 ms) until an object is present, takes
+//           it out of the slot and returns it by value.
+//
+// An object still in the slot when the mailbox is destroyed is deleted.
+template<typename T>
+class send_one_replaceable_object_t {
+    const bool sync;
+    std::atomic<T *> a_ptr;
+public:
+
+    void send(T const& _obj) {
+        // Owned by a unique_ptr until it is handed to the slot, so the
+        // copy is not leaked if anything throws before the exchange.
+        std::unique_ptr<T> new_ptr(new T(_obj));
+        if (sync) {
+            while (a_ptr.load()) std::this_thread::sleep_for(std::chrono::milliseconds(3));
+        }
+        // Whatever was in the slot is destroyed when old_ptr goes out of scope.
+        std::unique_ptr<T> old_ptr(a_ptr.exchange(new_ptr.release()));
+    }
+
+    T receive() {
+        std::unique_ptr<T> ptr;
+        do {
+            while(!a_ptr.load()) std::this_thread::sleep_for(std::chrono::milliseconds(3));
+            // Another receiver may have emptied the slot in between; retry then.
+            ptr.reset(a_ptr.exchange(nullptr));
+        } while (!ptr);
+        T obj = *ptr;
+        return obj;
+    }
+
+    bool is_object_present() {
+        return (a_ptr.load() != nullptr);
+    }
+
+    send_one_replaceable_object_t(bool _sync) : sync(_sync), a_ptr(nullptr)
+    {}
+
+    ~send_one_replaceable_object_t() {
+        delete a_ptr.exchange(nullptr);
+    }
+};
+
+int main(int argc, char *argv[])
+{
+    std::string  names_file = "data/coco.names";
+    std::string  cfg_file = "cfg/yolov3.cfg";
+    std::string  weights_file = "yolov3.weights";
+    std::string filename;
+
+    if (argc > 4) {    //voc.names yolo-voc.cfg yolo-voc.weights test.mp4
+        names_file = argv[1];
+        cfg_file = argv[2];
+        weights_file = argv[3];
+        filename = argv[4];
+    }
+    else if (argc > 1) filename = argv[1];
+
+    float const thresh = (argc > 5) ? std::stof(argv[5]) : 0.2;
+
+    Detector detector(cfg_file, weights_file);
+
+    auto obj_names = objects_names_from_file(names_file);
+    std::string out_videofile = "result.avi";
+    bool const save_output_videofile = false;   // true - for history
+    bool const send_network = false;        // true - for remote detection
+    bool const use_kalman_filter = false;   // true - for stationary camera
+
+    bool detection_sync = true;             // true - for video-file
+#ifdef TRACK_OPTFLOW    // for slow GPU
+    detection_sync = false;
+    Tracker_optflow tracker_flow;
+    //detector.wait_stream = true;
+#endif  // TRACK_OPTFLOW
+
+
+    while (true)
+    {
+        std::cout << "input image or video filename: ";
+        if(filename.size() == 0) std::cin >> filename;
+        if (filename.size() == 0) break;
+
+        try {
+#ifdef OPENCV
+            preview_boxes_t large_preview(100, 150, false), small_preview(50, 50, true);
+            bool show_small_boxes = false;
+
+            std::string const file_ext = filename.substr(filename.find_last_of(".") + 1);
+            std::string const protocol = filename.substr(0, 7);
+            if (file_ext == "avi" || file_ext == "mp4" || file_ext == "mjpg" || file_ext == "mov" ||     // video file
+                protocol == "rtmp://" || protocol == "rtsp://" || protocol == "http://" || protocol == "https:/" ||    // video network stream
+                filename == "zed_camera" || file_ext == "svo" || filename == "web_camera")   // ZED stereo camera
+
+            {
+                if (protocol == "rtsp://" || protocol == "http://" || protocol == "https:/" || filename == "zed_camera" || filename == "web_camera")
+                    detection_sync = false;
+
+                cv::Mat cur_frame;
+                std::atomic<int> fps_cap_counter(0), fps_det_counter(0);
+                std::atomic<int> current_fps_cap(0), current_fps_det(0);
+                std::atomic<bool> exit_flag(false);
+                std::chrono::steady_clock::time_point steady_start, steady_end;
+                int video_fps = 25;
+                bool use_zed_camera = false;
+
+                track_kalman_t track_kalman;
+
+#ifdef ZED_STEREO
+                sl::InitParameters init_params;
+                init_params.depth_minimum_distance = 0.5;
+    #ifdef ZED_STEREO_2_COMPAT_MODE
+                init_params.depth_mode = sl::DEPTH_MODE_ULTRA;
+                init_params.camera_resolution = sl::RESOLUTION_HD720;// sl::RESOLUTION_HD1080, sl::RESOLUTION_HD720
+                init_params.coordinate_units = sl::UNIT_METER;
+                init_params.camera_buffer_count_linux = 2;
+                if (file_ext == "svo") init_params.svo_input_filename.set(filename.c_str());
+    #else
+                init_params.depth_mode = sl::DEPTH_MODE::ULTRA;
+                init_params.camera_resolution = sl::RESOLUTION::HD720;// sl::RESOLUTION::HD1080, sl::RESOLUTION::HD720
+                init_params.coordinate_units = sl::UNIT::METER;
+                if (file_ext == "svo") init_params.input.setFromSVOFile(filename.c_str());
+    #endif
+                //init_params.sdk_cuda_ctx = (CUcontext)detector.get_cuda_context();
+                init_params.sdk_gpu_id = detector.cur_gpu_id;
+
+                if (filename == "zed_camera" || file_ext == "svo") {
+                    std::cout << "ZED 3D Camera " << zed.open(init_params) << std::endl;
+                    if (!zed.isOpened()) {
+                        error("Error: ZED Camera should be connected to USB 3.0. And ZED_SDK should be installed", DARKNET_LOC);
+                    }
+                    cur_frame = zed_capture_rgb(zed);
+                    use_zed_camera = true;
+                }
+#endif  // ZED_STEREO
+
+                cv::VideoCapture cap;
+                if (filename == "web_camera") {
+                    cap.open(0);
+                    cap >> cur_frame;
+                } else if (!use_zed_camera) {
+                    cap.open(filename);
+                    cap >> cur_frame;
+                }
+#ifdef CV_VERSION_EPOCH // OpenCV 2.x
+                video_fps = cap.get(CV_CAP_PROP_FPS);
+#else
+                video_fps = cap.get(cv::CAP_PROP_FPS);
+#endif
+                cv::Size const frame_size = cur_frame.size();
+                //cv::Size const frame_size(cap.get(CV_CAP_PROP_FRAME_WIDTH), cap.get(CV_CAP_PROP_FRAME_HEIGHT));
+                std::cout << "\n Video size: " << frame_size << std::endl;
+
+                cv::VideoWriter output_video;
+                if (save_output_videofile)
+#ifdef CV_VERSION_EPOCH // OpenCV 2.x
+                    output_video.open(out_videofile, CV_FOURCC('D', 'I', 'V', 'X'), std::max(35, video_fps), frame_size, true);
+#else
+                    output_video.open(out_videofile, cv::VideoWriter::fourcc('D', 'I', 'V', 'X'), std::max(35, video_fps), frame_size, true);
+#endif
+
+                struct detection_data_t {
+                    cv::Mat cap_frame;
+                    std::shared_ptr<image_t> det_image;
+                    std::vector<bbox_t> result_vec;
+                    cv::Mat draw_frame;
+                    bool new_detection;
+                    uint64_t frame_id;
+                    bool exit_flag;
+                    cv::Mat zed_cloud;
+                    std::queue<cv::Mat> track_optflow_queue;
+                    detection_data_t() : new_detection(false), exit_flag(false) {}
+                };
+
+                const bool sync = detection_sync; // sync data exchange
+                send_one_replaceable_object_t<detection_data_t> cap2prepare(sync), cap2draw(sync),
+                    prepare2detect(sync), detect2draw(sync), draw2show(sync), draw2write(sync), draw2net(sync);
+
+                std::thread t_cap, t_prepare, t_detect, t_post, t_draw, t_write, t_network;
+
+                // capture new video-frame
+                if (t_cap.joinable()) t_cap.join();
+                t_cap = std::thread([&]()
+                {
+                    uint64_t frame_id = 0;
+                    detection_data_t detection_data;
+                    do {
+                        detection_data = detection_data_t();
+#ifdef ZED_STEREO
+                        if (use_zed_camera) {
+                            while (zed.grab() !=
+        #ifdef ZED_STEREO_2_COMPAT_MODE
+                                sl::SUCCESS
+        #else
+                                sl::ERROR_CODE::SUCCESS
+        #endif
+                                ) std::this_thread::sleep_for(std::chrono::milliseconds(2));
+                            detection_data.cap_frame = zed_capture_rgb(zed);
+                            detection_data.zed_cloud = zed_capture_3d(zed);
+                        }
+                        else
+#endif   // ZED_STEREO
+                        {
+                            cap >> detection_data.cap_frame;
+                        }
+                        fps_cap_counter++;
+                        detection_data.frame_id = frame_id++;
+                        if (detection_data.cap_frame.empty() || exit_flag) {
+                            std::cout << " exit_flag: detection_data.cap_frame.size = " << detection_data.cap_frame.size() << std::endl;
+                            detection_data.exit_flag = true;
+                            detection_data.cap_frame = cv::Mat(frame_size, CV_8UC3);
+                        }
+
+                        if (!detection_sync) {
+                            cap2draw.send(detection_data);       // skip detection
+                        }
+                        cap2prepare.send(detection_data);
+                    } while (!detection_data.exit_flag);
+                    std::cout << " t_cap exit \n";
+                });
+
+
+                // pre-processing video frame (resize, convertion)
+                t_prepare = std::thread([&]()
+                {
+                    std::shared_ptr<image_t> det_image;
+                    detection_data_t detection_data;
+                    do {
+                        detection_data = cap2prepare.receive();
+
+                        det_image = detector.mat_to_image_resize(detection_data.cap_frame);
+                        detection_data.det_image = det_image;
+                        prepare2detect.send(detection_data);    // detection
+
+                    } while (!detection_data.exit_flag);
+                    std::cout << " t_prepare exit \n";
+                });
+
+
+                // detection by Yolo
+                if (t_detect.joinable()) t_detect.join();
+                t_detect = std::thread([&]()
+                {
+                    std::shared_ptr<image_t> det_image;
+                    detection_data_t detection_data;
+                    do {
+                        detection_data = prepare2detect.receive();
+                        det_image = detection_data.det_image;
+                        std::vector<bbox_t> result_vec;
+
+                        if(det_image)
+                            result_vec = detector.detect_resized(*det_image, frame_size.width, frame_size.height, thresh, true);  // true
+                        fps_det_counter++;
+                        //std::this_thread::sleep_for(std::chrono::milliseconds(150));
+
+                        detection_data.new_detection = true;
+                        detection_data.result_vec = result_vec;
+                        detect2draw.send(detection_data);
+                    } while (!detection_data.exit_flag);
+                    std::cout << " t_detect exit \n";
+                });
+
+                // draw rectangles (and track objects)
+                t_draw = std::thread([&]()
+                {
+                    std::queue<cv::Mat> track_optflow_queue;
+                    detection_data_t detection_data;
+                    do {
+
+                        // for Video-file
+                        if (detection_sync) {
+                            detection_data = detect2draw.receive();
+                        }
+                        // for Video-camera
+                        else
+                        {
+                            // get new Detection result if present
+                            if (detect2draw.is_object_present()) {
+                                cv::Mat old_cap_frame = detection_data.cap_frame;   // use old captured frame
+                                detection_data = detect2draw.receive();
+                                if (!old_cap_frame.empty()) detection_data.cap_frame = old_cap_frame;
+                            }
+                            // get new Captured frame
+                            else {
+                                std::vector<bbox_t> old_result_vec = detection_data.result_vec; // use old detections
+                                detection_data = cap2draw.receive();
+                                detection_data.result_vec = old_result_vec;
+                            }
+                        }
+
+                        cv::Mat cap_frame = detection_data.cap_frame;
+                        cv::Mat draw_frame = detection_data.cap_frame.clone();
+                        std::vector<bbox_t> result_vec = detection_data.result_vec;
+
+#ifdef TRACK_OPTFLOW
+                        if (detection_data.new_detection) {
+                            tracker_flow.update_tracking_flow(detection_data.cap_frame, detection_data.result_vec);
+                            while (track_optflow_queue.size() > 0) {
+                                draw_frame = track_optflow_queue.back();
+                                result_vec = tracker_flow.tracking_flow(track_optflow_queue.front(), false);
+                                track_optflow_queue.pop();
+                            }
+                        }
+                        else {
+                            track_optflow_queue.push(cap_frame);
+                            result_vec = tracker_flow.tracking_flow(cap_frame, false);
+                        }
+                        detection_data.new_detection = true;    // to correct kalman filter
+#endif //TRACK_OPTFLOW
+
+                        // track ID by using kalman filter
+                        if (use_kalman_filter) {
+                            if (detection_data.new_detection) {
+                                result_vec = track_kalman.correct(result_vec);
+                            }
+                            else {
+                                result_vec = track_kalman.predict();
+                            }
+                        }
+                        // track ID by using custom function
+                        else {
+                            int frame_story = std::max(5, current_fps_cap.load());
+                            result_vec = detector.tracking_id(result_vec, true, frame_story, 40);
+                        }
+
+                        if (use_zed_camera && !detection_data.zed_cloud.empty()) {
+                            result_vec = get_3d_coordinates(result_vec, detection_data.zed_cloud);
+                        }
+
+                        //small_preview.set(draw_frame, result_vec);
+                        //large_preview.set(draw_frame, result_vec);
+                        draw_boxes(draw_frame, result_vec, obj_names, current_fps_det, current_fps_cap);
+                        //show_console_result(result_vec, obj_names, detection_data.frame_id);
+                        //large_preview.draw(draw_frame);
+                        //small_preview.draw(draw_frame, true);
+
+                        detection_data.result_vec = result_vec;
+                        detection_data.draw_frame = draw_frame;
+                        draw2show.send(detection_data);
+                        if (send_network) draw2net.send(detection_data);
+                        if (output_video.isOpened()) draw2write.send(detection_data);
+                    } while (!detection_data.exit_flag);
+                    std::cout << " t_draw exit \n";
+                });
+
+
+                // write frame to videofile
+                t_write = std::thread([&]()
+                {
+                    if (output_video.isOpened()) {
+                        detection_data_t detection_data;
+                        cv::Mat output_frame;
+                        do {
+                            detection_data = draw2write.receive();
+                            if(detection_data.draw_frame.channels() == 4) cv::cvtColor(detection_data.draw_frame, output_frame, CV_RGBA2RGB);
+                            else output_frame = detection_data.draw_frame;
+                            output_video << output_frame;
+                        } while (!detection_data.exit_flag);
+                        output_video.release();
+                    }
+                    std::cout << " t_write exit \n";
+                });
+
+                // send detection to the network
+                t_network = std::thread([&]()
+                {
+                    if (send_network) {
+                        detection_data_t detection_data;
+                        do {
+                            detection_data = draw2net.receive();
+
+                            detector.send_json_http(detection_data.result_vec, obj_names, detection_data.frame_id, filename);
+
+                        } while (!detection_data.exit_flag);
+                    }
+                    std::cout << " t_network exit \n";
+                });
+
+
+                // show detection
+                detection_data_t detection_data;
+                do {
+
+                    steady_end = std::chrono::steady_clock::now();
+                    float time_sec = std::chrono::duration<double>(steady_end - steady_start).count();
+                    if (time_sec >= 1) {
+                        current_fps_det = fps_det_counter.load() / time_sec;
+                        current_fps_cap = fps_cap_counter.load() / time_sec;
+                        steady_start = steady_end;
+                        fps_det_counter = 0;
+                        fps_cap_counter = 0;
+                    }
+
+                    detection_data = draw2show.receive();
+                    cv::Mat draw_frame = detection_data.draw_frame;
+
+                    //if (extrapolate_flag) {
+                    //    cv::putText(draw_frame, "extrapolate", cv::Point2f(10, 40), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.0, cv::Scalar(50, 50, 0), 2);
+                    //}
+
+                    cv::imshow("window name", draw_frame);
+                    int key = cv::waitKey(3);    // 3 or 16ms
+                    if (key == 'f') show_small_boxes = !show_small_boxes;
+                    if (key == 'p') while (true) if (cv::waitKey(100) == 'p') break;
+                    //if (key == 'e') extrapolate_flag = !extrapolate_flag;
+                    if (key == 27) { exit_flag = true;}
+
+                    //std::cout << " current_fps_det = " << current_fps_det << ", current_fps_cap = " << current_fps_cap << std::endl;
+                } while (!detection_data.exit_flag);
+                std::cout << " show detection exit \n";
+
+                cv::destroyWindow("window name");
+                // wait for all threads
+                if (t_cap.joinable()) t_cap.join();
+                if (t_prepare.joinable()) t_prepare.join();
+                if (t_detect.joinable()) t_detect.join();
+                if (t_post.joinable()) t_post.join();
+                if (t_draw.joinable()) t_draw.join();
+                if (t_write.joinable()) t_write.join();
+                if (t_network.joinable()) t_network.join();
+
+                break;
+
+            }
+            else if (file_ext == "txt") {    // list of image files
+                std::ifstream file(filename);
+                if (!file.is_open()) std::cout << "File not found! \n";
+                else
+                    for (std::string line; file >> line;) {
+                        std::cout << line << std::endl;
+                        cv::Mat mat_img = cv::imread(line);
+                        std::vector<bbox_t> result_vec = detector.detect(mat_img);
+                        show_console_result(result_vec, obj_names);
+                        //draw_boxes(mat_img, result_vec, obj_names);
+                        //cv::imwrite("res_" + line, mat_img);
+                    }
+
+            }
+            else {    // image file
+                // to achive high performance for multiple images do these 2 lines in another thread
+                cv::Mat mat_img = cv::imread(filename);
+                auto det_image = detector.mat_to_image_resize(mat_img);
+
+                auto start = std::chrono::steady_clock::now();
+                std::vector<bbox_t> result_vec = detector.detect_resized(*det_image, mat_img.size().width, mat_img.size().height);
+                auto end = std::chrono::steady_clock::now();
+                std::chrono::duration<double> spent = end - start;
+                std::cout << " Time: " << spent.count() << " sec \n";
+
+                //result_vec = detector.tracking_id(result_vec);    // comment it - if track_id is not required
+                draw_boxes(mat_img, result_vec, obj_names);
+                cv::imshow("window name", mat_img);
+                show_console_result(result_vec, obj_names);
+                cv::waitKey(0);
+            }
+#else   // OPENCV
+            //std::vector<bbox_t> result_vec = detector.detect(filename);
+
+            auto img = detector.load_image(filename);
+            std::vector<bbox_t> result_vec = detector.detect(img);
+            detector.free_image(img);
+            show_console_result(result_vec, obj_names);
+#endif  // OPENCV
+        }
+        catch (std::exception &e) {
+            std::cerr << "exception: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "unknown exception \n";
+        }
+        filename.clear();
+    }
+
+    return 0;
+}
diff --git a/darknet-master/src/yolo_layer.c b/darknet-master/src/yolo_layer.c
new file mode 100644
index 0000000..ac464ba
--- /dev/null
+++ b/darknet-master/src/yolo_layer.c
@@ -0,0 +1,1226 @@
+#include "yolo_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "dark_cuda.h"
+#include "utils.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Build a YOLO detection layer for a w x h grid with n anchors per cell.
+//   batch     - images per batch; every per-image buffer is allocated batch times
+//   w, h      - grid width / height (also stored as out_w / out_h)
+//   n         - number of anchors handled by this layer
+//   total     - total number of anchors; biases[] holds 2 floats per anchor
+//   mask      - anchor indices for this layer's n anchors. When NULL an identity
+//               mask 0..n-1 is allocated; a non-NULL mask is stored as-is (not copied).
+//   classes   - number of classes
+//   max_boxes - ground-truth boxes per image (sizes l.truths)
+// Returns the initialised layer by value. Also prints "yolo" to stderr and
+// reseeds rand() from the current time.
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes)
+{
+    int k;
+    layer l = { (LAYER_TYPE)0 };
+
+    l.type = YOLO;
+
+    // dimensions: each anchor owns 4 box values + 1 objectness + one score per class
+    l.batch = batch;
+    l.n = n;
+    l.total = total;
+    l.classes = classes;
+    l.w = w;
+    l.h = h;
+    l.c = n*(classes + 4 + 1);
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.outputs = h*w*n*(classes + 4 + 1);
+    l.inputs = l.outputs;
+
+    // ground-truth layout: truth_size floats per box, max_boxes boxes per image
+    l.max_boxes = max_boxes;
+    l.truth_size = 4 + 2;
+    l.truths = l.max_boxes*l.truth_size;
+
+    l.cost = (float*)xcalloc(1, sizeof(float));
+
+    // anchor sizes start out at 0.5
+    l.biases = (float*)xcalloc(total * 2, sizeof(float));
+    for (k = 0; k < total*2; ++k) {
+        l.biases[k] = .5;
+    }
+
+    if (!mask) {
+        // no mask supplied: use anchors 0..n-1
+        l.mask = (int*)xcalloc(n, sizeof(int));
+        for (k = 0; k < n; ++k) {
+            l.mask[k] = k;
+        }
+    }
+    else {
+        l.mask = mask;
+    }
+
+    l.bias_updates = (float*)xcalloc(n * 2, sizeof(float));
+
+    // one label / class id per (image, anchor, cell); -1 is the initial value
+    const int cells = batch * l.w*l.h*l.n;
+    l.labels = (int*)xcalloc(cells, sizeof(int));
+    l.class_ids = (int*)xcalloc(cells, sizeof(int));
+    for (k = 0; k < cells; ++k) l.labels[k] = -1;
+    for (k = 0; k < cells; ++k) l.class_ids[k] = -1;
+
+    l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
+
+    l.forward = forward_yolo_layer;
+    l.backward = backward_yolo_layer;
+#ifdef GPU
+    l.forward_gpu = forward_yolo_layer_gpu;
+    l.backward_gpu = backward_yolo_layer_gpu;
+
+    // device buffers are created from the (zeroed) host buffers before those are replaced
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.output_avg_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+
+    // swap the host output buffer for a cudaHostAlloc'ed one; fall back to xcalloc on failure
+    free(l.output);
+    if (cudaSuccess != cudaHostAlloc(&l.output, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) {
+        cudaGetLastError(); // reset CUDA-error
+        l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    }
+    else {
+        l.output_pinned = 1;
+    }
+
+    // same for the host delta buffer
+    free(l.delta);
+    if (cudaSuccess != cudaHostAlloc(&l.delta, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) {
+        cudaGetLastError(); // reset CUDA-error
+        l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
+    }
+    else {
+        l.delta_pinned = 1;
+    }
+#endif
+
+    fprintf(stderr, "yolo\n");
+    srand(time(0));
+
+    return l;
+}
+
+// Resize a YOLO layer in place for a new w x h grid.
+//   l    - layer to resize; l->n, l->classes, l->batch and l->embedding_size are read
+//   w, h - new grid width / height
+// Updates l->outputs / l->inputs, then resizes every per-cell host buffer that
+// is currently allocated (embedding_output, labels, class_ids, output, delta).
+// In GPU builds the pinned host buffers and the device buffers are freed and
+// allocated again at the new size.
+void resize_yolo_layer(layer *l, int w, int h)
+{
+    l->w = w;
+    l->h = h;
+
+    l->outputs = h*w*l->n*(l->classes + 4 + 1);
+    l->inputs = l->outputs;
+
+    // Resize the embedding buffer itself. This used to pass l->output to xrealloc,
+    // which left the old embedding buffer unreferenced and made embedding_output
+    // share a pointer with l->output, which is reallocated/freed again below.
+    if (l->embedding_output) l->embedding_output = (float*)xrealloc(l->embedding_output, l->batch * l->embedding_size * l->n * l->h * l->w * sizeof(float));
+    if (l->labels) l->labels = (int*)xrealloc(l->labels, l->batch * l->n * l->h * l->w * sizeof(int));
+    if (l->class_ids) l->class_ids = (int*)xrealloc(l->class_ids, l->batch * l->n * l->h * l->w * sizeof(int));
+
+    // Pinned buffers came from cudaHostAlloc and are handled in the GPU section below.
+    if (!l->output_pinned) l->output = (float*)xrealloc(l->output, l->batch*l->outputs * sizeof(float));
+    if (!l->delta_pinned) l->delta = (float*)xrealloc(l->delta, l->batch*l->outputs*sizeof(float));
+
+#ifdef GPU
+    if (l->output_pinned) {
+        CHECK_CUDA(cudaFreeHost(l->output));
+        if (cudaSuccess != cudaHostAlloc(&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
+            // pinned allocation failed: fall back to ordinary zeroed host memory
+            l->output = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
+            l->output_pinned = 0;
+        }
+    }
+
+    if (l->delta_pinned) {
+        CHECK_CUDA(cudaFreeHost(l->delta));
+        if (cudaSuccess != cudaHostAlloc(&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
+            // pinned allocation failed: fall back to ordinary zeroed host memory
+            l->delta = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
+            l->delta_pinned = 0;
+        }
+    }
+
+    // device buffers are dropped and rebuilt from the resized host buffers
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+    cuda_free(l->output_avg_gpu);
+
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+    l->output_avg_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+// Decode one predicted box from the raw layer values in x[].
+//   x          - layer values; the box's four channels sit at index + k*stride, k = 0..3
+//   biases     - anchor sizes, 2 floats per anchor (width at 2*n, height at 2*n + 1)
+//   n          - anchor index into biases
+//   i, j       - cell column / row
+//   lw, lh     - layer grid width / height (divide the centre)
+//   w, h       - divisors for the box width / height (process_batch passes net.w / net.h)
+//   new_coords - non-zero: size = value^2 * 4 * anchor; zero: size = exp(value) * anchor
+// Returns the box with x, y, w, h filled in.
+box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride, int new_coords)
+{
+    const float raw_w = x[index + 2 * stride];
+    const float raw_h = x[index + 3 * stride];
+    box b;
+
+    // the centre is decoded the same way in both coordinate modes
+    b.x = (i + x[index + 0 * stride]) / lw;
+    b.y = (j + x[index + 1 * stride]) / lh;
+
+    if (!new_coords) {
+        // inverse of  w = ln(t.w * net.w / anchors_w),  h = ln(t.h * net.h / anchors_h)
+        b.w = exp(raw_w) * biases[2 * n] / w;
+        b.h = exp(raw_h) * biases[2 * n + 1] / h;
+    }
+    else {
+        b.w = raw_w * raw_w * 4 * biases[2 * n] / w;
+        b.h = raw_h * raw_h * 4 * biases[2 * n + 1] / h;
+    }
+    return b;
+}
+
+// Return val unchanged, or 0 when val is NaN or +/-infinity.
+static inline float fix_nan_inf(float val)
+{
+    return (isnan(val) || isinf(val)) ? 0 : val;
+}
+
+// Clamp val into [-max_val, max_val]; the upper bound is tested first.
+static inline float clip_value(float val, const float max_val)
+{
+    if (val > max_val) return max_val;
+    if (val < -max_val) return -max_val;
+    return val;
+}
+
+// Write the box-regression gradient for one predicted box against one truth box.
+//   truth          - ground-truth box
+//   x, biases, n, index, i, j, lw, lh, w, h, stride, new_coords
+//                  - forwarded to get_yolo_box() to decode the prediction
+//   delta          - gradient buffer; the four box channels sit at index + k*stride
+//   scale          - extra weight, used by the MSE loss only
+//   iou_normalizer - weight applied to every box gradient
+//   iou_loss       - MSE selects the coordinate-regression loss; any other value
+//                    takes the gradient from dx_box_iou()
+//   accumulate     - IoU losses only: zero clears the four deltas before adding
+//                    (the MSE branch always adds)
+//   max_delta      - IoU losses only: clip each gradient to +/-max_delta unless it is FLT_MAX
+//   rewritten_bbox - incremented when any of the four deltas is already non-zero on entry
+// Returns iou/giou/diou/ciou of prediction vs. truth (plus dx_iou for IoU losses).
+ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate, float max_delta, int *rewritten_bbox, int new_coords)
+{
+    // offsets of the x, y, w, h channels inside x[] and delta[]
+    const int ix = index + 0 * stride;
+    const int iy = index + 1 * stride;
+    const int iw = index + 2 * stride;
+    const int ih = index + 3 * stride;
+    int k;
+
+    if (delta[ix] || delta[iy] || delta[iw] || delta[ih]) {
+        (*rewritten_bbox)++;
+    }
+
+    ious all_ious = { 0 };
+    // i - step in layer width, j - step in layer height
+    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride, new_coords);
+    all_ious.iou = box_iou(pred, truth);
+    all_ious.giou = box_giou(pred, truth);
+    all_ious.diou = box_diou(pred, truth);
+    all_ious.ciou = box_ciou(pred, truth);
+
+    // avoid nan in dx_box_iou
+    if (pred.w == 0) { pred.w = 1.0; }
+    if (pred.h == 0) { pred.h = 1.0; }
+
+    if (iou_loss == MSE) {
+        // old loss: regress the raw outputs towards the encoded truth
+        const float tx = (truth.x*lw - i);
+        const float ty = (truth.y*lh - j);
+        float tw;
+        float th;
+
+        if (new_coords) {
+            tw = sqrt(truth.w*w / (4 * biases[2 * n]));
+            th = sqrt(truth.h*h / (4 * biases[2 * n + 1]));
+        }
+        else {
+            tw = log(truth.w*w / biases[2 * n]);
+            th = log(truth.h*h / biases[2 * n + 1]);
+        }
+
+        delta[ix] += scale * (tx - x[ix]) * iou_normalizer;
+        delta[iy] += scale * (ty - x[iy]) * iou_normalizer;
+        delta[iw] += scale * (tw - x[iw]) * iou_normalizer;
+        delta[ih] += scale * (th - x[ih]) * iou_normalizer;
+
+        return all_ious;
+    }
+
+    // IoU-family losses
+    // https://github.com/generalized-iou/g-darknet
+    // https://arxiv.org/abs/1902.09630v2
+    // https://giou.stanford.edu/
+    all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss);
+
+    // jacobian^t (transpose): the x, y, w, h gradients are read from dt, db, dl, dr
+    float grad[4];
+    grad[0] = all_ious.dx_iou.dt;
+    grad[1] = all_ious.dx_iou.db;
+    grad[2] = all_ious.dx_iou.dl;
+    grad[3] = all_ious.dx_iou.dr;
+
+    // w,h are predicted through exp(), so apply its gradient; new_coords adds no extra factor
+    if (!new_coords) {
+        grad[2] *= exp(x[iw]);
+        grad[3] *= exp(x[ih]);
+    }
+
+    for (k = 0; k < 4; ++k) {
+        grad[k] *= iou_normalizer;              // normalize iou weight
+        grad[k] = fix_nan_inf(grad[k]);
+        if (max_delta != FLT_MAX) grad[k] = clip_value(grad[k], max_delta);
+    }
+
+    if (!accumulate) {
+        for (k = 0; k < 4; ++k) delta[index + k * stride] = 0;
+    }
+
+    // accumulate delta
+    for (k = 0; k < 4; ++k) delta[index + k * stride] += grad[k];
+
+    return all_ious;
+}
+
+// Divide the four box deltas at box_index by the number of classes whose
+// delta is positive; nothing is changed when no class delta is positive.
+//   class_index - offset of the first class delta; class c sits at class_index + stride*c
+//   box_index   - offset of the first box delta; channel k sits at box_index + k*stride
+void averages_yolo_deltas(int class_index, int box_index, int stride, int classes, float *delta)
+{
+    int positive = 0;
+    int k;
+
+    for (k = 0; k < classes; ++k) {
+        positive += (delta[class_index + stride*k] > 0) ? 1 : 0;
+    }
+
+    if (positive <= 0) return;
+
+    for (k = 0; k < 4; ++k) {
+        delta[box_index + k * stride] /= positive;
+    }
+}
+
+// Write the class gradients for one box whose true class is class_id.
+//   output, delta       - layer outputs / gradients; class c sits at index + stride*c
+//   avg_cat             - optional; the output for class_id is added to it
+//   focal_loss          - non-zero selects the focal-loss gradient
+//   label_smooth_eps    - non-zero: target becomes y*(1 - eps) + 0.5*eps
+//   classes_multipliers - optional per-class weight applied to the class_id delta
+//   cls_normalizer      - extra weight, used only together with classes_multipliers
+//                         on the default (non-focal) path
+// If delta for class_id is already non-zero, only that one entry is refreshed.
+void delta_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat, int focal_loss, float label_smooth_eps, float *classes_multipliers, float cls_normalizer)
+{
+    const int target = index + stride*class_id;
+    int c;
+
+    // the target entry was already written: update it alone and stop
+    if (delta[target]) {
+        float y_true = 1;
+        if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        const float diff = y_true - output[target];
+        if (!isnan(diff) && !isinf(diff)) delta[target] = diff;
+
+        if (classes_multipliers) delta[target] *= classes_multipliers[class_id];
+        if (avg_cat) *avg_cat += output[target];
+        return;
+    }
+
+    if (!focal_loss) {
+        // default: delta = target - output for every class
+        for (c = 0; c < classes; ++c) {
+            const int pos = index + stride*c;
+            const int is_target = (c == class_id);
+            float y_true = is_target ? 1 : 0;
+            if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+            const float diff = y_true - output[pos];
+            if (!isnan(diff) && !isinf(diff)) delta[pos] = diff;
+
+            if (is_target) {
+                if (classes_multipliers) delta[target] *= classes_multipliers[class_id] * cls_normalizer;
+                if (avg_cat) *avg_cat += output[pos];
+            }
+        }
+        return;
+    }
+
+    // Focal loss (gamma = 2 is baked into the gradient formula)
+    const float alpha = 0.5;    // 0.25 or 0.5
+    const float pt = output[target] + 0.000000000000001F;
+    // http://blog.csdn.net/linmingan/article/details/77885832
+    const float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);
+
+    for (c = 0; c < classes; ++c) {
+        const int pos = index + stride*c;
+
+        delta[pos] = (((c == class_id) ? 1 : 0) - output[pos]);
+        delta[pos] *= alpha*grad;
+
+        if (c == class_id && avg_cat) *avg_cat += output[pos];
+    }
+}
+
+// Return 1 if any of the `classes` scores starting at class_index (spaced by
+// stride) is greater than conf_thresh, otherwise 0.
+// objectness and class_id are accepted but do not take part in the comparison.
+int compare_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id, float conf_thresh)
+{
+    int above_thresh = 0;
+    int c;
+
+    (void)objectness;
+    (void)class_id;
+
+    // stop at the first score over the threshold
+    for (c = 0; c < classes && !above_thresh; ++c) {
+        above_thresh = (output[class_index + stride*c] > conf_thresh);
+    }
+    return above_thresh;
+}
+
+// Flat offset into the layer's output/delta buffers.
+//   batch    - image index within the batch
+//   location - anchor * (l.w*l.h) + cell position inside the grid
+//   entry    - channel within the anchor (callers in this file pass 0 for the box,
+//              4 for objectness, 4 + 1 for the first class)
+static int entry_index(layer l, int batch, int location, int entry)
+{
+    const int plane = l.w*l.h;              // cells in one channel
+    const int anchor = location / plane;
+    const int cell = location % plane;
+
+    return batch*l.outputs + anchor*plane*(4+l.classes+1) + entry*plane + cell;
+}
+
+// Argument bundle passed to process_batch() through its void* parameter.
+typedef struct train_yolo_args {
+    layer l;                // layer to process; process_batch copies it into a const local
+    network_state state;    // network state; process_batch reads state.truth and state.net
+    int b;                  // index of the batch item to process
+
+    // The fields below are not touched in the part of process_batch visible here;
+    // presumably per-batch-item statistics reported back to the caller -- TODO confirm.
+    float tot_iou;
+    float tot_giou_loss;
+    float tot_iou_loss;
+    int count;
+    int class_count;
+} train_yolo_args;
+
+void *process_batch(void* ptr)
+{
+    {
+        train_yolo_args *args = (train_yolo_args*)ptr;
+        const layer l = args->l;
+        network_state state = args->state;
+        int b = args->b;
+
+        int i, j, t, n;
+
+        //printf(" b = %d \n", b, b);
+
+        //float tot_iou = 0;
+        float tot_giou = 0;
+        float tot_diou = 0;
+        float tot_ciou = 0;
+        //float tot_iou_loss = 0;
+        //float tot_giou_loss = 0;
+        float tot_diou_loss = 0;
+        float tot_ciou_loss = 0;
+        float recall = 0;
+        float recall75 = 0;
+        float avg_cat = 0;
+        float avg_obj = 0;
+        float avg_anyobj = 0;
+        //int count = 0;
+        //int class_count = 0;
+
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    const int class_index = entry_index(l, b, n * l.w * l.h + j * l.w + i, 4 + 1);
+                    const int obj_index = entry_index(l, b, n * l.w * l.h + j * l.w + i, 4);
+                    const int box_index = entry_index(l, b, n * l.w * l.h + j * l.w + i, 0);
+                    const int stride = l.w * l.h;
+                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w * l.h, l.new_coords);
+                    float best_match_iou = 0;
+                    int best_match_t = 0;
+                    float best_iou = 0;
+                    int best_t = 0;
+                    for (t = 0; t < l.max_boxes; ++t) {
+                        box truth = float_to_box_stride(state.truth + t * l.truth_size + b * l.truths, 1);
+                        if (!truth.x) break;  // continue;
+                        int class_id = state.truth[t * l.truth_size + b * l.truths + 4];
+                        if (class_id >= l.classes || class_id < 0) {
+                            printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
+                            printf("\n truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
+                            continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
+                        }
+
+                        float objectness = l.output[obj_index];
+                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
+                        int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w * l.h, objectness, class_id, 0.25f);
+
+                        float iou = box_iou(pred, truth);
+                        if (iou > best_match_iou && class_id_match == 1) {
+                            best_match_iou = iou;
+                            best_match_t = t;
+                        }
+                        if (iou > best_iou) {
+                            best_iou = iou;
+                            best_t = t;
+                        }
+                    }
+
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = l.obj_normalizer * (0 - l.output[obj_index]);
+                    if (best_match_iou > l.ignore_thresh) {
+                        if (l.objectness_smooth) {
+                            const float delta_obj = l.obj_normalizer * (best_match_iou - l.output[obj_index]);
+                            if (delta_obj > l.delta[obj_index]) l.delta[obj_index] = delta_obj;
+
+                        }
+                        else l.delta[obj_index] = 0;
+                    }
+                    else if (state.net.adversarial) {
+                        int stride = l.w * l.h;
+                        float scale = pred.w * pred.h;
+                        if (scale > 0) scale = sqrt(scale);
+                        l.delta[obj_index] = scale * l.obj_normalizer * (0 - l.output[obj_index]);
+                        int cl_id;
+                        int found_object = 0;
+                        for (cl_id = 0; cl_id < l.classes; ++cl_id) {
+                            if (l.output[class_index + stride * cl_id] * l.output[obj_index] > 0.25) {
+                                l.delta[class_index + stride * cl_id] = scale * (0 - l.output[class_index + stride * cl_id]);
+                                found_object = 1;
+                            }
+                        }
+                        if (found_object) {
+                            // don't use this loop for adversarial attack drawing
+                            for (cl_id = 0; cl_id < l.classes; ++cl_id)
+                                if (l.output[class_index + stride * cl_id] * l.output[obj_index] < 0.25)
+                                    l.delta[class_index + stride * cl_id] = scale * (1 - l.output[class_index + stride * cl_id]);
+
+                            l.delta[box_index + 0 * stride] += scale * (0 - l.output[box_index + 0 * stride]);
+                            l.delta[box_index + 1 * stride] += scale * (0 - l.output[box_index + 1 * stride]);
+                            l.delta[box_index + 2 * stride] += scale * (0 - l.output[box_index + 2 * stride]);
+                            l.delta[box_index + 3 * stride] += scale * (0 - l.output[box_index + 3 * stride]);
+                        }
+                    }
+                    if (best_iou > l.truth_thresh) {
+                        const float iou_multiplier = best_iou * best_iou;// (best_iou - l.truth_thresh) / (1.0 - l.truth_thresh);
+                        if (l.objectness_smooth) l.delta[obj_index] = l.obj_normalizer * (iou_multiplier - l.output[obj_index]);
+                        else l.delta[obj_index] = l.obj_normalizer * (1 - l.output[obj_index]);
+                        //l.delta[obj_index] = l.obj_normalizer * (1 - l.output[obj_index]);
+
+                        int class_id = state.truth[best_t * l.truth_size + b * l.truths + 4];
+                        if (l.map) class_id = l.map[class_id];
+                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w * l.h, 0, l.focal_loss, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                        if (l.objectness_smooth) l.delta[class_index + stride * class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride * class_id]);
+                        box truth = float_to_box_stride(state.truth + best_t * l.truth_size + b * l.truths, 1);
+                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w * truth.h), l.w * l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox, l.new_coords);
+                        (*state.net.total_bbox)++;
+                    }
+                }
+            }
+        }
+        for (t = 0; t < l.max_boxes; ++t) {
+            box truth = float_to_box_stride(state.truth + t * l.truth_size + b * l.truths, 1);
+            if (!truth.x) break;  // continue;
+            if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) {
+                char buff[256];
+                printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", truth.x, truth.y, truth.w, truth.h);
+                sprintf(buff, "echo \"Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f\" >> bad_label.list",
+                    truth.x, truth.y, truth.w, truth.h);
+                system(buff);
+            }
+            int class_id = state.truth[t * l.truth_size + b * l.truths + 4];
+            if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
+
+            float best_iou = 0;
+            int best_n = 0;
+            i = (truth.x * l.w);
+            j = (truth.y * l.h);
+            box truth_shift = truth;
+            truth_shift.x = truth_shift.y = 0;
+            for (n = 0; n < l.total; ++n) {
+                box pred = { 0 };
+                pred.w = l.biases[2 * n] / state.net.w;
+                pred.h = l.biases[2 * n + 1] / state.net.h;
+                float iou = box_iou(pred, truth_shift);
+                if (iou > best_iou) {
+                    best_iou = iou;
+                    best_n = n;
+                }
+            }
+
+            int mask_n = int_index(l.mask, best_n, l.n);
+            if (mask_n >= 0) {
+                int class_id = state.truth[t * l.truth_size + b * l.truths + 4];
+                if (l.map) class_id = l.map[class_id];
+
+                int box_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 0);
+                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w * truth.h), l.w * l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox, l.new_coords);
+                (*state.net.total_bbox)++;
+
+                const int truth_in_index = t * l.truth_size + b * l.truths + 5;
+                const int track_id = state.truth[truth_in_index];
+                const int truth_out_index = b * l.n * l.w * l.h + mask_n * l.w * l.h + j * l.w + i;
+                l.labels[truth_out_index] = track_id;
+                l.class_ids[truth_out_index] = class_id;
+                //printf(" track_id = %d, t = %d, b = %d, truth_in_index = %d, truth_out_index = %d \n", track_id, t, b, truth_in_index, truth_out_index);
+
+                // range is 0 <= 1
+                args->tot_iou += all_ious.iou;
+                args->tot_iou_loss += 1 - all_ious.iou;
+                // range is -1 <= giou <= 1
+                tot_giou += all_ious.giou;
+                args->tot_giou_loss += 1 - all_ious.giou;
+
+                tot_diou += all_ious.diou;
+                tot_diou_loss += 1 - all_ious.diou;
+
+                tot_ciou += all_ious.ciou;
+                tot_ciou_loss += 1 - all_ious.ciou;
+
+                int obj_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 4);
+                avg_obj += l.output[obj_index];
+                if (l.objectness_smooth) {
+                    float delta_obj = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+                    if (l.delta[obj_index] == 0) l.delta[obj_index] = delta_obj;
+                }
+                else l.delta[obj_index] = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+
+                int class_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 4 + 1);
+                delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w * l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+
+                //printf(" label: class_id = %d, truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", class_id, truth.x, truth.y, truth.w, truth.h);
+                //printf(" mask_n = %d, l.output[obj_index] = %f, l.output[class_index + class_id] = %f \n\n", mask_n, l.output[obj_index], l.output[class_index + class_id]);
+
+                ++(args->count);
+                ++(args->class_count);
+                if (all_ious.iou > .5) recall += 1;
+                if (all_ious.iou > .75) recall75 += 1;
+            }
+
+            // iou_thresh
+            for (n = 0; n < l.total; ++n) {
+                int mask_n = int_index(l.mask, n, l.n);
+                if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
+                    box pred = { 0 };
+                    pred.w = l.biases[2 * n] / state.net.w;
+                    pred.h = l.biases[2 * n + 1] / state.net.h;
+                    float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU, GIOU, MSE, DIOU, CIOU
+                    // iou, n
+
+                    if (iou > l.iou_thresh) {
+                        int class_id = state.truth[t * l.truth_size + b * l.truths + 4];
+                        if (l.map) class_id = l.map[class_id];
+
+                        int box_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 0);
+                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
+                        ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w * truth.h), l.w * l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox, l.new_coords);
+                        (*state.net.total_bbox)++;
+
+                        // range is 0 <= 1
+                        args->tot_iou += all_ious.iou;
+                        args->tot_iou_loss += 1 - all_ious.iou;
+                        // range is -1 <= giou <= 1
+                        tot_giou += all_ious.giou;
+                        args->tot_giou_loss += 1 - all_ious.giou;
+
+                        tot_diou += all_ious.diou;
+                        tot_diou_loss += 1 - all_ious.diou;
+
+                        tot_ciou += all_ious.ciou;
+                        tot_ciou_loss += 1 - all_ious.ciou;
+
+                        int obj_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 4);
+                        avg_obj += l.output[obj_index];
+                        if (l.objectness_smooth) {
+                            float delta_obj = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+                            if (l.delta[obj_index] == 0) l.delta[obj_index] = delta_obj;
+                        }
+                        else l.delta[obj_index] = class_multiplier * l.obj_normalizer * (1 - l.output[obj_index]);
+
+                        int class_index = entry_index(l, b, mask_n * l.w * l.h + j * l.w + i, 4 + 1);
+                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w * l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers, l.cls_normalizer);
+
+                        ++(args->count);
+                        ++(args->class_count);
+                        if (all_ious.iou > .5) recall += 1;
+                        if (all_ious.iou > .75) recall75 += 1;
+                    }
+                }
+            }
+        }
+
+        if (l.iou_thresh < 1.0f) {
+            // averages the deltas obtained by the function: delta_yolo_box()_accumulate
+            for (j = 0; j < l.h; ++j) {
+                for (i = 0; i < l.w; ++i) {
+                    for (n = 0; n < l.n; ++n) {
+                        int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
+                        int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
+                        const int stride = l.w*l.h;
+
+                        if (l.delta[obj_index] != 0)
+                            averages_yolo_deltas(class_index, box_index, stride, l.classes, l.delta);
+                    }
+                }
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+
+
+/*
+ * CPU forward pass of the YOLO layer.
+ *
+ * Copies state.input into l.output, applies (in non-GPU builds) the logistic
+ * activation and the x/y scaling, and zeroes l.delta.  When state.train is set
+ * it then starts one process_batch() thread per batch item, sums the per-item
+ * statistics, optionally filters l.delta (bad-label rejection and
+ * equidistant-point search), stores the loss in *l.cost and prints a summary
+ * line to stderr.
+ *
+ * l:     the YOLO layer; l.output, l.delta, l.labels, l.class_ids and *l.cost
+ *        are written through the pointers held by this by-value copy.
+ * state: network state; state.input is read, state.train selects training,
+ *        and the rolling statistics reachable through state.net are updated.
+ */
+void forward_yolo_layer(const layer l, network_state state)
+{
+    memcpy(l.output, state.input, l.outputs*l.batch * sizeof(float));
+    int b, n;
+
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            int bbox_index = entry_index(l, b, n*l.w*l.h, 0);
+            // with new_coords no activation is applied here
+            if (!l.new_coords) {
+                activate_array(l.output + bbox_index, 2 * l.w*l.h, LOGISTIC);        // x,y
+                int obj_index = entry_index(l, b, n*l.w*l.h, 4);
+                activate_array(l.output + obj_index, (1 + l.classes)*l.w*l.h, LOGISTIC);    // objectness and classes
+            }
+            scal_add_cpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + bbox_index, 1);    // scale x,y
+        }
+    }
+#endif
+
+    // delta is zeroed
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if (!state.train) return;
+
+    int i;
+    for (i = 0; i < l.batch * l.w*l.h*l.n; ++i) l.labels[i] = -1;
+    for (i = 0; i < l.batch * l.w*l.h*l.n; ++i) l.class_ids[i] = -1;
+
+    // totals accumulated over all batch items
+    float tot_iou = 0;
+    float tot_iou_loss = 0;
+    float tot_giou_loss = 0;
+    int count = 0;
+    int class_count = 0;
+    *(l.cost) = 0;
+
+    // one worker thread per batch item; xcalloc is used for both arrays so an
+    // allocation failure is reported instead of dereferencing NULL below
+    int num_threads = l.batch;
+    pthread_t* threads = (pthread_t*)xcalloc(num_threads, sizeof(pthread_t));
+
+    struct train_yolo_args* yolo_args = (train_yolo_args*)xcalloc(l.batch, sizeof(struct train_yolo_args));
+
+    for (b = 0; b < l.batch; b++)
+    {
+        yolo_args[b].l = l;
+        yolo_args[b].state = state;
+        yolo_args[b].b = b;
+
+        yolo_args[b].tot_iou = 0;
+        yolo_args[b].tot_iou_loss = 0;
+        yolo_args[b].tot_giou_loss = 0;
+        yolo_args[b].count = 0;
+        yolo_args[b].class_count = 0;
+
+        if (pthread_create(&threads[b], 0, process_batch, &(yolo_args[b]))) error("Thread creation failed", DARKNET_LOC);
+    }
+
+    for (b = 0; b < l.batch; b++)
+    {
+        pthread_join(threads[b], 0);
+
+        tot_iou += yolo_args[b].tot_iou;
+        tot_iou_loss += yolo_args[b].tot_iou_loss;
+        tot_giou_loss += yolo_args[b].tot_giou_loss;
+        count += yolo_args[b].count;
+        class_count += yolo_args[b].class_count;
+    }
+
+    free(yolo_args);
+    free(threads);
+
+    // Search for an equidistant point from the distant boundaries of the local minimum
+    int iteration_num = get_current_iteration(state.net);
+    const int start_point = state.net.max_batches * 3 / 4;
+    //printf(" equidistant_point ep = %d, it = %d \n", state.net.equidistant_point, iteration_num);
+
+    if ((state.net.badlabels_rejection_percentage && start_point < iteration_num) ||
+        (state.net.num_sigmas_reject_badlabels && start_point < iteration_num) ||
+        (state.net.equidistant_point && state.net.equidistant_point < iteration_num))
+    {
+        const float progress_it = iteration_num - state.net.equidistant_point;
+        const float progress = progress_it / (state.net.max_batches - state.net.equidistant_point);
+        float ep_loss_threshold = (*state.net.delta_rolling_avg) * progress * 1.4;
+
+        // mean and maximum of |delta| over the non-zero deltas
+        float cur_max = 0;
+        float cur_avg = 0;
+        float counter = 0;
+        for (i = 0; i < l.batch * l.outputs; ++i) {
+
+            if (l.delta[i] != 0) {
+                counter++;
+                cur_avg += fabs(l.delta[i]);
+
+                if (cur_max < fabs(l.delta[i]))
+                    cur_max = fabs(l.delta[i]);
+            }
+        }
+
+        // without any non-zero delta the mean stays 0; dividing would give
+        // 0/0 = NaN and poison delta_rolling_avg for every later iteration
+        if (counter > 0) cur_avg = cur_avg / counter;
+
+        if (*state.net.delta_rolling_max == 0) *state.net.delta_rolling_max = cur_max;
+        *state.net.delta_rolling_max = *state.net.delta_rolling_max * 0.99 + cur_max * 0.01;
+        *state.net.delta_rolling_avg = *state.net.delta_rolling_avg * 0.99 + cur_avg * 0.01;
+
+        // reject high loss to filter bad labels
+        if (state.net.num_sigmas_reject_badlabels && start_point < iteration_num)
+        {
+            const float rolling_std = (*state.net.delta_rolling_std);
+            const float rolling_max = (*state.net.delta_rolling_max);
+            const float rolling_avg = (*state.net.delta_rolling_avg);
+            const float progress_badlabels = (float)(iteration_num - start_point) / (start_point);
+
+            float cur_std = 0;
+            float std_counter = 0;
+            for (i = 0; i < l.batch * l.outputs; ++i) {
+                if (l.delta[i] != 0) {
+                    std_counter++;
+                    cur_std += pow(l.delta[i] - rolling_avg, 2);
+                }
+            }
+            // same 0/0 guard as for cur_avg above
+            if (std_counter > 0) cur_std = sqrt(cur_std / std_counter);
+
+            *state.net.delta_rolling_std = *state.net.delta_rolling_std * 0.99 + cur_std * 0.01;
+
+            float final_badlebels_threshold = rolling_avg + rolling_std * state.net.num_sigmas_reject_badlabels;
+            float badlabels_threshold = rolling_max - progress_badlabels * fabs(rolling_max - final_badlebels_threshold);
+            badlabels_threshold = max_val_cmp(final_badlebels_threshold, badlabels_threshold);
+            for (i = 0; i < l.batch * l.outputs; ++i) {
+                if (fabs(l.delta[i]) > badlabels_threshold)
+                    l.delta[i] = 0;
+            }
+            printf(" rolling_std = %f, rolling_max = %f, rolling_avg = %f \n", rolling_std, rolling_max, rolling_avg);
+            printf(" badlabels loss_threshold = %f, start_it = %d, progress = %f \n", badlabels_threshold, start_point, progress_badlabels *100);
+
+            ep_loss_threshold = min_val_cmp(final_badlebels_threshold, rolling_avg) * progress;
+        }
+
+
+        // reject some percent of the highest deltas to filter bad labels
+        if (state.net.badlabels_rejection_percentage && start_point < iteration_num) {
+            if (*state.net.badlabels_reject_threshold == 0)
+                *state.net.badlabels_reject_threshold = *state.net.delta_rolling_max;
+
+            printf(" badlabels_reject_threshold = %f \n", *state.net.badlabels_reject_threshold);
+
+            const float num_deltas_per_anchor = (l.classes + 4 + 1);
+            float counter_reject = 0;
+            float counter_all = 0;
+            for (i = 0; i < l.batch * l.outputs; ++i) {
+                if (l.delta[i] != 0) {
+                    counter_all++;
+                    if (fabs(l.delta[i]) > (*state.net.badlabels_reject_threshold)) {
+                        counter_reject++;
+                        l.delta[i] = 0;
+                    }
+                }
+            }
+            // report 0 % instead of NaN when there was nothing to reject from
+            float cur_percent = 0;
+            if (counter_all > 0) cur_percent = 100 * (counter_reject*num_deltas_per_anchor / counter_all);
+            if (cur_percent > state.net.badlabels_rejection_percentage) {
+                *state.net.badlabels_reject_threshold += 0.01;
+                printf(" increase!!! \n");
+            }
+            else if (*state.net.badlabels_reject_threshold > 0.01) {
+                *state.net.badlabels_reject_threshold -= 0.01;
+                printf(" decrease!!! \n");
+            }
+
+            printf(" badlabels_reject_threshold = %f, cur_percent = %f, badlabels_rejection_percentage = %f, delta_rolling_max = %f \n",
+                *state.net.badlabels_reject_threshold, cur_percent, state.net.badlabels_rejection_percentage, *state.net.delta_rolling_max);
+        }
+
+
+        // reject low loss to find equidistant point
+        if (state.net.equidistant_point && state.net.equidistant_point < iteration_num) {
+            printf(" equidistant_point loss_threshold = %f, start_it = %d, progress = %3.1f %% \n", ep_loss_threshold, state.net.equidistant_point, progress * 100);
+            for (i = 0; i < l.batch * l.outputs; ++i) {
+                if (fabs(l.delta[i]) < ep_loss_threshold)
+                    l.delta[i] = 0;
+            }
+        }
+    }
+
+    // avoid dividing by zero in the averages printed below
+    if (count == 0) count = 1;
+    if (class_count == 0) class_count = 1;
+
+    if (l.show_details == 0) {
+        float loss = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+        *(l.cost) = loss;
+
+        loss /= l.batch;
+
+        fprintf(stderr, "v3 (%s loss, Normalizer: (iou: %.2f, obj: %.2f, cls: %.2f) Region %d Avg (IOU: %f), count: %d, total_loss = %f \n",
+            (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.obj_normalizer, l.cls_normalizer, state.index, tot_iou / count, count, loss);
+    }
+    else {
+        // show detailed output
+
+        // copy of the deltas with the four box entries zeroed, so that only
+        // the objectness and class deltas contribute to classification_loss
+        int stride = l.w*l.h;
+        float* no_iou_loss_delta = (float *)xcalloc(l.batch * l.outputs, sizeof(float));
+        memcpy(no_iou_loss_delta, l.delta, l.batch * l.outputs * sizeof(float));
+
+
+        int j, n;
+        for (b = 0; b < l.batch; ++b) {
+            for (j = 0; j < l.h; ++j) {
+                for (i = 0; i < l.w; ++i) {
+                    for (n = 0; n < l.n; ++n) {
+                        int index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                        no_iou_loss_delta[index + 0 * stride] = 0;
+                        no_iou_loss_delta[index + 1 * stride] = 0;
+                        no_iou_loss_delta[index + 2 * stride] = 0;
+                        no_iou_loss_delta[index + 3 * stride] = 0;
+                    }
+                }
+            }
+        }
+
+        float classification_loss = l.obj_normalizer * pow(mag_array(no_iou_loss_delta, l.outputs * l.batch), 2);
+        free(no_iou_loss_delta);
+        float loss = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+        float iou_loss = loss - classification_loss;
+
+        float avg_iou_loss = 0;
+        *(l.cost) = loss;
+
+        // gIOU loss + MSE (objectness) loss
+        if (l.iou_loss == MSE) {
+            *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
+        }
+        else {
+            // Always compute classification loss both for iou + cls loss and for logging with mse loss
+            // TODO: remove IOU loss fields before computing MSE on class
+            //   probably split into two arrays
+            if (l.iou_loss == GIOU) {
+                avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_giou_loss / count) : 0;
+            }
+            else {
+                avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_iou_loss / count) : 0;
+            }
+            *(l.cost) = avg_iou_loss + classification_loss;
+        }
+
+
+        loss /= l.batch;
+        classification_loss /= l.batch;
+        iou_loss /= l.batch;
+
+        fprintf(stderr, "v3 (%s loss, Normalizer: (iou: %.2f, obj: %.2f, cls: %.2f) Region %d Avg (IOU: %f), count: %d, class_loss = %f, iou_loss = %f, total_loss = %f \n",
+            (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.obj_normalizer, l.cls_normalizer, state.index, tot_iou / count, count, classification_loss, iou_loss, loss);
+    }
+}
+
+// CPU backward pass: hands l.delta and state.delta to axpy_cpu with a
+// coefficient of 1 over l.batch*l.inputs values.
+void backward_yolo_layer(const layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    axpy_cpu(total, 1, l.delta, 1, state.delta, 1);
+}
+
+// Rescales detection boxes, in place, from network-input coordinates to
+// image coordinates.
+// dets,n: detections to adjust and how many of them
+// w,h: image width,height
+// netw,neth: network width,height
+// relative: when 0 the resulting boxes are additionally multiplied by w,h
+// letter: when non-zero the image is taken to fill only an aspect-ratio
+//         preserving part of the network input (presumably letterboxing -
+//         confirm with callers)
+void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter)
+{
+    // size of the part of the network input that the image occupies
+    int new_w = netw;
+    int new_h = neth;
+    if (letter) {
+        if (((float)netw / w) < ((float)neth / h)) {
+            // width is the limiting side: full width, reduced height
+            new_h = (h * netw) / w;
+        }
+        else {
+            // height is the limiting side: full height, reduced width
+            new_w = (w * neth) / h;
+        }
+    }
+
+    // unused margin of the network input, split evenly on both sides below
+    const float pad_w = netw - new_w;
+    const float pad_h = neth - new_h;
+    // fraction of the network input covered by the image
+    const float scale_w = (float)new_w / netw;
+    const float scale_h = (float)new_h / neth;
+
+    int k;
+    for (k = 0; k < n; ++k) {
+        box *bb = &dets[k].bbox;
+
+        // remove half of the margin, then stretch back to the full range
+        bb->x = (bb->x - pad_w / 2. / netw) / scale_w;
+        bb->y = (bb->y - pad_h / 2. / neth) / scale_h;
+        bb->w *= 1 / scale_w;
+        bb->h *= 1 / scale_h;
+
+        if (relative) continue;
+
+        // absolute coordinates requested: scale by the image size
+        bb->x *= w;
+        bb->w *= w;
+        bb->y *= h;
+        bb->h *= h;
+    }
+}
+
+/*
+void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter)
+{
+    int i;
+    int new_w=0;
+    int new_h=0;
+    if (letter) {
+        if (((float)netw / w) < ((float)neth / h)) {
+            new_w = netw;
+            new_h = (h * netw) / w;
+        }
+        else {
+            new_h = neth;
+            new_w = (w * neth) / h;
+        }
+    }
+    else {
+        new_w = netw;
+        new_h = neth;
+    }
+    for (i = 0; i < n; ++i){
+        box b = dets[i].bbox;
+        b.x =  (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
+        b.y =  (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth);
+        b.w *= (float)netw/new_w;
+        b.h *= (float)neth/new_h;
+        if(!relative){
+            b.x *= w;
+            b.w *= w;
+            b.y *= h;
+            b.h *= h;
+        }
+        dets[i].bbox = b;
+    }
+}
+*/
+
+// Counts the entries of batch item 0 whose objectness value in l.output is
+// greater than thresh.
+int yolo_num_detections(layer l, float thresh)
+{
+    int found = 0;
+    int anchor, cell;
+    for (anchor = 0; anchor < l.n; ++anchor) {
+        for (cell = 0; cell < l.w*l.h; ++cell) {
+            const float objectness = l.output[entry_index(l, 0, anchor*l.w*l.h + cell, 4)];
+            if (objectness > thresh) found += 1;
+        }
+    }
+    return found;
+}
+
+// Counts the entries of batch item `batch` whose objectness value in
+// l.output is greater than thresh.
+int yolo_num_detections_batch(layer l, float thresh, int batch)
+{
+    int found = 0;
+    int cell, anchor;
+    for (cell = 0; cell < l.w*l.h; ++cell) {
+        for (anchor = 0; anchor < l.n; ++anchor) {
+            const float objectness = l.output[entry_index(l, batch, anchor*l.w*l.h + cell, 4)];
+            if (objectness > thresh) found += 1;
+        }
+    }
+    return found;
+}
+
+// Mirrors, column-wise, the l.outputs values stored directly after the first
+// l.outputs values of l.output (negating channel 0 while doing so), then
+// replaces the first l.outputs values by the average of both halves.
+// NOTE(review): the second half is presumably the prediction for a
+// horizontally flipped copy of the image - confirm with callers.
+void avg_flipped_yolo(layer l)
+{
+    float *flip = l.output + l.outputs;
+    int row, col, anchor, ch, k;
+    for (row = 0; row < l.h; ++row) {
+        // only the left half is visited; each visit exchanges a column pair
+        for (col = 0; col < l.w/2; ++col) {
+            const int mirror_col = l.w - col - 1;
+            for (anchor = 0; anchor < l.n; ++anchor) {
+                for (ch = 0; ch < l.classes + 4 + 1; ++ch) {
+                    const int left = ch*l.w*l.h*l.n + anchor*l.w*l.h + row*l.w + col;
+                    const int right = ch*l.w*l.h*l.n + anchor*l.w*l.h + row*l.w + mirror_col;
+                    const float old_left = flip[left];
+                    const float old_right = flip[right];
+                    if (ch == 0) {
+                        // channel 0 changes sign when mirrored
+                        flip[left] = -old_right;
+                        flip[right] = -old_left;
+                    }
+                    else {
+                        flip[left] = old_right;
+                        flip[right] = old_left;
+                    }
+                }
+            }
+        }
+    }
+    for (k = 0; k < l.outputs; ++k) {
+        l.output[k] = (l.output[k] + flip[k])/2.;
+    }
+}
+
+// Fills dets with one detection for every entry of batch item 0 whose
+// objectness is greater than thresh, corrects the boxes with
+// correct_yolo_boxes() and returns the number of detections written.
+// w,h: image width,height
+// netw,neth: network width,height
+// map: not used by this function
+// relative, letter: passed through to correct_yolo_boxes()
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter)
+{
+    float *out = l.output;
+    int found = 0;
+    int cell, anchor, cls;
+    for (cell = 0; cell < l.w*l.h; ++cell) {
+        const int row = cell / l.w;
+        const int col = cell % l.w;
+        for (anchor = 0; anchor < l.n; ++anchor) {
+            const int obj_index = entry_index(l, 0, anchor*l.w*l.h + cell, 4);
+            const float objectness = out[obj_index];
+            // negated '>' on purpose: a NaN objectness must be skipped too
+            if (!(objectness > thresh)) continue;
+
+            detection *det = &dets[found];
+            const int box_index = entry_index(l, 0, anchor*l.w*l.h + cell, 0);
+            det->bbox = get_yolo_box(out, l.biases, l.mask[anchor], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h, l.new_coords);
+            det->objectness = objectness;
+            det->classes = l.classes;
+            if (l.embedding_output) {
+                get_embedding(l.embedding_output, l.w, l.h, l.n*l.embedding_size, l.embedding_size, col, row, anchor, 0, det->embeddings);
+            }
+
+            // class score = objectness * class value, zeroed when not above thresh
+            for (cls = 0; cls < l.classes; ++cls) {
+                const int class_index = entry_index(l, 0, anchor*l.w*l.h + cell, 4 + 1 + cls);
+                const float prob = objectness*out[class_index];
+                det->prob[cls] = (prob > thresh) ? prob : 0;
+            }
+            ++found;
+        }
+    }
+    correct_yolo_boxes(dets, found, w, h, netw, neth, relative, letter);
+    return found;
+}
+
+// Fills dets with one detection for every entry of batch item `batch` whose
+// objectness is greater than thresh, corrects the boxes with
+// correct_yolo_boxes() and returns the number of detections written.
+// w,h: image width,height
+// netw,neth: network width,height
+// map: not used by this function
+// relative, letter: passed through to correct_yolo_boxes()
+int get_yolo_detections_batch(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter, int batch)
+{
+    float *out = l.output;
+    int found = 0;
+    int cell, anchor, cls;
+    for (cell = 0; cell < l.w*l.h; ++cell) {
+        const int row = cell / l.w;
+        const int col = cell % l.w;
+        for (anchor = 0; anchor < l.n; ++anchor) {
+            const int obj_index = entry_index(l, batch, anchor*l.w*l.h + cell, 4);
+            const float objectness = out[obj_index];
+            // negated '>' on purpose: a NaN objectness must be skipped too
+            if (!(objectness > thresh)) continue;
+
+            detection *det = &dets[found];
+            const int box_index = entry_index(l, batch, anchor*l.w*l.h + cell, 0);
+            det->bbox = get_yolo_box(out, l.biases, l.mask[anchor], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h, l.new_coords);
+            det->objectness = objectness;
+            det->classes = l.classes;
+            if (l.embedding_output) {
+                get_embedding(l.embedding_output, l.w, l.h, l.n*l.embedding_size, l.embedding_size, col, row, anchor, batch, det->embeddings);
+            }
+
+            // class score = objectness * class value, zeroed when not above thresh
+            for (cls = 0; cls < l.classes; ++cls) {
+                const int class_index = entry_index(l, batch, anchor*l.w*l.h + cell, 4 + 1 + cls);
+                const float prob = objectness*out[class_index];
+                det->prob[cls] = (prob > thresh) ? prob : 0;
+            }
+            ++found;
+        }
+    }
+    correct_yolo_boxes(dets, found, w, h, netw, neth, relative, letter);
+    return found;
+}
+
+#ifdef GPU
+
+// GPU forward pass of the YOLO layer.
+// Copies state.input into l.output_gpu and applies the activations and the
+// x/y scaling there.  When not training (or when l.onlyforward is set) the
+// result is pulled into l.output and the function returns.  Otherwise the
+// output and the truth are pulled to host memory, forward_yolo_layer() is run
+// on them, and the resulting l.delta is pushed to l.delta_gpu.
+void forward_yolo_layer_gpu(const layer l, network_state state)
+{
+    if (l.embedding_output) {
+        layer le = state.net.layers[l.embedding_layer_id];
+        cuda_pull_array_async(le.output_gpu, l.embedding_output, le.batch*le.outputs);
+    }
+
+    simple_copy_ongpu(l.batch*l.inputs, state.input, l.output_gpu);
+
+    int b, n;
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            int bbox_index = entry_index(l, b, n*l.w*l.h, 0);
+            // logistic: y = 1./(1. + exp(-x)), inverse x = ln(y/(1-y))
+            // with new_coords no activation is applied here
+            if (!l.new_coords) {
+                activate_array_ongpu(l.output_gpu + bbox_index, 2 * l.w*l.h, LOGISTIC);    // x,y
+
+                int obj_index = entry_index(l, b, n*l.w*l.h, 4);
+                activate_array_ongpu(l.output_gpu + obj_index, (1 + l.classes)*l.w*l.h, LOGISTIC); // classes and objectness
+            }
+            if (l.scale_x_y != 1) {
+                scal_add_ongpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output_gpu + bbox_index, 1);      // scale x,y
+            }
+        }
+    }
+
+    // inference: only the output is needed on the host
+    if (!state.train || l.onlyforward) {
+        if (l.mean_alpha && l.output_avg_gpu) mean_array_gpu(l.output_gpu, l.batch*l.outputs, l.mean_alpha, l.output_avg_gpu);
+        cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
+        CHECK_CUDA(cudaPeekAtLastError());
+        return;
+    }
+
+    // training: host copies of the activated output and of the truth
+    float *host_input = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+    memcpy(host_input, l.output, l.batch*l.outputs*sizeof(float));
+
+    float *host_truth = 0;
+    if (state.truth) {
+        int num_truth = l.batch*l.truths;
+        host_truth = (float *)xcalloc(num_truth, sizeof(float));
+        cuda_pull_array(state.truth, host_truth, num_truth);
+    }
+
+    // the struct copy carries net, index and train; only the two buffers differ
+    network_state cpu_state = state;
+    cpu_state.truth = host_truth;
+    cpu_state.input = host_input;
+    forward_yolo_layer(l, cpu_state);
+
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+    free(host_input);
+    free(host_truth);   // free(NULL) is a no-op when there was no truth
+}
+
+// GPU backward pass: hands l.delta_gpu and state.delta to axpy_ongpu with the
+// coefficient state.net.loss_scale * l.delta_normalizer over
+// l.batch*l.inputs values.
+void backward_yolo_layer_gpu(const layer l, network_state state)
+{
+    const int total = l.batch*l.inputs;
+    axpy_ongpu(total, state.net.loss_scale * l.delta_normalizer, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/darknet-master/src/yolo_layer.h b/darknet-master/src/yolo_layer.h
new file mode 100644
index 0000000..08883b0
--- /dev/null
+++ b/darknet-master/src/yolo_layer.h
@@ -0,0 +1,29 @@
+#ifndef YOLO_LAYER_H
+#define YOLO_LAYER_H
+
+// Declarations for the YOLO layer: construction, forward/backward passes,
+// resizing and extraction of detections from the layer output.
+
+//#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+
+// The functions are given C linkage so that C++ translation units can call them.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Layer construction.
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes);
+// Forward / backward passes taking the layer by value and a network_state.
+void forward_yolo_layer(const layer l, network_state state);
+void backward_yolo_layer(const layer l, network_state state);
+// Resizes the layer in place to a new w x h.
+void resize_yolo_layer(layer *l, int w, int h);
+// Detection counting and extraction; the *_batch variants take an additional
+// `batch` argument.
+int yolo_num_detections(layer l, float thresh);
+int yolo_num_detections_batch(layer l, float thresh, int batch);
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter);
+int get_yolo_detections_batch(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter, int batch);
+void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter);
+
+// GPU variants are only declared when the build defines GPU.
+#ifdef GPU
+void forward_yolo_layer_gpu(const layer l, network_state state);
+void backward_yolo_layer_gpu(const layer l, network_state state);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/darknet-master/src/yolo_v2_class.cpp b/darknet-master/src/yolo_v2_class.cpp
new file mode 100644
index 0000000..19d3810
--- /dev/null
+++ b/darknet-master/src/yolo_v2_class.cpp
@@ -0,0 +1,502 @@
+#include "darknet.h"
+#include "yolo_v2_class.hpp"
+
+#include "network.h"
+
+extern "C" {
+#include "detection_layer.h"
+#include "region_layer.h"
+#include "cost_layer.h"
+#include "utils.h"
+#include "parser.h"
+#include "box.h"
+#include "image.h"
+#include "demo.h"
+#include "option_list.h"
+#include <stb_image.h>
+}
+//#include <sys/time.h>
+
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+
+#define NFRAMES 3
+
+//static Detector* detector = NULL;
+static std::unique_ptr<Detector> detector;
+
+// Builds the file-level Detector from a configuration file and a weights
+// file, replacing any detector created by an earlier call. Always returns 1.
+int init(const char *configurationFilename, const char *weightsFilename, int gpu, int batch_size)
+{
+    std::unique_ptr<Detector> fresh(new Detector(configurationFilename, weightsFilename, gpu, batch_size));
+    detector = std::move(fresh);
+    return 1;
+}
+
+// Runs the file-level detector on an image file and copies the resulting
+// boxes into `container`, stopping at C_SHARP_MAX_OBJECTS entries.
+// Returns the total number of detections, which can be larger than the
+// number of entries actually copied.
+int detect_image(const char *filename, bbox_t_container &container)
+{
+    std::vector<bbox_t> found = detector->detect(filename);
+    size_t idx = 0;
+    while (idx < found.size() && idx < C_SHARP_MAX_OBJECTS) {
+        container.candidates[idx] = found[idx];
+        ++idx;
+    }
+    return found.size();
+}
+
+// Decodes an encoded image held in memory (`data`, `data_length` bytes) with
+// OpenCV, runs the file-level detector on it and copies at most
+// C_SHARP_MAX_OBJECTS boxes into `container`.
+// Returns the total number of detections, or -1 when built without OPENCV.
+int detect_mat(const uint8_t* data, const size_t data_length, bbox_t_container &container) {
+#ifdef OPENCV
+    std::vector<char> encoded(data, data + data_length);
+    cv::Mat decoded = imdecode(cv::Mat(encoded), 1);
+
+    std::vector<bbox_t> found = detector->detect(decoded);
+    const size_t limit = std::min<size_t>(found.size(), C_SHARP_MAX_OBJECTS);
+    for (size_t k = 0; k < limit; ++k) {
+        container.candidates[k] = found[k];
+    }
+    return found.size();
+#else
+    return -1;
+#endif    // OPENCV
+}
+
+// Destroys the file-level Detector, if there is one. Always returns 1.
+int dispose() {
+    detector = nullptr;
+    return 1;
+}
+
+// Returns the value reported by cudaGetDeviceCount (0 if the call leaves the
+// counter untouched), or -1 when the library was built without GPU support.
+int get_device_count() {
+#ifdef GPU
+    int n_devices = 0;
+    cudaGetDeviceCount(&n_devices);
+    return n_devices;
+#else
+    return -1;
+#endif    // GPU
+}
+
+// Reports whether this translation unit was compiled with GPU defined.
+bool built_with_cuda(){
+#ifdef GPU
+    const bool enabled = true;
+#else
+    const bool enabled = false;
+#endif
+    return enabled;
+}
+
+// Reports whether this translation unit was compiled with CUDNN defined.
+bool built_with_cudnn(){
+#ifdef CUDNN
+    const bool enabled = true;
+#else
+    const bool enabled = false;
+#endif
+    return enabled;
+}
+
+// Reports whether this translation unit was compiled with OPENCV defined.
+bool built_with_opencv(){
+#ifdef OPENCV
+    const bool enabled = true;
+#else
+    const bool enabled = false;
+#endif
+    return enabled;
+}
+
+
+// Writes the name of CUDA device `gpu` into `deviceName` as a NUL-terminated
+// C string.
+//
+// `deviceName` must have room for the name plus the terminator; the name
+// comes from cudaDeviceProp::name. Returns 1 in GPU builds and -1 when the
+// library was built without GPU support (the buffer is then left untouched).
+int get_device_name(int gpu, char* deviceName) {
+#ifdef GPU
+    // Value-initialize so that a failed query yields an empty name instead of
+    // whatever bytes happened to be on the stack.
+    cudaDeviceProp prop = cudaDeviceProp();
+    cudaGetDeviceProperties(&prop, gpu);
+    std::string result = prop.name;
+    std::copy(result.begin(), result.end(), deviceName);
+    // std::copy does not write a terminator; without this the caller would
+    // read past the end of the name.
+    deviceName[result.size()] = '\0';
+    return 1;
+#else
+    return -1;
+#endif    // GPU
+}
+
+#ifdef GPU
+// Prints the CUDA error string for `status` when it is not cudaSuccess.
+// Only reports; it does not abort or throw.
+void check_cuda(cudaError_t status) {
+    if (status == cudaSuccess) return;
+    printf("CUDA Error Prev: %s\n", cudaGetErrorString(status));
+}
+#endif
+
+// Internal state owned by a Detector and reached through detector_gpu_ptr.
+struct detector_gpu_t {
+    // Network parsed from the cfg file and loaded with weights by the constructor.
+    network net;
+    // Created as 1x1x3 images by the constructor; their data is freed by the destructor.
+    image images[NFRAMES];
+    // Buffer of l.outputs floats (l = last layer); detect() fills it via mean_arrays().
+    float *avg;
+    // NFRAMES buffers of l.outputs floats holding the most recent raw predictions.
+    float* predictions[NFRAMES];
+    // Index into `predictions` written by the next detect() call with use_mean set.
+    int demo_index;
+    // One counter per class, initialised to 1; tracking_id() hands out and increments it.
+    unsigned int *track_id;
+};
+
+// Loads a network from `cfg_filename` / `weight_filename` on GPU `gpu_id`
+// with the given batch size and allocates the per-detector buffers.
+// In GPU builds the CUDA device that was current on entry is restored
+// before returning.
+LIB_API Detector::Detector(std::string cfg_filename, std::string weight_filename, int gpu_id, int batch_size)
+    : cur_gpu_id(gpu_id)
+{
+    wait_stream = 0;
+#ifdef GPU
+    // Remember the caller's device so it can be restored at the end.
+    int old_gpu_index;
+    check_cuda( cudaGetDevice(&old_gpu_index) );
+#endif
+
+    // make_shared<T>() value-initializes the struct, so its members start zeroed.
+    detector_gpu_ptr = std::make_shared<detector_gpu_t>();
+    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+
+#ifdef GPU
+    //check_cuda( cudaSetDevice(cur_gpu_id) );
+    cuda_set_device(cur_gpu_id);
+    printf(" Used GPU %d \n", cur_gpu_id);
+#endif
+    network &net = detector_gpu.net;
+    net.gpu_index = cur_gpu_id;
+    //gpu_index = i;
+
+    // Keep copies of the file names as members; the C API below takes char*.
+    _cfg_filename = cfg_filename;
+    _weight_filename = weight_filename;
+
+    char *cfgfile = const_cast<char *>(_cfg_filename.c_str());
+    char *weightfile = const_cast<char *>(_weight_filename.c_str());
+
+    net = parse_network_cfg_custom(cfgfile, batch_size, batch_size);
+    // NOTE(review): c_str() never returns a null pointer, so this condition is
+    // always true, even for an empty weights file name.
+    if (weightfile) {
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, batch_size);
+    // Re-assigned because `net` was overwritten by the parse result above.
+    net.gpu_index = cur_gpu_id;
+    fuse_conv_batchnorm(net);
+
+    // The buffers below are sized from the last layer of the network.
+    layer l = net.layers[net.n - 1];
+    int j;
+
+    detector_gpu.avg = (float *)calloc(l.outputs, sizeof(float));
+    for (j = 0; j < NFRAMES; ++j) detector_gpu.predictions[j] = (float*)calloc(l.outputs, sizeof(float));
+    for (j = 0; j < NFRAMES; ++j) detector_gpu.images[j] = make_image(1, 1, 3);
+
+    // Track ids start at 1 for every class; tracking_id() treats 0 as "not assigned yet".
+    detector_gpu.track_id = (unsigned int *)calloc(l.classes, sizeof(unsigned int));
+    for (j = 0; j < l.classes; ++j) detector_gpu.track_id[j] = 1;
+
+#ifdef GPU
+    check_cuda( cudaSetDevice(old_gpu_index) );
+#endif
+}
+
+
+// Releases the buffers allocated by the constructor and frees the network.
+// In GPU builds the network is freed with its own device selected, then the
+// previously current device is restored.
+LIB_API Detector::~Detector()
+{
+    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    //layer l = detector_gpu.net.layers[detector_gpu.net.n - 1];
+
+    free(detector_gpu.track_id);
+
+    free(detector_gpu.avg);
+    for (int j = 0; j < NFRAMES; ++j) free(detector_gpu.predictions[j]);
+    for (int j = 0; j < NFRAMES; ++j) if (detector_gpu.images[j].data) free(detector_gpu.images[j].data);
+
+#ifdef GPU
+    // Switch to the device the network lives on before freeing it.
+    int old_gpu_index;
+    cudaGetDevice(&old_gpu_index);
+    cuda_set_device(detector_gpu.net.gpu_index);
+#endif
+
+    free_network(detector_gpu.net);
+
+#ifdef GPU
+    cudaSetDevice(old_gpu_index);
+#endif
+}
+
+// Returns the `w` field of the loaded network.
+LIB_API int Detector::get_net_width() const {
+    const detector_gpu_t *state = static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    return state->net.w;
+}
+// Returns the `h` field of the loaded network.
+LIB_API int Detector::get_net_height() const {
+    const detector_gpu_t *state = static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    return state->net.h;
+}
+// Returns the `c` field of the loaded network.
+LIB_API int Detector::get_net_color_depth() const {
+    const detector_gpu_t *state = static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    return state->net.c;
+}
+
+
+// Loads `image_filename` from disk and runs detection on it.
+//
+// thresh   - threshold forwarded to detect(image_t, ...).
+// use_mean - forwarded to detect(image_t, ...).
+// Returns the boxes produced by detect(image_t, ...). Exceptions thrown by
+// load_image (e.g. when the file cannot be read) propagate to the caller.
+LIB_API std::vector<bbox_t> Detector::detect(std::string image_filename, float thresh, bool use_mean)
+{
+    // `new image_t()` value-initializes the struct so `data` starts as null.
+    // With a plain `new image_t` the pointer is indeterminate, and if
+    // load_image throws, the deleter below would test and free garbage.
+    std::shared_ptr<image_t> image_ptr(new image_t(), [](image_t *img) { if (img->data) free(img->data); delete img; });
+    *image_ptr = load_image(image_filename);
+    return detect(*image_ptr, thresh, use_mean);
+}
+
+// Loads an image file through stb_image and converts it from interleaved
+// 8-bit samples to a planar float image with every sample divided by 255.
+// When `channels` is non-zero it is both requested from stb_image and used
+// as the channel count of the result.
+// Throws std::runtime_error("file not found") when stbi_load returns null.
+static image load_image_stb(char *filename, int channels)
+{
+    int width, height, file_channels;
+    unsigned char *pixels = stbi_load(filename, &width, &height, &file_channels, channels);
+    if (!pixels)
+        throw std::runtime_error("file not found");
+    const int planes = channels ? channels : file_channels;
+    image im = make_image(width, height, planes);
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            // Start of the interleaved samples for pixel (x, y).
+            const unsigned char *src = pixels + planes*x + planes*width*y;
+            for (int p = 0; p < planes; ++p) {
+                im.data[x + width*y + width*height*p] = (float)src[p] / 255.;
+            }
+        }
+    }
+    free(pixels);
+    return im;
+}
+
+// Loads `image_filename` as a 3-channel image and returns it as an image_t.
+// The returned `data` pointer is the buffer allocated by load_image_stb;
+// ownership passes to the caller (see free_image).
+LIB_API image_t Detector::load_image(std::string image_filename)
+{
+    image loaded = load_image_stb(const_cast<char *>(image_filename.c_str()), 3);
+
+    image_t result;
+    result.w = loaded.w;
+    result.h = loaded.h;
+    result.c = loaded.c;
+    result.data = loaded.data;
+    return result;
+}
+
+
+// Frees the pixel buffer of `m`; does nothing when `m.data` is null.
+LIB_API void Detector::free_image(image_t m)
+{
+    if (!m.data) return;
+    free(m.data);
+}
+
+// Runs the network on `img` and returns the boxes whose best class
+// probability is above `thresh`.
+//
+// img      - input image; it is not modified or freed here.
+// thresh   - passed to get_network_boxes and also used to filter the results.
+// use_mean - when set, the raw prediction is stored in a ring of NFRAMES
+//            buffers and their mean is computed (see the review note below).
+// Returned boxes have x/y/w/h multiplied by the input image width/height,
+// x and y clamped at 0, and track_id / frames_counter set to 0.
+LIB_API std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)
+{
+    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    network &net = detector_gpu.net;
+#ifdef GPU
+    // Temporarily switch to this detector's device if another one is current.
+    int old_gpu_index;
+    cudaGetDevice(&old_gpu_index);
+    if(cur_gpu_id != old_gpu_index)
+        cudaSetDevice(net.gpu_index);
+
+    net.wait_stream = wait_stream;    // 1 - wait CUDA-stream, 0 - not to wait
+#endif
+    //std::cout << "net.gpu_index = " << net.gpu_index << std::endl;
+
+    // Shallow view of the caller's image in darknet's `image` type (data is shared).
+    image im;
+    im.c = img.c;
+    im.data = img.data;
+    im.h = img.h;
+    im.w = img.w;
+
+    image sized;
+
+    // `sized` always owns a fresh buffer (copy or resize) and is freed below.
+    if (net.w == im.w && net.h == im.h) {
+        sized = make_image(im.w, im.h, im.c);
+        memcpy(sized.data, im.data, im.w*im.h*im.c * sizeof(float));
+    }
+    else
+        sized = resize_image(im, net.w, net.h);
+
+    layer l = net.layers[net.n - 1];
+
+    float *X = sized.data;
+
+    float *prediction = network_predict(net, X);
+
+    if (use_mean) {
+        memcpy(detector_gpu.predictions[detector_gpu.demo_index], prediction, l.outputs * sizeof(float));
+        mean_arrays(detector_gpu.predictions, NFRAMES, l.outputs, detector_gpu.avg);
+        // NOTE(review): `l` is a by-value copy of the last layer, so this
+        // assignment does not change net.layers; get_network_boxes below is
+        // given &net. Confirm whether the averaged output is actually used.
+        l.output = detector_gpu.avg;
+        detector_gpu.demo_index = (detector_gpu.demo_index + 1) % NFRAMES;
+    }
+    //get_region_boxes(l, 1, 1, thresh, detector_gpu.probs, detector_gpu.boxes, 0, 0);
+    //if (nms) do_nms_sort(detector_gpu.boxes, detector_gpu.probs, l.w*l.h*l.n, l.classes, nms);
+
+    int nboxes = 0;
+    int letterbox = 0;
+    float hier_thresh = 0.5;
+    detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
+    // Non-maximum suppression is skipped when the `nms` member is 0.
+    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+
+    std::vector<bbox_t> bbox_vec;
+
+    for (int i = 0; i < nboxes; ++i) {
+        box b = dets[i].bbox;
+        // Keep only the most probable class of each detection.
+        int const obj_id = max_index(dets[i].prob, l.classes);
+        float const prob = dets[i].prob[obj_id];
+
+        if (prob > thresh)
+        {
+            bbox_t bbox;
+            // b.x / b.y are the box centre; convert to the top-left corner,
+            // scale by the image size and clamp at 0.
+            bbox.x = std::max((double)0, (b.x - b.w / 2.)*im.w);
+            bbox.y = std::max((double)0, (b.y - b.h / 2.)*im.h);
+            bbox.w = b.w*im.w;
+            bbox.h = b.h*im.h;
+            bbox.obj_id = obj_id;
+            bbox.prob = prob;
+            bbox.track_id = 0;
+            bbox.frames_counter = 0;
+            bbox.x_3d = NAN;
+            bbox.y_3d = NAN;
+            bbox.z_3d = NAN;
+
+            bbox_vec.push_back(bbox);
+        }
+    }
+
+    free_detections(dets, nboxes);
+    if(sized.data)
+        free(sized.data);
+
+#ifdef GPU
+    if (cur_gpu_id != old_gpu_index)
+        cudaSetDevice(old_gpu_index);
+#endif
+
+    return bbox_vec;
+}
+
+// Runs the network on a batch through network_predict_batch and returns one
+// vector of boxes per batch entry.
+//
+// img        - input buffer handed to network_predict_batch as-is.
+// batch_size - number of entries in the batch and in the returned vector.
+// width, height - forwarded to network_predict_batch.
+// thresh     - forwarded to network_predict_batch and used to filter results.
+// make_nms   - NMS runs only when this is set and the `nms` member is non-zero.
+// Unlike detect(), box coordinates are stored as returned by the network
+// call: they are not multiplied by an image size here.
+LIB_API std::vector<std::vector<bbox_t>> Detector::detectBatch(image_t img, int batch_size, int width, int height, float thresh, bool make_nms)
+{
+    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+    network &net = detector_gpu.net;
+#ifdef GPU
+    // Temporarily switch to this detector's device if another one is current.
+    int old_gpu_index;
+    cudaGetDevice(&old_gpu_index);
+    if(cur_gpu_id != old_gpu_index)
+        cudaSetDevice(net.gpu_index);
+
+    net.wait_stream = wait_stream;    // 1 - wait CUDA-stream, 0 - not to wait
+#endif
+    //std::cout << "net.gpu_index = " << net.gpu_index << std::endl;
+
+    layer l = net.layers[net.n - 1];
+
+    float hier_thresh = 0.5;
+    // Shallow view of the caller's buffer in darknet's `image` type.
+    image in_img;
+    in_img.c = img.c;
+    in_img.w = img.w;
+    in_img.h = img.h;
+    in_img.data = img.data;
+    det_num_pair* prediction = network_predict_batch(&net, in_img, batch_size, width, height, thresh, hier_thresh, 0, 0, 0);
+
+    std::vector<std::vector<bbox_t>> bbox_vec(batch_size);
+
+    for (int bi = 0; bi < batch_size; ++bi)
+    {
+        auto dets = prediction[bi].dets;
+
+        if (make_nms && nms)
+            do_nms_sort(dets, prediction[bi].num, l.classes, nms);
+
+        for (int i = 0; i < prediction[bi].num; ++i)
+        {
+            box b = dets[i].bbox;
+            // Keep only the most probable class of each detection.
+            int const obj_id = max_index(dets[i].prob, l.classes);
+            float const prob = dets[i].prob[obj_id];
+
+            if (prob > thresh)
+            {
+                bbox_t bbox;
+                // Centre -> top-left corner, clamped at 0.
+                bbox.x = std::max((double)0, (b.x - b.w / 2.));
+                bbox.y = std::max((double)0, (b.y - b.h / 2.));
+                bbox.w = b.w;
+                bbox.h = b.h;
+                bbox.obj_id = obj_id;
+                bbox.prob = prob;
+                bbox.track_id = 0;
+                bbox.frames_counter = 0;
+                bbox.x_3d = NAN;
+                bbox.y_3d = NAN;
+                bbox.z_3d = NAN;
+
+                bbox_vec[bi].push_back(bbox);
+            }
+        }
+    }
+    free_batch_detections(prediction, batch_size);
+
+#ifdef GPU
+    if (cur_gpu_id != old_gpu_index)
+        cudaSetDevice(old_gpu_index);
+#endif
+
+    return bbox_vec;
+}
+
+// Assigns track ids to the boxes of the current frame by matching them with
+// the boxes stored from previous frames.
+//
+// cur_bbox_vec   - boxes of the current frame (taken by value, returned updated).
+// change_history - when true, the updated boxes are pushed onto the history.
+// frames_story   - maximum number of frames kept in the history deque.
+// max_dist       - a previous box can only match a current box of the same
+//                  class whose centre distance is below this value.
+// Boxes left without a match receive a fresh id from the per-class counter.
+// NOTE(review): obj_id is used as an index into the per-class counter array
+// without a range check - presumably always < l.classes; confirm at callers.
+LIB_API std::vector<bbox_t> Detector::tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history,
+    int const frames_story, int const max_dist)
+{
+    detector_gpu_t &det_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
+
+    // Is there at least one stored frame that contains boxes?
+    bool prev_track_id_present = false;
+    for (auto &i : prev_bbox_vec_deque)
+        if (i.size() > 0) prev_track_id_present = true;
+
+    // Nothing to match against: every box gets a new id and the frame is
+    // stored regardless of change_history.
+    if (!prev_track_id_present) {
+        for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
+            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
+        prev_bbox_vec_deque.push_front(cur_bbox_vec);
+        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
+        return cur_bbox_vec;
+    }
+
+    // Best (smallest) distance recorded so far for each current box.
+    std::vector<unsigned int> dist_vec(cur_bbox_vec.size(), std::numeric_limits<unsigned int>::max());
+
+    // The deque is filled with push_front, so iteration starts at the most
+    // recently stored frame.
+    for (auto &prev_bbox_vec : prev_bbox_vec_deque) {
+        for (auto &i : prev_bbox_vec) {
+            // Index of the current box chosen for previous box `i`, or -1.
+            int cur_index = -1;
+            for (size_t m = 0; m < cur_bbox_vec.size(); ++m) {
+                bbox_t const& k = cur_bbox_vec[m];
+                if (i.obj_id == k.obj_id) {
+                    float center_x_diff = (float)(i.x + i.w/2) - (float)(k.x + k.w/2);
+                    float center_y_diff = (float)(i.y + i.h/2) - (float)(k.y + k.h/2);
+                    // Euclidean centre distance, truncated to an unsigned integer.
+                    unsigned int cur_dist = sqrt(center_x_diff*center_x_diff + center_y_diff*center_y_diff);
+                    // NOTE(review): unsigned/int comparison - a negative
+                    // max_dist would be converted to a large unsigned value.
+                    if (cur_dist < max_dist && (k.track_id == 0 || dist_vec[m] > cur_dist)) {
+                        dist_vec[m] = cur_dist;
+                        cur_index = m;
+                    }
+                }
+            }
+
+            // Do not reuse an id that some current box of the same class already carries.
+            bool track_id_absent = !std::any_of(cur_bbox_vec.begin(), cur_bbox_vec.end(),
+                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });
+
+            if (cur_index >= 0 && track_id_absent){
+                cur_bbox_vec[cur_index].track_id = i.track_id;
+                // Average the box size with the matched previous box.
+                cur_bbox_vec[cur_index].w = (cur_bbox_vec[cur_index].w + i.w) / 2;
+                cur_bbox_vec[cur_index].h = (cur_bbox_vec[cur_index].h + i.h) / 2;
+            }
+        }
+    }
+
+    // Boxes that are still unmatched get a fresh id from their class counter.
+    for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
+        if (cur_bbox_vec[i].track_id == 0)
+            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
+
+    if (change_history) {
+        prev_bbox_vec_deque.push_front(cur_bbox_vec);
+        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
+    }
+
+    return cur_bbox_vec;
+}
+
+
+// Returns the value of cuda_get_context() obtained while this detector's
+// device is current; the previously current device is restored afterwards.
+// Returns NULL when built without GPU support.
+void *Detector::get_cuda_context()
+{
+#ifdef GPU
+    int previous_device;
+    cudaGetDevice(&previous_device);
+    const bool must_switch = (cur_gpu_id != previous_device);
+    if (must_switch) cudaSetDevice(cur_gpu_id);
+
+    void *context = cuda_get_context();
+
+    if (must_switch) cudaSetDevice(previous_device);
+
+    return context;
+#else   // GPU
+    return NULL;
+#endif  // GPU
+}
diff --git a/darknet-master/vcpkg.json b/darknet-master/vcpkg.json
new file mode 100644
index 0000000..6f142be
--- /dev/null
+++ b/darknet-master/vcpkg.json
@@ -0,0 +1,127 @@
+{
+  "name": "darknet",
+  "version": "0.2.5.7",
+  "description": "Darknet is an open source neural network framework written in C and CUDA. You only look once (YOLO) is a state-of-the-art, real-time object detection system, best example of darknet functionalities.",
+  "homepage": "https://github.com/alexeyab/darknet",
+  "dependencies": [
+    {
+      "name": "getopt",
+      "platform": "windows & !mingw"
+    },
+    "pthreads",
+    "stb"
+  ],
+  "features": {
+    "cuda": {
+      "description": "Build darknet with support for CUDA",
+      "dependencies": [
+        "cuda"
+      ]
+    },
+    "cudnn": {
+      "description": "Build darknet with support for cuDNN",
+      "dependencies": [
+        "cuda",
+        "cudnn"
+      ]
+    },
+    "cuda-opengl-integration": {
+      "description": "Build darknet with support for running networks straight from OpenGL textures",
+      "dependencies": [
+        "cuda",
+        "opengl"
+      ]
+    },
+    "full": {
+      "description": "Build darknet fully featured",
+      "dependencies": [
+        {
+          "name": "darknet",
+          "features": [
+            "cuda",
+            "cudnn",
+            "opencv-cuda",
+            "cuda-opengl-integration"
+          ]
+        }
+      ]
+    },
+    "opencv-base": {
+      "description": "Build darknet with support for latest version of OpenCV",
+      "dependencies": [
+        {
+          "name": "opencv",
+          "features": [
+            "contrib",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv-cuda": {
+      "description": "Build darknet with support for latest version of CUDA-enabled OpenCV",
+      "dependencies": [
+        {
+          "name": "opencv",
+          "features": [
+            "contrib",
+            "cuda",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv2-base": {
+      "description": "Build darknet with support for OpenCV2",
+      "dependencies": [
+        {
+          "name": "opencv2",
+          "features": [
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv2-cuda": {
+      "description": "Build darknet with support for CUDA-enabled OpenCV2",
+      "dependencies": [
+        {
+          "name": "opencv2",
+          "features": [
+            "cuda",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv3-base": {
+      "description": "Build darknet with support for OpenCV3",
+      "dependencies": [
+        {
+          "name": "opencv3",
+          "features": [
+            "contrib",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv3-cuda": {
+      "description": "Build darknet with support for CUDA-enabled OpenCV3",
+      "dependencies": [
+        {
+          "name": "opencv3",
+          "features": [
+            "contrib",
+            "cuda",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    }
+  }
+}
diff --git a/darknet-master/vcpkg.json.opencv23 b/darknet-master/vcpkg.json.opencv23
new file mode 100644
index 0000000..0a45be9
--- /dev/null
+++ b/darknet-master/vcpkg.json.opencv23
@@ -0,0 +1,126 @@
+{
+  "name": "darknet",
+  "version": "0.2.5.7",
+  "description": "Darknet is an open source neural network framework written in C and CUDA. You only look once (YOLO) is a state-of-the-art, real-time object detection system, best example of darknet functionalities.",
+  "homepage": "https://github.com/alexeyab/darknet",
+  "dependencies": [
+    {
+      "name": "getopt",
+      "platform": "windows & !mingw"
+    },
+    "pthreads",
+    "stb"
+  ],
+  "features": {
+    "cuda": {
+      "description": "Build darknet with support for CUDA",
+      "dependencies": [
+        "cuda"
+      ]
+    },
+    "cudnn": {
+      "description": "Build darknet with support for cuDNN",
+      "dependencies": [
+        "cuda",
+        "cudnn"
+      ]
+    },
+    "full": {
+      "description": "Build darknet fully featured",
+      "dependencies": [
+        {
+          "name": "darknet",
+          "features": [
+            "cuda",
+            "cudnn",
+            "opencv-cuda"
+          ]
+        }
+      ]
+    },
+    "opencv-base": {
+      "description": "Build darknet with support for latest version of OpenCV",
+      "dependencies": [
+        {
+          "name": "opencv",
+          "features": [
+            "contrib",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv-cuda": {
+      "description": "Build darknet with support for latest version of CUDA-enabled OpenCV",
+      "dependencies": [
+        {
+          "name": "opencv",
+          "features": [
+            "contrib",
+            "cuda",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv2-base": {
+      "description": "Build darknet with support for OpenCV2",
+      "dependencies": [
+        {
+          "name": "opencv2",
+          "features": [
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv2-cuda": {
+      "description": "Build darknet with support for CUDA-enabled OpenCV2",
+      "dependencies": [
+        {
+          "name": "opencv2",
+          "features": [
+            "cuda",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv3-base": {
+      "description": "Build darknet with support for OpenCV3",
+      "dependencies": [
+        {
+          "name": "opencv3",
+          "features": [
+            "contrib",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    },
+    "opencv3-cuda": {
+      "description": "Build darknet with support for CUDA-enabled OpenCV3",
+      "dependencies": [
+        {
+          "name": "opencv3",
+          "features": [
+            "contrib",
+            "cuda",
+            "dnn",
+            "ffmpeg"
+          ]
+        }
+      ]
+    }
+  },
+  "overrides": [
+    {
+      "name": "ffmpeg",
+      "version": "4.4.3"
+    }
+  ],
+  "builtin-baseline": "54cc53c43430c73f489e52af5fadd032c1aced16"
+}
diff --git a/darknet-master/video_yolov3.sh b/darknet-master/video_yolov3.sh
new file mode 100644
index 0000000..2d0346a
--- /dev/null
+++ b/darknet-master/video_yolov3.sh
@@ -0,0 +1,6 @@
+
+
+./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights test50.mp4 -i 0 -thresh 0.25
+
+
+
diff --git a/darknet-master/video_yolov4.sh b/darknet-master/video_yolov4.sh
new file mode 100644
index 0000000..159c855
--- /dev/null
+++ b/darknet-master/video_yolov4.sh
@@ -0,0 +1,6 @@
+
+
+./darknet detector demo ./cfg/coco.data ./cfg/yolov4.cfg ./yolov4.weights test50.mp4 -i 0 -thresh 0.25
+
+
+
diff --git a/detecteur.py b/detecteur.py
new file mode 100644
index 0000000..ecd9219
--- /dev/null
+++ b/detecteur.py
@@ -0,0 +1,59 @@
+import os
+import sys
+import time
+import numpy as np
+import cv2
+
+cap = cv2.VideoCapture(0)
+# cap = cv2.VideoCapture("chien.mp4")
+
+kernel_blur = 5
+seuil = 15
+surface = 1000
+
+ret, originale = cap.read()
+originale = cv2.cvtColor(originale, cv2.COLOR_BGR2GRAY)
+originale = cv2.GaussianBlur(originale, (kernel_blur, kernel_blur), 0)
+
+kernel_dilate = np.ones((5, 5), np.uint8)
+
+while True:
+    ret, frame = cap.read()
+    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+    gray = cv2.GaussianBlur(gray, (kernel_blur, kernel_blur), 0)
+    mask = cv2.absdiff(originale, gray)
+    mask = cv2.threshold(mask, seuil, 255, cv2.THRESH_BINARY)[1]
+    mask = cv2.dilate(mask, kernel_dilate, iterations=3)
+    contours, nada = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    frame_contour = frame.copy()
+    for c in contours:
+        cv2.drawContours(frame_contour, [c], 0, (0, 255, 0), 5)
+        if cv2.contourArea(c) < surface:
+            continue
+        x, y, w, h = cv2.boundingRect(c)
+        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
+    originale = gray
+    cv2.putText(frame, "[o|l]seuil: {:d}  [p|m]blur: {:d}  [i|k]surface: {:d}".format(seuil, kernel_blur, surface), (10, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 255), 2)
+    # cv2.imshow("frame", frame)
+    cv2.imshow("contour", frame_contour)
+    # cv2.imshow("mask", mask)
+    intrus = 0
+    key = cv2.waitKey(30) & 0xFF
+
+    if key == ord('q'):
+        break
+    if key == ord('p'):
+        kernel_blur = min(43, kernel_blur + 2)
+    if key == ord('m'):
+        kernel_blur = max(1, kernel_blur - 2)
+    if key == ord('i'):
+        surface += 1000
+    if key == ord('k'):
+        surface = max(1000, surface - 1000)
+    if key == ord('o'):
+        seuil = min(255, seuil + 1)
+    if key == ord('l'):
+        seuil = max(1, seuil - 1)
+
+cap.release()
+cv2.destroyAllWindows()