float123
diff --git a/‎README.md‎
Lines changed: 6 additions & 5 deletions b/‎README.md‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎yolov3-spp/README.md‎
Lines changed: 1 addition & 1 deletion b/‎yolov3-spp/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎yolov3/CMakeLists.txt‎
Lines changed: 14 additions & 6 deletions b/‎yolov3/CMakeLists.txt‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎yolov3/README.md‎
Lines changed: 39 additions & 25 deletions b/‎yolov3/README.md‎
Lines changed: 39 additions & 25 deletions
diff --git a/‎yolov3/Utils.h‎
Lines changed: 94 additions & 0 deletions b/‎yolov3/Utils.h‎
Lines changed: 94 additions & 0 deletions
@@ -12,6 +12,7 @@ All the models are implemented in pytorch first, and export a weights file xxx.w
 
 - `22 May 2020`. A new branch [trt4](https://github.com/wang-xinyu/tensorrtx/tree/trt4) created, which is using TensorRT 4 API. Now the master branch is using TensorRT 7 API. But only `yolov4` has been migrated to TensorRT 7 API for now. The rest will be migrated soon. And a tutorial for `migarating from TensorRT 4 to 7` provided.
 - `28 May 2020`. arcface LResNet50E-IR model from [deepinsight/insightface](https://github.com/deepinsight/insightface) implemented. We got 333fps on GTX1080.
+- `2 June 2020`. yolov3 and yolov3-spp migrated to TensorRT 7 API. The new yolov3 is using pytorch implementation [ultralytics/yolov3](https://github.com/ultralytics/yolov3), the yolov3 in branch `trt4` was using pytorch implementation [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3).
 
 ## Tutorials
 
@@ -44,8 +45,8 @@ Following models are implemented.
 |[shufflenet](./shufflenetv2)| ShuffleNetV2 with 0.5x output channels |
 |[squeezenet](./squeezenet)| SqueezeNet 1.1 model |
 |[vgg](./vgg)| VGG 11-layer model |
-|[yolov3](./yolov3)| darknet-53, weights from yolov3 authors, pytorch implementation from [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3) |
-|[yolov3-spp](./yolov3-spp)| darknet-53, weights from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) |
+|[yolov3](./yolov3)| darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) |
+|[yolov3-spp](./yolov3-spp)| darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) |
 |[yolov4](./yolov4)| CSPDarknet53, weights from [AlexeyAB/darknet](https://github.com/AlexeyAB/darknet#pre-trained-models), pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) |
 |[retinaface](./retinaface)| resnet-50, weights from [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) |
 |[arcface](./arcface)| LResNet50E-IR, weights from [deepinsight/insightface](https://github.com/deepinsight/insightface) |
@@ -63,8 +64,8 @@ Some tricky operations encountered in these models, already solved, but might ha
 |torch.chunk()| implement the 'chunk(2, dim=C)' by tensorrt plugin, see shufflenet. |
 |channel shuffle| use two shuffle layers to implement `channel_shuffle`, see shufflenet. |
 |adaptive pool| use fixed input dimension, and use regular average pooling, see shufflenet. |
-|leaky relu| I wrote a leaky relu plugin, but PRelu in `NvInferPlugin.h` can be used, see yolov3. |
-|yolo layer v1| yolo layer is implemented as a plugin, see yolov3. |
+|leaky relu| I wrote a leaky relu plugin, but PRelu in `NvInferPlugin.h` can be used, see yolov3 in branch `trt4`. |
+|yolo layer v1| yolo layer is implemented as a plugin, see yolov3 in branch `trt4`. |
 |yolo layer v2| three yolo layers implemented in one plugin, see yolov3-spp. |
 |upsample| replaced by a deconvolution layer, see yolov3. |
 |hsigmoid| hard sigmoid is implemented as a plugin, hsigmoid and hswish are used in mobilenetv3 |
@@ -76,7 +77,7 @@ Some tricky operations encountered in these models, already solved, but might ha
 
 | Models | Device | BatchSize | Mode | Input Shape(HxW) | FPS |
 |-|-|:-:|:-:|:-:|:-:|
-| YOLOv3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
+| YOLOv3(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP16 | 608x608 | 39.2 |
 | YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 94 |
 | YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP16 | 608x608 | 38.5 |
 | YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP16 | 608x608 | 35.7 |
 
@@ -56,5 +56,5 @@ sudo ./yolov3-spp -d  ../samples // deserialize plan file and run inference, the
 
 ## More Information
 
-See the [readme](../README.md) in home page
+See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
 
@@ -13,22 +13,30 @@ find_package(CUDA REQUIRED)
 set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
-include_directories(/usr/local/cuda-9.0/targets/aarch64-linux/include)
-link_directories(/usr/local/cuda-9.0/targets/aarch64-linux/lib)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    message("embed_platform on")
+    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
+    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
+else()
+    message("embed_platform off")
+    include_directories(/usr/local/cuda/include)
+    link_directories(/usr/local/cuda/lib64)
+endif()
+
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
 
 #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
-cuda_add_library(yololayer ${PROJECT_SOURCE_DIR}/yololayer.cu)
+cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
 
 find_package(OpenCV)
 include_directories(OpenCV_INCLUDE_DIRS)
 
-add_executable(yolov3 ${PROJECT_SOURCE_DIR}/plugin_factory.cpp ${PROJECT_SOURCE_DIR}/yolov3.cpp)
-target_link_libraries(yolov3 nvinfer nvinfer_plugin)
+add_executable(yolov3 ${PROJECT_SOURCE_DIR}/yolov3.cpp)
+target_link_libraries(yolov3 nvinfer)
 target_link_libraries(yolov3 cudart)
 target_link_libraries(yolov3 yololayer)
-target_link_libraries(yolov3 ${OpenCV_LIBRARIES})
+target_link_libraries(yolov3 ${OpenCV_LIBS})
 
 add_definitions(-O2 -pthread)
 
@@ -1,41 +1,55 @@
-# yolo v3
+# yolov3
 
-Thanks to Ayoosh Kathuria, for his remarkable tutorials of yolov3. The github link is [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3).
+The PyTorch implementation is [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It provides two trained weight files for yolov3: `yolov3.weights` and `yolov3.pt`.
 
-I forked his github repo, and implement inference_on_one_pic and export weights for tensorrt. You can refer to 
+This branch uses the TensorRT 7 API. There is also a yolov3 implementation that uses the TensorRT 4 API; see [branch trt4/yolov3](https://github.com/wang-xinyu/tensorrtx/tree/trt4/yolov3), which is based on [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3).
 
-[pytorchx/pytorch-yolo-v3](https://github.com/wang-xinyu/pytorchx/tree/master/pytorch-yolo-v3)
-
-Following tricks are used in this yolov3,
-
-- I wrote a leaky relu plugin(leaky.cu  leaky.cuh  leakyplugin.cpp  leakyplugin.h) in the beginning, but I found there is PRelu in `NvInferPlugin.h`.
-- yolo layer is implemented as a plugin. I learn a lot from [lewes6369/TensorRT-Yolov3](https://github.com/lewes6369/TensorRT-Yolov3).
-- upsample layer is replaced by a deconvolution layer.
-- Batchnorm layer, implemented by scale layer.
-
-For FP16 mode, just need add one line `builder->setFp16Mode(true);`. On my TX1, it's 115ms in fp16, while 145ms in fp32.
+## Execute:
 
 ```
-// 1. generate yolov3.wts from [pytorchx/pytorch-yolo-v3](https://github.com/wang-xinyu/pytorchx/tree/master/pytorch-yolo-v3)
-
-// 2. put yolov3.wts into tensorrtx/yolov3
+1. generate yolov3.wts from pytorch implementation with yolov3.cfg and yolov3.weights
 
-// 3. build and run
+git clone https://github.com/wang-xinyu/tensorrtx.git
+git clone https://github.com/ultralytics/yolov3.git
+// download its weights 'yolov3.pt' or 'yolov3.weights'
+cd yolov3
+cp ../tensorrtx/yolov3/gen_wts.py .
+python gen_wts.py yolov3.weights
+// a file 'yolov3.wts' will be generated.
+// the master branch of yolov3 should work, if not, you can checkout cf7a4d31d37788023a9186a1a143a2dab0275ead
 
-cd tensorrtx/yolov3
+2. put yolov3.wts into tensorrtx/yolov3, build and run
 
+mv yolov3.wts ../tensorrtx/yolov3/
+cd ../tensorrtx/yolov3
 mkdir build
-
 cd build
-
 cmake ..
-
 make
+sudo ./yolov3 -s             // serialize model to plan file i.e. 'yolov3.engine'
+sudo ./yolov3 -d  ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
 
-sudo ./yolov3 -s   // serialize model to plan file i.e. 'yolov3.engine'
-sudo ./yolov3 -d   // deserialize plan file and run inference
-
-// 4. see if the output is same as pytorchx/pytorch-yolo-v3
+3. check the images generated, as follows. _zidane.jpg and _bus.jpg
 ```
 
+<p align="center">
+<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg">
+</p>
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/15235574/78247970-60b27c00-751e-11ea-88df-41473fed4823.jpg">
+</p>
+
+## Config
+
+- Input shape defined in yololayer.h
+- Number of classes defined in yololayer.h
+- FP16/FP32 can be selected by the macro in yolov3.cpp
+- GPU id can be selected by the macro in yolov3.cpp
+- NMS thresh in yolov3.cpp
+- BBox confidence thresh in yolov3.cpp
+
+## More Information
+
+See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
 
@@ -0,0 +1,94 @@
+#ifndef __TRT_UTILS_H_
+#define __TRT_UTILS_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+#include <cudnn.h>
+
+#ifndef CUDA_CHECK
+
+// Evaluates the CUDA runtime call `callstr` exactly once. If the result is not
+// cudaSuccess, the numeric error code and the call site (file:line) are written
+// to stderr and the process is terminated.
+//
+// std::abort() is used instead of assert(0) so that the failure is still fatal
+// when NDEBUG is defined; with assert(0) a release build would print the message
+// and then keep running. The brace-block form is kept so that existing call
+// sites compile unchanged. The macro is only defined when no other CUDA_CHECK
+// is already visible.
+#define CUDA_CHECK(callstr)                                                                    \
+    {                                                                                          \
+        const cudaError_t error_code = (callstr);                                              \
+        if (error_code != cudaSuccess) {                                                       \
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__  \
+                      << std::endl;                                                            \
+            std::abort();                                                                      \
+        }                                                                                      \
+    }
+
+#endif
+
+namespace Tn
+{
+    // Collects the per-layer timings that TensorRT reports through the
+    // IProfiler callback and prints them averaged over a number of runs.
+    class Profiler : public nvinfer1::IProfiler
+    {
+    public:
+        // Prints one line per recorded layer (name truncated/padded to 40
+        // characters) with its accumulated time divided by itrationsTimes,
+        // then the sum over all layers divided by itrationsTimes.
+        //
+        // itrationsTimes: number of profiled runs the accumulated times are
+        // averaged over. The division is floating point, so 0 does not trap
+        // but yields inf/nan in the printed values.
+        void printLayerTimes(int itrationsTimes) const
+        {
+            float totalTime = 0;
+            for (const Record& record : mProfile)
+            {
+                printf("%-40.40s %4.3fms\n", record.first.c_str(), record.second / itrationsTimes);
+                totalTime += record.second;
+            }
+            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
+        }
+    private:
+        // (layer name, accumulated milliseconds), kept in first-reported order.
+        typedef std::pair<std::string, float> Record;
+        std::vector<Record> mProfile;
+
+        // IProfiler callback. Adds ms to the record whose name equals
+        // layerName, or appends a new record when the name is seen for the
+        // first time.
+        void reportLayerTime(const char* layerName, float ms) override
+        {
+            auto record = std::find_if(mProfile.begin(), mProfile.end(),
+                                       [&](const Record& r){ return r.first == layerName; });
+            if (record == mProfile.end())
+                mProfile.push_back(std::make_pair(layerName, ms));
+            else
+                record->second += ms;
+        }
+    };
+
+    // TensorRT logger that forwards info/warning/error messages to stderr,
+    // each prefixed with a label for its severity.
+    class Logger : public nvinfer1::ILogger
+    {
+    public:
+
+        // Reports warnings and anything more severe.
+        Logger(): Logger(Severity::kWARNING) {}
+
+        // Reports messages whose severity value is not greater than `severity`.
+        Logger(Severity severity): reportableSeverity(severity) {}
+
+        // Writes "<LABEL>: <msg>" followed by a newline (flushed) to stderr.
+        void log(Severity severity, const char* msg) override
+        {
+            // Messages whose severity enum value is greater than the
+            // reportable one are dropped.
+            if (severity > reportableSeverity)
+            {
+                return;
+            }
+
+            const char* label = "UNKNOWN: ";
+            if (severity == Severity::kINTERNAL_ERROR)
+            {
+                label = "INTERNAL_ERROR: ";
+            }
+            else if (severity == Severity::kERROR)
+            {
+                label = "ERROR: ";
+            }
+            else if (severity == Severity::kWARNING)
+            {
+                label = "WARNING: ";
+            }
+            else if (severity == Severity::kINFO)
+            {
+                label = "INFO: ";
+            }
+
+            std::cerr << label << msg << std::endl;
+        }
+
+        // Highest severity enum value that is still printed.
+        Severity reportableSeverity{Severity::kWARNING};
+    };
+
+    // Copies the object representation of val (sizeof(T) bytes) to the
+    // location `buffer` points at, then advances `buffer` past those bytes.
+    //
+    // std::memcpy is used instead of storing through a reinterpret_cast'ed
+    // T*, because the byte cursor is not guaranteed to be suitably aligned
+    // for T once values of different sizes have been written back to back.
+    // T is expected to be trivially copyable; the caller must provide at
+    // least sizeof(T) writable bytes.
+    template<typename T>
+    void write(char*& buffer, const T& val)
+    {
+        std::memcpy(buffer, &val, sizeof(T));
+        buffer += sizeof(T);
+    }
+
+    // Copies sizeof(T) bytes from the location `buffer` points at into val,
+    // then advances `buffer` past those bytes.
+    //
+    // std::memcpy is used instead of loading through a reinterpret_cast'ed
+    // const T*, because the byte cursor is not guaranteed to be suitably
+    // aligned for T. T is expected to be trivially copyable; the caller must
+    // provide at least sizeof(T) readable bytes.
+    template<typename T>
+    void read(const char*& buffer, T& val)
+    {
+        std::memcpy(&val, buffer, sizeof(T));
+        buffer += sizeof(T);
+    }
+}
+
+#endif
Original file line number	Diff line number	Diff line change
`@@ -56,5 +56,5 @@ sudo ./yolov3-spp -d ../samples // deserialize plan file and run inference, the`
`56`	`56`
`57`	`57`	`## More Information`
`58`	`58`
`59`		`-See the [readme](../README.md) in home page`
	`59`	`+See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)`
`60`	`60`