<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
<meta charset="utf-8">
<title>Recognizing cat and dog face drawings — project report</title>
|
||
<style>
|
||
/*
|
||
Name: GeekPark article style for Mou app
|
||
Author: hzlzh(hzlzh.dev@gmail.com)
|
||
URL: https://github.com/GeekPark/Doc/blob/master/GeekPark/GeekPark-Style-for-Mou.css
|
||
*/
|
||
/* padding-top/padding-bottom were dead declarations: the later `padding:30px` shorthand overrides all four sides. A generic sans-serif fallback closes the font stack. */
body{ font-family:"Microsoft Yahei","Helvetica Neue","Luxi Sans","DejaVu Sans",Tahoma,"Hiragino Sans GB",STHeiti,sans-serif; font-size:14px; line-height:1.6; color:#666666; background-color:white; padding:30px; }
|
||
body > *:first-child{ margin-top:0 !important; }
|
||
body > *:last-child{ margin-bottom:0 !important; }
|
||
a{ color:#109EFF; text-decoration:none; }
|
||
a:hover{ border-bottom:1px dotted #109EFF; }
|
||
a:visited, a:active{ color:#109EFF; }
|
||
a.absent{ color:#cc0000; }
|
||
a.anchor{ display:block; padding-left:30px; margin-left:-30px; cursor:pointer; position:absolute; top:0; left:0; bottom:0; }
|
||
p a{ margin:0 2px; }
|
||
h1, h2, h3, h4, h5, h6{ color:#333333; margin:20px 0 10px; padding:0; font-weight:bold; -webkit-font-smoothing:antialiased; cursor:text; position:relative; }
|
||
h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, h5:hover a.anchor, h6:hover a.anchor{ background:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA09pVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMy1jMDExIDY2LjE0NTY2MSwgMjAxMi8wMi8wNi0xNDo1NjoyNyAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNiAoMTMuMCAyMDEyMDMwNS5tLjQxNSAyMDEyLzAzLzA1OjIxOjAwOjAwKSAgKE1hY2ludG9zaCkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6OUM2NjlDQjI4ODBGMTFFMTg1ODlEODNERDJBRjUwQTQiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6OUM2NjlDQjM4ODBGMTFFMTg1ODlEODNERDJBRjUwQTQiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo5QzY2OUNCMDg4MEYxMUUxODU4OUQ4M0REMkFGNTBBNCIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo5QzY2OUNCMTg4MEYxMUUxODU4OUQ4M0REMkFGNTBBNCIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PsQhXeAAAABfSURBVHjaYvz//z8DJYCRUgMYQAbAMBQIAvEqkBQWXI6sHqwHiwG70TTBxGaiWwjCTGgOUgJiF1J8wMRAIUA34B4Q76HUBelAfJYSA0CuMIEaRP8wGIkGMA54bgQIMACAmkXJi0hKJQAAAABJRU5ErkJggg==) no-repeat 10px center; text-decoration:none; }
|
||
h1 tt, h1 code{ font-size:inherit; }
|
||
h2 tt, h2 code{ font-size:inherit; }
|
||
h3 tt, h3 code{ font-size:inherit; }
|
||
h4 tt, h4 code{ font-size:inherit; }
|
||
h5 tt, h5 code{ font-size:inherit; }
|
||
h6 tt, h6 code{ font-size:inherit; }
|
||
h1{ font-size:18px; font-weight:bold; line-height:22px; margin-bottom:5px; color:#333; }
|
||
h2{ font-size:16px; font-weight:bolder; line-height:18px; margin:20px 0; }
|
||
h3{ font-size:14px; }
|
||
h4{ color:#666; font-size:14px; font-weight:bolder; line-height:18px; margin:20px 0; }
|
||
h5{ font-size:14px; }
|
||
h6{ color:#777777; font-size:14px; }
|
||
p, blockquote, ul, ol, dl, li, table, pre{ margin:15px 0; line-height:150%; }
|
||
hr{ background:transparent url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAYAAAAECAYAAACtBE5DAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyJpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMC1jMDYwIDYxLjEzNDc3NywgMjAxMC8wMi8xMi0xNzozMjowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNSBNYWNpbnRvc2giIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6OENDRjNBN0E2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6OENDRjNBN0I2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo4Q0NGM0E3ODY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo4Q0NGM0E3OTY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PqqezsUAAAAfSURBVHjaYmRABcYwBiM2QSA4y4hNEKYDQxAEAAIMAHNGAzhkPOlYAAAAAElFTkSuQmCC) repeat-x 0 0; border:0 none; color:#cccccc; height:4px; padding:0; }
|
||
body > h2:first-child{ margin-top:0; padding-top:0; }
|
||
body > h1:first-child{ margin-top:0; padding-top:0; }
|
||
body > h1:first-child + h2{ margin-top:0; padding-top:0; }
|
||
body > h3:first-child, body > h4:first-child, body > h5:first-child, body > h6:first-child{ margin-top:0; padding-top:0; }
|
||
a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6{ margin-top:0; padding-top:0; }
|
||
h1 p, h2 p, h3 p, h4 p, h5 p, h6 p{ margin-top:0; }
|
||
li p.first{ display:inline-block; }
|
||
li{ font-size:14px; line-height:150%; margin-bottom:5px; margin-top:0; }
|
||
ul, ol{ padding-left:30px; }
|
||
ul :first-child, ol :first-child{ margin-top:0; }
|
||
dl{ padding:0; }
|
||
dl dt{ font-size:14px; font-weight:bold; font-style:italic; padding:0; margin:15px 0 5px; }
|
||
dl dt:first-child{ padding:0; }
|
||
dl dt > :first-child{ margin-top:0; }
|
||
dl dt > :last-child{ margin-bottom:0; }
|
||
dl dd{ margin:0 0 15px; padding:0 15px; }
|
||
dl dd > :first-child{ margin-top:0; }
|
||
dl dd > :last-child{ margin-bottom:0; }
|
||
blockquote{ background:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABoAAAAZCAIAAACgvKk3AAAACXBIWXMAAAsTAAALEwEAmpwYAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnG
ueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAHiSURBVHja7FXLspswDJUdk8HD8ApMmPz/t5EFCUl4hGAM1l2odV3gdtVdq4UZe6QjHelgM0QEAGMM5xx+Wtd1wzAURQEAiMgYoxX2bBiGvu/TNPU8T9CRxZqmqaoqpRTn/PF4nE4nQtnFMsZUVfV+vxljxpiiKITNT0VVVcUY45wjYtM0URQJIVal0VZrXZalMcatkSEiMW3b9n6/Ey8bKaW8XC4rICJRlqXrSRQ5fT6fT13XFGBXKWWWZW4AYS3Lcr1et1jn81lQC263mzHGMuKc53
kehuG2X4hY17XlSCdxHOd5DgAcAJqmmefZ7U6WZbtYAKCU6vvePYmiiLB+wLVta/MAQBAEURTZ7cqsMzkIIagh5MyVUrZyGmiapm6nVkzHcXRbGccxDYCcuVLKVoGInucdj0f4xuZ51lq70w+CwOXBp2lyA6SUbpvdukgfVtWMMSGEEMLlwZdlccM8z3P/tpVEKLcV05bHb3CIuMVyB7IazuFwWDkLkgj5McaUUs/n01aUJIk7ENsHSqy1fr1eNkcYhmKeZzd513UERE5JkrjJtdaWASIqpcZxtH+e7/t826DvFGd9XCWs9MThr9p/uH8LbvcZ/MPb+EuVwzCsnhtXq1LK1VW8e9+Q+b7/NQBJdFUUrYPCrgAAAABJRU5ErkJggg==") no-repeat scroll left 11px transparent; color:#999999; margin-left:28px; min-height:30px; padding:17px 40px 0; }
|
||
blockquote p{ color:#999999 !important; margin-bottom:25px !important; }
|
||
blockquote > :first-child{ margin-top:0; }
|
||
blockquote > :last-child{ margin-bottom:0; }
|
||
table{ padding:0; border-collapse:collapse; }
|
||
table tr{ border-top:1px solid #cccccc; background-color:white; margin:0; padding:0; }
|
||
table tr:nth-child(2n){ background-color:#f8f8f8; }
|
||
table tr th{ font-weight:bold; border:1px solid #cccccc; text-align:left; margin:0; padding:6px 13px; }
|
||
table tr td{ border:1px solid #cccccc; text-align:left; margin:0; padding:6px 13px; }
|
||
table tr th :first-child, table tr td :first-child{ margin-top:0; }
|
||
table tr th :last-child, table tr td :last-child{ margin-bottom:0; }
|
||
img{ border:1px solid #E1E1E1; margin:0 auto 10px; display:block; max-width:100%; padding:5px; }
|
||
figure { margin: 0; }
|
||
span.frame{ display:block; overflow:hidden; }
|
||
span.frame > span{ border:1px solid #dddddd; display:block; float:left; overflow:hidden; margin:13px 0 0; padding:7px; width:auto; }
|
||
span.frame span img{ display:block; float:left; }
|
||
span.frame span span{ clear:both; color:#333333; display:block; padding:5px 0 0; }
|
||
span.align-center{ display:block; overflow:hidden; clear:both; }
|
||
span.align-center > span{ display:block; overflow:hidden; margin:13px auto 0; text-align:center; }
|
||
span.align-center span img{ margin:0 auto; text-align:center; }
|
||
span.align-right{ display:block; overflow:hidden; clear:both; }
|
||
span.align-right > span{ display:block; overflow:hidden; margin:13px 0 0; text-align:right; }
|
||
span.align-right span img{ margin:0; text-align:right; }
|
||
span.float-left{ display:block; margin-right:13px; overflow:hidden; float:left; }
|
||
span.float-left span{ margin:13px 0 0; }
|
||
span.float-right{ display:block; margin-left:13px; overflow:hidden; float:right; }
|
||
span.float-right > span{ display:block; overflow:hidden; margin:13px auto 0; text-align:right; }
|
||
code, tt{ margin:0 2px; padding:0 5px; white-space:nowrap; border:1px solid #eaeaea; background-color:#f8f8f8; border-radius:3px; }
|
||
pre code{ margin:0; padding:0; white-space:pre; border:none; background:transparent; }
|
||
.highlight pre{ background-color:#f8f8f8; border:1px solid #cccccc; font-size:13px; line-height:19px; overflow:auto; padding:6px 10px; border-radius:3px; }
|
||
pre{ background-color:#f8f8f8; border:1px solid #cccccc; font-size:13px; line-height:19px; overflow:auto; padding:6px 10px; border-radius:3px; }
|
||
pre code, pre tt{ background-color:transparent; border:none; }
|
||
@media screen and (min-width: 914px){
|
||
body{ width:854px; margin:0 auto; }
|
||
}
|
||
</style>
|
||
</head>
|
||
<body>
|
||
<h1 id="introduction">1. Introduction</h1>
|
||
|
||
<h2 id="goal">1.1. Goal</h2>
|
||
|
||
<p>The goal of this project was to give a computer a drawing of either a cat’s or a dog’s face and let it predict, with high probability, whether it is a cat or a dog shown in that drawing.</p>
|
||
|
||
<h2 id="scope">1.2. Scope</h2>
|
||
|
||
<p>First I was thinking that I would get lots of people to draw cat and dog faces for me, which I later found out was far too time-consuming. Therefore I had to change the scope from recognizing random people’s drawings to recognizing my own drawings, which is obviously easier. Nevertheless, everything else didn’t change that much; I just would get better results.</p>
|
||
|
||
<h1 id="preparation">2. Preparation</h1>
|
||
|
||
<h2 id="drawingandtakingaphoto">2.1. Drawing and taking a photo</h2>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/drawing-taking-photo.jpg" alt="The raw drawings" />
|
||
<figcaption>The raw drawings</figcaption></figure>
|
||
|
||
<p>I drew eight A4 sheets of such cat and dog faces which resulted in 64 drawings of dog faces and 60 drawings of cat faces. Then I took pictures of them with my digital camera.</p>
|
||
|
||
<p>There was a huge difference in quality between the pictures I took with my iPhone 4 camera and the ones I took with my Nikon D5000. In fact I wasn’t able to use the pictures I took with the iPhone because it was impossible to find straight lines in them.</p>
|
||
|
||
<p>You can see here the result, one with the iPhone image as a source and the other with the Nikon one:</p>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/iphone-sample.jpg" alt="iPhone vs. Nikon sample" />
|
||
<figcaption>iPhone vs. Nikon sample</figcaption></figure>
|
||
|
||
<h2 id="photoshop">2.2. Photoshop</h2>
|
||
|
||
<p>I had to clean up the drawings so it would be easier for the algorithm later on to find everything. What I did was I opened the pictures of the drawings in Photoshop and played with the contrast and brightness.</p>
|
||
|
||
<p>Then I cut out one drawing after another from the big image and saved it as a black and white PNG image without dither.</p>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/photoshop.jpg" alt="Steps in Photoshop" />
|
||
<figcaption>Steps in Photoshop</figcaption></figure>
|
||
|
||
<h2 id="resizing">2.3. Resizing</h2>
|
||
|
||
<p>I wrote a small shell script which would take all pictures, resize them proportionally to a maximum width and height of 200 px, and fill up the missing borders with a white background color. To do that I used the <a href="https://www.imagemagick.org">ImageMagick</a> software suite:</p>
|
||
|
||
<pre>#!/bin/sh
|
||
|
||
NEW="new_$1"
|
||
rm -rf $NEW
|
||
mkdir $NEW
|
||
|
||
for i in `ls -1 $1`
|
||
do
|
||
convert $1/$i \
|
||
-adaptive-resize 200x200\> \
|
||
-size 200x200 xc:white +swap \
|
||
-gravity center \
|
||
-composite \
|
||
$NEW/$i
|
||
done</pre>
|
||
|
||
<p>After that all the images had uniform size and colors, so now I was able to compare them in a meaningful way.</p>
|
||
|
||
<h1 id="featureextraction">3. Feature extraction</h1>
|
||
|
||
<p>The next step was to extract the features from the images which means to find something in the pictures which I was able to count and it would be unique enough to find a difference between cats and dogs but broad enough so all dogs would fall into one category and all cats to another.</p>
|
||
|
||
<h2 id="straightlines">3.1. Straight lines</h2>
|
||
|
||
<p>The first thing which came to mind was counting and doing other stuff with straight lines in the image.</p>
|
||
|
||
<h3 id="cannyedgedetector">3.1.1 Canny edge detector</h3>
|
||
|
||
<p>I used the edge detector algorithm called Canny to preprocess the images, which, like its name says, finds edges in images. Because of my preparation with Photoshop it was quite easy for it to find them. It is not easy to see that step with my drawings, so here is a picture of how it looks if you do it with a photo instead:</p>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/canny.jpg" alt="Canny on a photo from Wikipedia" />
|
||
<figcaption>Canny on a photo from Wikipedia</figcaption></figure>
|
||
|
||
<p>What it basically does is noise reduction with a Gaussian filter; it then finds the intensity gradients of the image with the help of some trigonometry.</p>
|
||
|
||
<p>I didn’t implement the algorithm myself, instead I used the <a href="http://docs.opencv.org/doc/tutorials/imgproc/imgtrans/canny_detector/canny_detector.html">OpenCV implementation</a>.</p>
|
||
|
||
<h3 id="houghtransform">3.1.2 Hough transform</h3>
|
||
|
||
<p>To find the lines I used the <a href="https://en.wikipedia.org/wiki/Hough_transform">Hough transform</a> algorithm. The red lines are those which the Hough transform algorithm found in the example picture:</p>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/hough.png" alt="Hough lines" />
|
||
<figcaption>Hough lines</figcaption></figure>
|
||
|
||
<p>What it basically does is grouping edges, which can be imperfect, to object candidates by performing an explicit voting procedure. Detecting straight lines can be done by describing them as <code>y = mx + b</code> where <code>m</code> is the slope of the line and <code>b</code> is the intercept. The line is not represented by discrete points <code>(x1,y1)(x2,y2)</code> but instead as a <code>point(x,y)</code> in the parameter space, which makes detection of lines which are a bit off possible. In practice it is still more complicated, please read the <a href="https://en.wikipedia.org/wiki/Hough_transform">Wikipedia article</a> about it.</p>
|
||
|
||
<p>Because of lack of time I didn’t implement it myself but used the probabilistic <a href="http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghlinesp#houghlinesp">OpenCV implementation</a>.</p>
|
||
|
||
<h2 id="linesfeatures">3.2. Lines features</h2>
|
||
|
||
<p>I extracted these features from the lines:</p>
|
||
|
||
<ul>
|
||
<li>amount of lines</li>
|
||
<li>average length of lines</li>
|
||
<li>average angle of lines</li>
|
||
</ul>
|
||
|
||
<h2 id="otherfeatures">3.3. Other features</h2>
|
||
|
||
<p>I also extracted the amount of black pixels in the image to use it as a possible feature which wasn’t using the extracted lines.</p>
|
||
|
||
<h1 id="k-nearestneighboralgorithm">4. <em>k</em>-nearest neighbor algorithm</h1>
|
||
|
||
<p>I chose to use the <em>k</em>-Nearest Neighbors algorithm which only locally looks at the neighbors of the document in a radius predefined by the user. It assumes that the document is of the same category as the highest number of neighbors within this radius.</p>
|
||
|
||
<p>In the following figure you can see that if the user chooses k = 3, as shown by the solid line, the algorithm will conclude that the document in the center (green smiley) is of the type triangle, because most of these three neighbors are triangles. If on the other hand the user chooses k = 7, as shown by the dotted line, then the number of neighbors which are rectangles is greater than the number of neighbors which are triangles, so it concludes that the smiley is of type rectangle.</p>
|
||
|
||
<figure>
|
||
<img src="https://jeena.net/images/2013/catdog/k-nearest-neighbours.png" alt="k-Nearest Neighbours as a graphic" />
|
||
<figcaption>k-Nearest Neighbours as a graphic</figcaption></figure>
|
||
|
||
<p>In the picture above you see how it would look with two dimensions. I have been using four features so the algorithm had to check the distance to the neighbours in four dimensions. This isn’t really more difficult, it is just more to calculate.</p>
|
||
|
||
<h1 id="results">5. Results</h1>
|
||
|
||
<p>The results were quite encouraging, I assume it is because I only used one style to draw the dogs and one style to draw the cats.</p>
|
||
|
||
<h2 id="k-foldcross-validation">5.1. k-fold Cross-validation</h2>
|
||
|
||
<p>I used 10 fold cross-validation for every test I did, which means that I used 90% of the available data for the learning algorithms and then the remaining 10% to test how they performed. I repeated this ten times until all data has been used for testing once.</p>
|
||
|
||
<h2 id="resultswithallfeatures">5.2. Results with all features</h2>
|
||
|
||
<p>When I used all of the features and three nearest neighbours I got an amazing 100% accuracy, which was somewhat suspicious, because that normally means you most probably did something wrong.</p>
|
||
|
||
<h2 id="resultswithareducedfeatureset">5.3. Results with a reduced feature set</h2>
|
||
|
||
<p>Therefore I tried to reduce the features to check whether it would perform worse.</p>
|
||
|
||
<ol>
|
||
<li>When I removed the information about the amount of black pixels basically nothing happened.</li>
|
||
<li>When I removed the information about the amount of lines and their average length, I at least got a couple of wrongly categorized images; the accuracy went down to 95%.</li>
|
||
<li>When I removed the information about the average angle of the lines, that was when I got significant errors. The accuracy dropped down to about 60%, which is still better than pure chance.</li>
|
||
</ol>
|
||
|
||
<p>So it seems like the best feature to detect cat and dog face drawings done by me was the average angle of the straight lines in the image.</p>
|
||
|
||
<h1 id="futurestudy">6. Future study</h1>
|
||
|
||
<p>The most important next step would be to gather many more drawings done by other people who use other styles to draw cat and dog faces.</p>
|
||
|
||
<p>Then it would be interesting to use other learning algorithms like Bayes, Perceptron, etc.</p>
|
||
|
||
<p>And then it would be interesting to use this approach on photos of real cats and dogs.</p>
|
||
|
||
<h1 id="code">7. Code</h1>
|
||
|
||
<pre><code>#!/usr/bin/env python
|
||
|
||
import cv2, cv, sys, math, os, numpy
|
||
from scipy.spatial import KDTree
|
||
|
||
def extractFeatures(label):
|
||
|
||
directory = "img/" + label + "/"
|
||
|
||
features = []
|
||
|
||
for fn in os.listdir(directory):
|
||
|
||
img = cv2.imread(directory + fn, 0)
|
||
|
||
# find edges
|
||
canny = cv2.Canny(img, 50, 100)
|
||
|
||
# find colored
|
||
black_pixels = numpy.count_nonzero(img)
|
||
|
||
# find lines lines
|
||
lines = cv2.HoughLinesP(canny, 1, math.pi/360, 5, None, 10, 1)
|
||
|
||
lengths = []
|
||
angles = []
|
||
try:
|
||
for line in lines[0]:
|
||
x1, y1, x2, y2 = line
|
||
|
||
# Pythagoras
|
||
a2 = math.pow((x1-x2), 2)
|
||
b2 = math.pow((y1-y2), 2)
|
||
length = int(math.sqrt(a2 + b2))
|
||
lengths.append(length)
|
||
|
||
angle = int(math.degrees(math.atan((y1-y2) / (x1-x2))))
|
||
angles.append(angle)
|
||
except:
|
||
pass
|
||
|
||
# print out everything
|
||
lines_count = len(lengths)
|
||
mid_length = sum(lengths) / lines_count
|
||
mid_angle = sum(angles) / lines_count
|
||
|
||
features.append([
|
||
[lines_count, mid_length, mid_angle, black_pixels],
|
||
label
|
||
])
|
||
|
||
return features
|
||
|
||
|
||
if __name__ == "__main__":
|
||
cats = extractFeatures("cat")
|
||
dogs = extractFeatures("dog")
|
||
|
||
test_count = 5
|
||
|
||
test_data = dogs[:test_count] + cats[:test_count]
|
||
test_labels = map(lambda a: a[1], test_data)
|
||
test_features = map(lambda a: a[0], test_data)
|
||
|
||
data = cats[test_count:] + dogs[test_count:]
|
||
labels = map(lambda a: a[1], data)
|
||
features = map(lambda a: a[0], data)
|
||
|
||
tree = KDTree(features)
|
||
|
||
for t in xrange(0, test_count * 2):
|
||
d, i = tree.query(test_features[t], k=3)
|
||
print "-"
|
||
for j in xrange(0, len(i)):
|
||
print test_labels[t] + " is a " + labels[i[j]]
|
||
</code></pre>
|
||
</body>
|
||
</html> |